Merge pull request #695 from epage/ignore

feat(config): Custom ignores
Ed Page, 2023-03-22 15:53:24 -05:00, committed by GitHub
commit 5253e5589b
10 changed files with 154 additions and 6 deletions


@@ -156,6 +156,7 @@ impl FileArgs {
                 locale: self.locale,
                 ..Default::default()
             }),
+            extend_ignore_re: Default::default(),
         }
     }


@@ -268,7 +268,7 @@ impl GlobEngineConfig {
     }
 }
 
-#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
 //#[serde(deny_unknown_fields)] // Doesn't work with `flatten`
 #[serde(default)]
 #[serde(rename_all = "kebab-case")]
@@ -283,6 +283,8 @@ pub struct EngineConfig {
     pub tokenizer: Option<TokenizerConfig>,
     #[serde(flatten)]
     pub dict: Option<DictConfig>,
+    #[serde(with = "serde_regex")]
+    pub extend_ignore_re: Vec<regex::Regex>,
 }
 
 impl EngineConfig {
@@ -298,6 +300,7 @@ impl EngineConfig {
                 .unwrap_or_else(TokenizerConfig::from_defaults),
             ),
             dict: Some(empty.dict.unwrap_or_else(DictConfig::from_defaults)),
+            extend_ignore_re: Default::default(),
         }
     }
@@ -327,6 +330,8 @@ impl EngineConfig {
             let mut dict = Some(dict);
             std::mem::swap(&mut dict, &mut self.dict);
         }
+        self.extend_ignore_re
+            .extend(source.extend_ignore_re.iter().cloned());
     }
 
     pub fn binary(&self) -> bool {
@@ -340,8 +345,29 @@ impl EngineConfig {
     pub fn check_file(&self) -> bool {
         self.check_file.unwrap_or(true)
     }
+
+    pub fn extend_ignore_re(&self) -> Box<dyn Iterator<Item = &regex::Regex> + '_> {
+        Box::new(self.extend_ignore_re.iter())
+    }
 }
+
+impl PartialEq for EngineConfig {
+    fn eq(&self, rhs: &Self) -> bool {
+        self.binary == rhs.binary
+            && self.check_filename == rhs.check_filename
+            && self.check_file == rhs.check_file
+            && self.tokenizer == rhs.tokenizer
+            && self.dict == rhs.dict
+            && self
+                .extend_ignore_re
+                .iter()
+                .map(|r| r.as_str())
+                .eq(rhs.extend_ignore_re.iter().map(|r| r.as_str()))
+    }
+}
+
+impl Eq for EngineConfig {}
 
 #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(deny_unknown_fields)]
 #[serde(default)]
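
Aside: the `derive` change at the top of this hunk is forced by the new field. `regex::Regex` does not implement `PartialEq`, so `EngineConfig` drops the derived impls and gains the manual ones above, which compare regexes by their source pattern via `as_str()`. A standalone sketch of that comparison (the helper name is illustrative, not part of the crate):

```rust
// Minimal sketch: regex lists compare equal when their source patterns
// match textually, since `regex::Regex` itself has no `PartialEq`.
fn regex_lists_eq(lhs: &[regex::Regex], rhs: &[regex::Regex]) -> bool {
    lhs.iter()
        .map(|r| r.as_str())
        .eq(rhs.iter().map(|r| r.as_str()))
}

fn main() {
    let a = vec![regex::Regex::new("`.*`").unwrap()];
    let b = vec![regex::Regex::new("`.*`").unwrap()];
    assert!(regex_lists_eq(&a, &b));
}
```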


@@ -48,7 +48,14 @@ impl FileChecker for Typos {
             reporter.report(msg.into())?;
         } else {
             let mut accum_line_num = AccumulateLineNum::new();
+            let mut ignores: Option<Ignores> = None;
             for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) {
+                if ignores
+                    .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                    .is_ignored(typo.span())
+                {
+                    continue;
+                }
                 let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
                 let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
                 let msg = report::Typo {
@@ -86,7 +93,14 @@ impl FileChecker for FixTypos {
         } else {
             let mut fixes = Vec::new();
             let mut accum_line_num = AccumulateLineNum::new();
+            let mut ignores: Option<Ignores> = None;
             for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) {
+                if ignores
+                    .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                    .is_ignored(typo.span())
+                {
+                    continue;
+                }
                 if is_fixable(&typo) {
                     fixes.push(typo.into_owned());
                 } else {
@@ -163,7 +177,14 @@ impl FileChecker for DiffTypos {
         } else {
             let mut fixes = Vec::new();
             let mut accum_line_num = AccumulateLineNum::new();
+            let mut ignores: Option<Ignores> = None;
             for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) {
+                if ignores
+                    .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                    .is_ignored(typo.span())
+                {
+                    continue;
+                }
                 if is_fixable(&typo) {
                     fixes.push(typo.into_owned());
                 } else {
@@ -276,7 +297,14 @@ impl FileChecker for Identifiers {
             let msg = report::BinaryFile { path };
             reporter.report(msg.into())?;
         } else {
+            let mut ignores: Option<Ignores> = None;
             for word in policy.tokenizer.parse_bytes(&buffer) {
+                if ignores
+                    .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                    .is_ignored(word.span())
+                {
+                    continue;
+                }
                 // HACK: Don't look up the line_num per entry to better match the performance
                 // of Typos for comparison purposes. We don't really get much out of it
                 // anyway.
@@ -329,11 +357,18 @@ impl FileChecker for Words {
             let msg = report::BinaryFile { path };
             reporter.report(msg.into())?;
         } else {
+            let mut ignores: Option<Ignores> = None;
             for word in policy
                 .tokenizer
                 .parse_bytes(&buffer)
                 .flat_map(|i| i.split())
             {
+                if ignores
+                    .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                    .is_ignored(word.span())
+                {
+                    continue;
+                }
                 // HACK: Don't look up the line_num per entry to better match the performance
                 // of Typos for comparison purposes. We don't really get much out of it
                 // anyway.
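
All five checkers use the same trick: the `Ignores` index (defined in the next hunk) is created lazily with `Option::get_or_insert_with`, so the per-file regex scan only runs once a typo or word is actually found, and clean files pay nothing. A standalone sketch of that pattern (names are illustrative):

```rust
// Sketch of lazy initialization via `Option::get_or_insert_with`:
// the expensive value is built at most once, and only when needed.
fn build_index() -> Vec<usize> {
    println!("scanning buffer once...");
    vec![6, 7, 8]
}

fn main() {
    let mut index: Option<Vec<usize>> = None;
    for offset in [7, 8] {
        // The first hit builds the index; later hits reuse it.
        let idx = index.get_or_insert_with(build_index);
        println!("offset {offset} known: {}", idx.contains(&offset));
    }
}
```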
@@ -644,6 +679,33 @@ fn walk_entry(
     Ok(())
 }
 
+#[derive(Clone, Debug)]
+struct Ignores {
+    blocks: Vec<std::ops::Range<usize>>,
+}
+
+impl Ignores {
+    fn new(content: &[u8], ignores: &[regex::Regex]) -> Self {
+        let mut blocks = Vec::new();
+        if let Ok(content) = std::str::from_utf8(content) {
+            for ignore in ignores {
+                for mat in ignore.find_iter(content) {
+                    blocks.push(mat.range());
+                }
+            }
+        }
+        Self { blocks }
+    }
+
+    fn is_ignored(&self, span: std::ops::Range<usize>) -> bool {
+        let start = span.start;
+        let end = span.end.saturating_sub(1);
+        self.blocks
+            .iter()
+            .any(|block| block.contains(&start) || block.contains(&end))
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
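
`Ignores` scans the file once per configured pattern (non-UTF-8 content is skipped entirely) and records every match range; a span is then suppressed when either of its endpoints falls inside a recorded block, so typos that merely overlap an ignored region are skipped too. A standalone sketch of the same logic, using the fixture's backtick-span pattern:

```rust
// Standalone sketch of the `Ignores` logic: collect match ranges, then
// suppress any span that starts or ends inside one of them.
fn main() {
    let content = "hello `hello`";
    let ignore = regex::Regex::new("`.*`").unwrap();
    let blocks: Vec<std::ops::Range<usize>> =
        ignore.find_iter(content).map(|m| m.range()).collect();

    let is_ignored = |span: std::ops::Range<usize>| {
        let (start, end) = (span.start, span.end.saturating_sub(1));
        blocks.iter().any(|b| b.contains(&start) || b.contains(&end))
    };

    assert!(!is_ignored(0..5)); // the bare `hello` is still checked
    assert!(is_ignored(7..12)); // the backtick-quoted one is skipped
}
```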


@@ -42,6 +42,7 @@ pub struct ConfigEngine<'s> {
     walk: Intern<crate::config::Walk>,
     tokenizer: Intern<typos::tokens::Tokenizer>,
     dict: Intern<crate::dict::Override<'s, 's, crate::dict::BuiltIn>>,
+    ignore: Intern<Vec<regex::Regex>>,
 }
 
 impl<'s> ConfigEngine<'s> {
@@ -54,6 +55,7 @@ impl<'s> ConfigEngine<'s> {
             walk: Default::default(),
             tokenizer: Default::default(),
             dict: Default::default(),
+            ignore: Default::default(),
         }
     }
@@ -88,7 +90,7 @@ impl<'s> ConfigEngine<'s> {
         dir.type_matcher.definitions()
     }
 
-    pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_> {
+    pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_, '_> {
         debug_assert!(path.is_absolute(), "{} is not absolute", path.display());
         let dir = self.get_dir(path).expect("`walk()` should be called first");
         let (file_type, file_config) = dir.get_file_config(path);
@@ -99,6 +101,7 @@ impl<'s> ConfigEngine<'s> {
             binary: file_config.binary,
             tokenizer: self.get_tokenizer(&file_config),
             dict: self.get_dict(&file_config),
+            ignore: self.get_ignore(&file_config),
         }
     }
@@ -114,6 +117,10 @@ impl<'s> ConfigEngine<'s> {
         self.dict.get(file.dict)
     }
 
+    fn get_ignore(&self, file: &FileConfig) -> &[regex::Regex] {
+        self.ignore.get(file.ignore)
+    }
+
     fn get_dir(&self, path: &std::path::Path) -> Option<&DirConfig> {
         for path in path.ancestors() {
             if let Some(dir) = self.configs.get(path) {
@@ -220,7 +227,10 @@ impl<'s> ConfigEngine<'s> {
         let check_filename = engine.check_filename();
         let check_file = engine.check_file();
         let crate::config::EngineConfig {
-            tokenizer, dict, ..
+            tokenizer,
+            dict,
+            extend_ignore_re,
+            ..
         } = engine;
         let tokenizer_config =
             tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults);
@@ -254,12 +264,15 @@ impl<'s> ConfigEngine<'s> {
         let dict = self.dict.intern(dict);
         let tokenizer = self.tokenizer.intern(tokenizer);
+        let ignore = self.ignore.intern(extend_ignore_re);
 
         FileConfig {
             check_filenames: check_filename,
             check_files: check_file,
             binary,
             tokenizer,
             dict,
+            ignore,
         }
     }
 }
@@ -328,20 +341,22 @@ struct FileConfig {
     check_filenames: bool,
     check_files: bool,
     binary: bool,
+    ignore: usize,
 }
 
 #[non_exhaustive]
 #[derive(derive_setters::Setters)]
-pub struct Policy<'t, 'd> {
+pub struct Policy<'t, 'd, 'i> {
     pub check_filenames: bool,
     pub check_files: bool,
     pub file_type: Option<&'d str>,
     pub binary: bool,
     pub tokenizer: &'t typos::tokens::Tokenizer,
     pub dict: &'d dyn typos::Dictionary,
+    pub ignore: &'i [regex::Regex],
 }
 
-impl<'t, 'd> Policy<'t, 'd> {
+impl<'t, 'd, 'i> Policy<'t, 'd, 'i> {
     pub fn new() -> Self {
         Default::default()
     }
@@ -350,8 +365,9 @@ impl<'t, 'd> Policy<'t, 'd> {
 static DEFAULT_TOKENIZER: once_cell::sync::Lazy<typos::tokens::Tokenizer> =
     once_cell::sync::Lazy::new(typos::tokens::Tokenizer::new);
 static DEFAULT_DICT: crate::dict::BuiltIn = crate::dict::BuiltIn::new(crate::config::Locale::En);
+static DEFAULT_IGNORE: &[regex::Regex] = &[];
 
-impl<'t, 'd> Default for Policy<'t, 'd> {
+impl<'t, 'd, 'i> Default for Policy<'t, 'd, 'i> {
     fn default() -> Self {
         Self {
@@ -360,6 +376,7 @@ impl<'t, 'd> Default for Policy<'t, 'd> {
             binary: false,
             tokenizer: &DEFAULT_TOKENIZER,
             dict: &DEFAULT_DICT,
+            ignore: DEFAULT_IGNORE,
         }
     }
 }


@@ -0,0 +1,8 @@
+[files]
+extend-exclude = ["_typos.toml"]
+
+[default]
+extend-ignore-re = ["`.*`"]
+
+[default.extend-identifiers]
+hello = "goodbye"


@@ -0,0 +1 @@
+hello `hello`


@@ -0,0 +1,12 @@
+bin.name = "typos"
+stdin = ""
+stdout = """
+error: `hello` should be `goodbye`
+  --> ./file.ignore:1:1
+  |
+1 | hello `hello`
+  | ^^^^^
+  |
+"""
+stderr = ""
+status.code = 2


@@ -86,6 +86,12 @@ impl<'m> Typo<'m> {
             corrections: self.corrections.borrow(),
         }
     }
+
+    pub fn span(&self) -> std::ops::Range<usize> {
+        let start = self.byte_offset;
+        let end = start + self.typo.len();
+        start..end
+    }
 }
 
 impl<'m> Default for Typo<'m> {
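
The new `span()` accessor (mirrored on `Identifier` and `Word` in the next file) is just the half-open byte range `byte_offset..byte_offset + len`, which is exactly what `Ignores::is_ignored` consumes. For example:

```rust
// Sketch: a span is the half-open byte range of a token in its buffer.
fn main() {
    let buffer = "hello `hello`";
    let typo = "hello";
    let byte_offset = 7; // the second, backtick-quoted occurrence
    let span = byte_offset..byte_offset + typo.len();
    assert_eq!(span, 7..12);
    assert_eq!(&buffer[span], "hello");
}
```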


@@ -634,6 +634,13 @@ impl<'t> Identifier<'t> {
         self.offset
     }
 
+    #[inline]
+    pub fn span(&self) -> std::ops::Range<usize> {
+        let start = self.offset;
+        let end = start + self.token.len();
+        start..end
+    }
+
     /// Split into individual Words.
     #[inline]
     pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
@@ -702,6 +709,13 @@ impl<'t> Word<'t> {
     pub fn offset(&self) -> usize {
         self.offset
     }
+
+    #[inline]
+    pub fn span(&self) -> std::ops::Range<usize> {
+        let start = self.offset;
+        let end = start + self.token.len();
+        start..end
+    }
 }
 
 struct SplitIdent<'s> {


@@ -27,6 +27,7 @@ Configuration is read from the following (in precedence order)
 | default.check-file | \- | bool | Verifying spelling in files. |
 | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
 | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
+| default.extend-ignore-re | \- | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Custom uncorrectable sections (e.g. markdown code fences, PGP signatures, etc) |
 | default.extend-identifiers | \- | table of strings | Corrections for [identifiers](./design.md#identifiers-and-words). When the correction is blank, the identifier is never valid. When the correction is the key, the identifier is always valid. |
 | default.extend-ignore-identifiers-re | \- | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Pattern-match always-valid identifiers |
 | default.extend-words | \- | table of strings | Corrections for [words](./design.md#identifiers-and-words). When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
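
For illustration, a hedged sketch of how an `extend-ignore-re` entry is loaded: the diff routes the field through `serde_regex`, so every string in the TOML list is compiled to a `regex::Regex` up front, and an invalid pattern fails at config-parse time. The mini struct below is an assumption for the example (the real field lives on `EngineConfig`), with the `toml` crate standing in for the config loader:

```rust
// Hypothetical mini-config mirroring the `extend_ignore_re` field added
// to `EngineConfig`; `serde_regex` compiles each pattern on deserialize.
#[derive(Debug, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
struct MiniConfig {
    #[serde(with = "serde_regex", default)]
    extend_ignore_re: Vec<regex::Regex>,
}

fn main() {
    let cfg: MiniConfig = toml::from_str(r#"extend-ignore-re = ["`.*`"]"#).unwrap();
    assert_eq!(cfg.extend_ignore_re[0].as_str(), "`.*`");
    assert!(cfg.extend_ignore_re[0].is_match("skip `this` span"));
}
```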