diff --git a/crates/typos-cli/src/bin/typos-cli/args.rs b/crates/typos-cli/src/bin/typos-cli/args.rs index 8261486..124bf91 100644 --- a/crates/typos-cli/src/bin/typos-cli/args.rs +++ b/crates/typos-cli/src/bin/typos-cli/args.rs @@ -156,6 +156,7 @@ impl FileArgs { locale: self.locale, ..Default::default() }), + extend_ignore_re: Default::default(), } } diff --git a/crates/typos-cli/src/config.rs b/crates/typos-cli/src/config.rs index 4e366b0..a2a7d16 100644 --- a/crates/typos-cli/src/config.rs +++ b/crates/typos-cli/src/config.rs @@ -268,7 +268,7 @@ impl GlobEngineConfig { } } -#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)] //#[serde(deny_unknown_fields)] // Doesn't work with `flatten` #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -283,6 +283,8 @@ pub struct EngineConfig { pub tokenizer: Option<TokenizerConfig>, #[serde(flatten)] pub dict: Option<DictConfig>, + #[serde(with = "serde_regex")] + pub extend_ignore_re: Vec<regex::Regex>, } impl EngineConfig { @@ -298,6 +300,7 @@ impl EngineConfig { .unwrap_or_else(TokenizerConfig::from_defaults), ), dict: Some(empty.dict.unwrap_or_else(DictConfig::from_defaults)), + extend_ignore_re: Default::default(), } } @@ -327,6 +330,8 @@ impl EngineConfig { let mut dict = Some(dict); std::mem::swap(&mut dict, &mut self.dict); } + self.extend_ignore_re + .extend(source.extend_ignore_re.iter().cloned()); } pub fn binary(&self) -> bool { @@ -340,8 +345,29 @@ impl EngineConfig { pub fn check_file(&self) -> bool { self.check_file.unwrap_or(true) } + + pub fn extend_ignore_re(&self) -> Box<dyn Iterator<Item = &regex::Regex> + '_> { + Box::new(self.extend_ignore_re.iter()) + } } +impl PartialEq for EngineConfig { + fn eq(&self, rhs: &Self) -> bool { + self.binary == rhs.binary + && self.check_filename == rhs.check_filename + && self.check_file == rhs.check_file + && self.tokenizer == rhs.tokenizer + && self.dict == rhs.dict + && self + .extend_ignore_re + .iter() + .map(|r| r.as_str()) 
.eq(rhs.extend_ignore_re.iter().map(|r| r.as_str())) + } +} + +impl Eq for EngineConfig {} + #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(deny_unknown_fields)] #[serde(default)] diff --git a/crates/typos-cli/src/file.rs b/crates/typos-cli/src/file.rs index f727f79..cd6924e 100644 --- a/crates/typos-cli/src/file.rs +++ b/crates/typos-cli/src/file.rs @@ -48,7 +48,14 @@ impl FileChecker for Typos { reporter.report(msg.into())?; } else { let mut accum_line_num = AccumulateLineNum::new(); + let mut ignores: Option<Ignores> = None; for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(typo.span()) + { + continue; + } let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); let (line, line_offset) = extract_line(&buffer, typo.byte_offset); let msg = report::Typo { @@ -86,7 +93,14 @@ impl FileChecker for FixTypos { } else { let mut fixes = Vec::new(); let mut accum_line_num = AccumulateLineNum::new(); + let mut ignores: Option<Ignores> = None; for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(typo.span()) + { + continue; + } if is_fixable(&typo) { fixes.push(typo.into_owned()); } else { @@ -163,7 +177,14 @@ impl FileChecker for DiffTypos { } else { let mut fixes = Vec::new(); let mut accum_line_num = AccumulateLineNum::new(); + let mut ignores: Option<Ignores> = None; for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(typo.span()) + { + continue; + } if is_fixable(&typo) { fixes.push(typo.into_owned()); } else { @@ -276,7 +297,14 @@ impl FileChecker for Identifiers { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { + let mut ignores: Option<Ignores> = None; for word in 
policy.tokenizer.parse_bytes(&buffer) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(word.span()) + { + continue; + } // HACK: Don't look up the line_num per entry to better match the performance // of Typos for comparison purposes. We don't really get much out of it // anyway. @@ -329,11 +357,18 @@ impl FileChecker for Words { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { + let mut ignores: Option<Ignores> = None; for word in policy .tokenizer .parse_bytes(&buffer) .flat_map(|i| i.split()) { + if ignores + .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore)) + .is_ignored(word.span()) + { + continue; + } // HACK: Don't look up the line_num per entry to better match the performance // of Typos for comparison purposes. We don't really get much out of it // anyway. @@ -644,6 +679,33 @@ fn walk_entry( Ok(()) } +#[derive(Clone, Debug)] +struct Ignores { + blocks: Vec<std::ops::Range<usize>>, +} + +impl Ignores { + fn new(content: &[u8], ignores: &[regex::Regex]) -> Self { + let mut blocks = Vec::new(); + if let Ok(content) = std::str::from_utf8(content) { + for ignore in ignores { + for mat in ignore.find_iter(content) { + blocks.push(mat.range()); + } + } + } + Self { blocks } + } + + fn is_ignored(&self, span: std::ops::Range<usize>) -> bool { + let start = span.start; + let end = span.end.saturating_sub(1); + self.blocks + .iter() + .any(|block| block.contains(&start) || block.contains(&end)) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/crates/typos-cli/src/policy.rs b/crates/typos-cli/src/policy.rs index 054729d..8870341 100644 --- a/crates/typos-cli/src/policy.rs +++ b/crates/typos-cli/src/policy.rs @@ -42,6 +42,7 @@ pub struct ConfigEngine<'s> { walk: Intern<crate::config::Walk>, tokenizer: Intern<typos::tokens::Tokenizer>, dict: Intern<crate::dict::Override<'s, 's, crate::dict::BuiltIn>>, + ignore: Intern<Vec<regex::Regex>>, } impl<'s> ConfigEngine<'s> { @@ -54,6 +55,7 @@ impl<'s> ConfigEngine<'s> { walk: Default::default(), tokenizer: Default::default(), dict: Default::default(), + ignore: Default::default(), } } @@ 
-88,7 +90,7 @@ impl<'s> ConfigEngine<'s> { dir.type_matcher.definitions() } - pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_> { + pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_, '_> { debug_assert!(path.is_absolute(), "{} is not absolute", path.display()); let dir = self.get_dir(path).expect("`walk()` should be called first"); let (file_type, file_config) = dir.get_file_config(path); @@ -99,6 +101,7 @@ impl<'s> ConfigEngine<'s> { binary: file_config.binary, tokenizer: self.get_tokenizer(&file_config), dict: self.get_dict(&file_config), + ignore: self.get_ignore(&file_config), } } @@ -114,6 +117,10 @@ impl<'s> ConfigEngine<'s> { self.dict.get(file.dict) } + fn get_ignore(&self, file: &FileConfig) -> &[regex::Regex] { + self.ignore.get(file.ignore) + } + fn get_dir(&self, path: &std::path::Path) -> Option<&DirConfig> { for path in path.ancestors() { if let Some(dir) = self.configs.get(path) { @@ -220,7 +227,10 @@ impl<'s> ConfigEngine<'s> { let check_filename = engine.check_filename(); let check_file = engine.check_file(); let crate::config::EngineConfig { - tokenizer, dict, .. + tokenizer, + dict, + extend_ignore_re, + .. 
} = engine; let tokenizer_config = tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); @@ -254,12 +264,15 @@ impl<'s> ConfigEngine<'s> { let dict = self.dict.intern(dict); let tokenizer = self.tokenizer.intern(tokenizer); + let ignore = self.ignore.intern(extend_ignore_re); + FileConfig { check_filenames: check_filename, check_files: check_file, binary, tokenizer, dict, + ignore, } } } @@ -328,20 +341,22 @@ struct FileConfig { check_filenames: bool, check_files: bool, binary: bool, + ignore: usize, } #[non_exhaustive] #[derive(derive_setters::Setters)] -pub struct Policy<'t, 'd> { +pub struct Policy<'t, 'd, 'i> { pub check_filenames: bool, pub check_files: bool, pub file_type: Option<&'d str>, pub binary: bool, pub tokenizer: &'t typos::tokens::Tokenizer, pub dict: &'d dyn typos::Dictionary, + pub ignore: &'i [regex::Regex], } -impl<'t, 'd> Policy<'t, 'd> { +impl<'t, 'd, 'i> Policy<'t, 'd, 'i> { pub fn new() -> Self { Default::default() } @@ -350,8 +365,9 @@ impl<'t, 'd> Policy<'t, 'd> { static DEFAULT_TOKENIZER: once_cell::sync::Lazy<typos::tokens::Tokenizer> = once_cell::sync::Lazy::new(typos::tokens::Tokenizer::new); static DEFAULT_DICT: crate::dict::BuiltIn = crate::dict::BuiltIn::new(crate::config::Locale::En); +static DEFAULT_IGNORE: &[regex::Regex] = &[]; -impl<'t, 'd> Default for Policy<'t, 'd> { +impl<'t, 'd, 'i> Default for Policy<'t, 'd, 'i> { fn default() -> Self { Self { check_filenames: true, @@ -360,6 +376,7 @@ binary: false, tokenizer: &DEFAULT_TOKENIZER, dict: &DEFAULT_DICT, + ignore: DEFAULT_IGNORE, } } } diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml index de98037..f248fcb 100644 --- a/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml @@ -1,5 +1,8 @@ [files] extend-exclude = ["_typos.toml"] +[default] +extend-ignore-re = ["`.*`"] + 
[default.extend-identifiers] hello = "goodbye" diff --git a/crates/typos-cli/tests/cmd/extend-ignore-re.toml b/crates/typos-cli/tests/cmd/extend-ignore-re.toml index f87f083..af1b948 100644 --- a/crates/typos-cli/tests/cmd/extend-ignore-re.toml +++ b/crates/typos-cli/tests/cmd/extend-ignore-re.toml @@ -7,12 +7,6 @@ error: `hello` should be `goodbye` 1 | hello `hello` | ^^^^^ | -error: `hello` should be `goodbye` - --> ./file.ignore:1:8 - | -1 | hello `hello` - | ^^^^^ - | """ stderr = "" status.code = 2 diff --git a/crates/typos/src/check.rs b/crates/typos/src/check.rs index cfe8372..c20edde 100644 --- a/crates/typos/src/check.rs +++ b/crates/typos/src/check.rs @@ -86,6 +86,12 @@ impl<'m> Typo<'m> { corrections: self.corrections.borrow(), } } + + pub fn span(&self) -> std::ops::Range<usize> { + let start = self.byte_offset; + let end = start + self.typo.len(); + start..end + } } impl<'m> Default for Typo<'m> { diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index c5ae1f8..db08914 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -634,6 +634,13 @@ impl<'t> Identifier<'t> { self.offset } + #[inline] + pub fn span(&self) -> std::ops::Range<usize> { + let start = self.offset; + let end = start + self.token.len(); + start..end + } + /// Split into individual Words. #[inline] pub fn split(&self) -> impl Iterator<Item = Word<'t>> { @@ -702,6 +709,13 @@ impl<'t> Word<'t> { pub fn offset(&self) -> usize { self.offset } + + #[inline] + pub fn span(&self) -> std::ops::Range<usize> { + let start = self.offset; + let end = start + self.token.len(); + start..end + } } struct SplitIdent<'s> { diff --git a/docs/reference.md b/docs/reference.md index d5d69c6..32ce1dd 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -27,6 +27,7 @@ Configuration is read from the following (in precedence order) | default.check-file | \- | bool | Verifying spelling in files. 
| | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | +| default.extend-ignore-re | \- | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Custom uncorrectable sections (e.g. markdown code fences, PGP signatures, etc) | | default.extend-identifiers | \- | table of strings | Corrections for [identifiers](./design.md#identifiers-and-words). When the correction is blank, the identifier is never valid. When the correction is the key, the identifier is always valid. | | default.extend-ignore-identifiers-re | \- | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Pattern-match always-valid identifiers | | default.extend-words | \- | table of strings | Corrections for [words](./design.md#identifiers-and-words). When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |