Merge pull request #695 from epage/ignore

feat(config): Custom ignores
2025-01-26 08:28:59 -05:00 · 2023-03-22 15:53:24 -05:00 · 2023-03-22 15:53:24 -05:00 · 5253e5589b
commit 5253e5589b
parent 0d46368bfa ac46a6ba54
10 changed files with 154 additions and 6 deletions
--- a/crates/typos-cli/src/bin/typos-cli/args.rs
+++ b/crates/typos-cli/src/bin/typos-cli/args.rs
@ -156,6 +156,7 @@ impl FileArgs {
                locale: self.locale,
                ..Default::default()
            }),
+            extend_ignore_re: Default::default(),
        }
    }

--- a/crates/typos-cli/src/config.rs
+++ b/crates/typos-cli/src/config.rs
@ -268,7 +268,7 @@ impl GlobEngineConfig {
    }
 }

-#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
 //#[serde(deny_unknown_fields)]  // Doesn't work with `flatten`
 #[serde(default)]
 #[serde(rename_all = "kebab-case")]
@ -283,6 +283,8 @@ pub struct EngineConfig {
    pub tokenizer: Option<TokenizerConfig>,
    #[serde(flatten)]
    pub dict: Option<DictConfig>,
+    #[serde(with = "serde_regex")]
+    pub extend_ignore_re: Vec<regex::Regex>,
 }

 impl EngineConfig {
@ -298,6 +300,7 @@ impl EngineConfig {
                    .unwrap_or_else(TokenizerConfig::from_defaults),
            ),
            dict: Some(empty.dict.unwrap_or_else(DictConfig::from_defaults)),
+            extend_ignore_re: Default::default(),
        }
    }

@ -327,6 +330,8 @@ impl EngineConfig {
            let mut dict = Some(dict);
            std::mem::swap(&mut dict, &mut self.dict);
        }
+        self.extend_ignore_re
+            .extend(source.extend_ignore_re.iter().cloned());
    }

    pub fn binary(&self) -> bool {
@ -340,8 +345,29 @@ impl EngineConfig {
    pub fn check_file(&self) -> bool {
        self.check_file.unwrap_or(true)
    }
+
+    pub fn extend_ignore_re(&self) -> Box<dyn Iterator<Item = &regex::Regex> + '_> {
+        Box::new(self.extend_ignore_re.iter()) // boxed borrowing iterator over self.extend_ignore_re, in stored order
+    }
 }

+impl PartialEq for EngineConfig { // hand-written: this commit drops PartialEq/Eq from the derive list when adding the Vec<regex::Regex> field
+    fn eq(&self, rhs: &Self) -> bool {
+        self.binary == rhs.binary // plain fields are compared directly, field by field
+            && self.check_filename == rhs.check_filename
+            && self.check_file == rhs.check_file
+            && self.tokenizer == rhs.tokenizer
+            && self.dict == rhs.dict
+            && self // regexes are compared through their as_str() text rather than as Regex values
+                .extend_ignore_re
+                .iter()
+                .map(|r| r.as_str())
+                .eq(rhs.extend_ignore_re.iter().map(|r| r.as_str())) // Iterator::eq: same length and pairwise-equal strings, so order matters
+    }
+}
+
+impl Eq for EngineConfig {} // marker impl paired with the manual PartialEq above it in this hunk
+
 #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(deny_unknown_fields)]
 #[serde(default)]
--- a/crates/typos-cli/src/file.rs
+++ b/crates/typos-cli/src/file.rs
@ -48,7 +48,14 @@ impl FileChecker for Typos {
                reporter.report(msg.into())?;
            } else {
                let mut accum_line_num = AccumulateLineNum::new();
+                let mut ignores: Option<Ignores> = None;
                for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) {
+                    if ignores
+                        .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                        .is_ignored(typo.span())
+                    {
+                        continue;
+                    }
                    let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
                    let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
                    let msg = report::Typo {
@ -86,7 +93,14 @@ impl FileChecker for FixTypos {
            } else {
                let mut fixes = Vec::new();
                let mut accum_line_num = AccumulateLineNum::new();
+                let mut ignores: Option<Ignores> = None;
                for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) {
+                    if ignores
+                        .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                        .is_ignored(typo.span())
+                    {
+                        continue;
+                    }
                    if is_fixable(&typo) {
                        fixes.push(typo.into_owned());
                    } else {
@ -163,7 +177,14 @@ impl FileChecker for DiffTypos {
            } else {
                let mut fixes = Vec::new();
                let mut accum_line_num = AccumulateLineNum::new();
+                let mut ignores: Option<Ignores> = None;
                for typo in typos::check_bytes(&buffer, policy.tokenizer, policy.dict) {
+                    if ignores
+                        .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                        .is_ignored(typo.span())
+                    {
+                        continue;
+                    }
                    if is_fixable(&typo) {
                        fixes.push(typo.into_owned());
                    } else {
@ -276,7 +297,14 @@ impl FileChecker for Identifiers {
                let msg = report::BinaryFile { path };
                reporter.report(msg.into())?;
            } else {
+                let mut ignores: Option<Ignores> = None;
                for word in policy.tokenizer.parse_bytes(&buffer) {
+                    if ignores
+                        .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                        .is_ignored(word.span())
+                    {
+                        continue;
+                    }
                    // HACK: Don't look up the line_num per entry to better match the performance
                    // of Typos for comparison purposes.  We don't really get much out of it
                    // anyway.
@ -329,11 +357,18 @@ impl FileChecker for Words {
                let msg = report::BinaryFile { path };
                reporter.report(msg.into())?;
            } else {
+                let mut ignores: Option<Ignores> = None;
                for word in policy
                    .tokenizer
                    .parse_bytes(&buffer)
                    .flat_map(|i| i.split())
                {
+                    if ignores
+                        .get_or_insert_with(|| Ignores::new(&buffer, policy.ignore))
+                        .is_ignored(word.span())
+                    {
+                        continue;
+                    }
                    // HACK: Don't look up the line_num per entry to better match the performance
                    // of Typos for comparison purposes.  We don't really get much out of it
                    // anyway.
@ -644,6 +679,33 @@ fn walk_entry(
    Ok(())
 }

+#[derive(Clone, Debug)]
+struct Ignores { // ranges of one buffer that matched any of the ignore regexes
+    blocks: Vec<std::ops::Range<usize>>, // one entry per regex match; never sorted or merged, so entries may overlap
+}
+
+impl Ignores {
+    fn new(content: &[u8], ignores: &[regex::Regex]) -> Self { // collects the match range of every regex in `ignores` over `content`
+        let mut blocks = Vec::new();
+        if let Ok(content) = std::str::from_utf8(content) { // non-UTF-8 content is skipped without error: blocks stays empty and nothing is ignored
+            for ignore in ignores {
+                for mat in ignore.find_iter(content) {
+                    blocks.push(mat.range()); // range of this match within `content`
+                }
+            }
+        }
+        Self { blocks }
+    }
+
+    fn is_ignored(&self, span: std::ops::Range<usize>) -> bool { // true when the first or the last position of `span` falls inside any block
+        let start = span.start;
+        let end = span.end.saturating_sub(1); // span.end is exclusive, so step back to the last covered position (saturating at 0)
+        self.blocks
+            .iter()
+            .any(|block| block.contains(&start) || block.contains(&end)) // endpoint test only: a block lying strictly inside `span` does not count
+    }
+}
+
 #[cfg(test)]
 mod test {
    use super::*;
--- a/crates/typos-cli/src/policy.rs
+++ b/crates/typos-cli/src/policy.rs
@ -42,6 +42,7 @@ pub struct ConfigEngine<'s> {
    walk: Intern<crate::config::Walk>,
    tokenizer: Intern<typos::tokens::Tokenizer>,
    dict: Intern<crate::dict::Override<'s, 's, crate::dict::BuiltIn>>,
+    ignore: Intern<Vec<regex::Regex>>,
 }

 impl<'s> ConfigEngine<'s> {
@ -54,6 +55,7 @@ impl<'s> ConfigEngine<'s> {
            walk: Default::default(),
            tokenizer: Default::default(),
            dict: Default::default(),
+            ignore: Default::default(),
        }
    }

@ -88,7 +90,7 @@ impl<'s> ConfigEngine<'s> {
        dir.type_matcher.definitions()
    }

-    pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_> {
+    pub fn policy(&self, path: &std::path::Path) -> Policy<'_, '_, '_> {
        debug_assert!(path.is_absolute(), "{} is not absolute", path.display());
        let dir = self.get_dir(path).expect("`walk()` should be called first");
        let (file_type, file_config) = dir.get_file_config(path);
@ -99,6 +101,7 @@ impl<'s> ConfigEngine<'s> {
            binary: file_config.binary,
            tokenizer: self.get_tokenizer(&file_config),
            dict: self.get_dict(&file_config),
+            ignore: self.get_ignore(&file_config),
        }
    }

@ -114,6 +117,10 @@ impl<'s> ConfigEngine<'s> {
        self.dict.get(file.dict)
    }

+    fn get_ignore(&self, file: &FileConfig) -> &[regex::Regex] {
+        self.ignore.get(file.ignore) // file.ignore is the usize stored from self.ignore.intern(extend_ignore_re) when the FileConfig was built
+    }
+
    fn get_dir(&self, path: &std::path::Path) -> Option<&DirConfig> {
        for path in path.ancestors() {
            if let Some(dir) = self.configs.get(path) {
@ -220,7 +227,10 @@ impl<'s> ConfigEngine<'s> {
        let check_filename = engine.check_filename();
        let check_file = engine.check_file();
        let crate::config::EngineConfig {
-            tokenizer, dict, ..
+            tokenizer,
+            dict,
+            extend_ignore_re,
+            ..
        } = engine;
        let tokenizer_config =
            tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults);
@ -254,12 +264,15 @@ impl<'s> ConfigEngine<'s> {
        let dict = self.dict.intern(dict);
        let tokenizer = self.tokenizer.intern(tokenizer);

+        let ignore = self.ignore.intern(extend_ignore_re);
+
        FileConfig {
            check_filenames: check_filename,
            check_files: check_file,
            binary,
            tokenizer,
            dict,
+            ignore,
        }
    }
 }
@ -328,20 +341,22 @@ struct FileConfig {
    check_filenames: bool,
    check_files: bool,
    binary: bool,
+    ignore: usize,
 }

 #[non_exhaustive]
 #[derive(derive_setters::Setters)]
-pub struct Policy<'t, 'd> {
+pub struct Policy<'t, 'd, 'i> {
    pub check_filenames: bool,
    pub check_files: bool,
    pub file_type: Option<&'d str>,
    pub binary: bool,
    pub tokenizer: &'t typos::tokens::Tokenizer,
    pub dict: &'d dyn typos::Dictionary,
+    pub ignore: &'i [regex::Regex],
 }

-impl<'t, 'd> Policy<'t, 'd> {
+impl<'t, 'd, 'i> Policy<'t, 'd, 'i> {
    pub fn new() -> Self {
        Default::default()
    }
@ -350,8 +365,9 @@ impl<'t, 'd> Policy<'t, 'd> {
 static DEFAULT_TOKENIZER: once_cell::sync::Lazy<typos::tokens::Tokenizer> =
    once_cell::sync::Lazy::new(typos::tokens::Tokenizer::new);
 static DEFAULT_DICT: crate::dict::BuiltIn = crate::dict::BuiltIn::new(crate::config::Locale::En);
+static DEFAULT_IGNORE: &[regex::Regex] = &[];

-impl<'t, 'd> Default for Policy<'t, 'd> {
+impl<'t, 'd, 'i> Default for Policy<'t, 'd, 'i> {
    fn default() -> Self {
        Self {
            check_filenames: true,
@ -360,6 +376,7 @@ impl<'t, 'd> Default for Policy<'t, 'd> {
            binary: false,
            tokenizer: &DEFAULT_TOKENIZER,
            dict: &DEFAULT_DICT,
+            ignore: DEFAULT_IGNORE,
        }
    }
 }
--- a/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml
+++ b/crates/typos-cli/tests/cmd/extend-ignore-re.in/_typos.toml
@ -0,0 +1,8 @@
+[files]
+extend-exclude = ["_typos.toml"]
+
+[default]
+extend-ignore-re = ["`.*`"]
+
+[default.extend-identifiers]
+hello = "goodbye"
--- a/crates/typos-cli/tests/cmd/extend-ignore-re.in/file.ignore
+++ b/crates/typos-cli/tests/cmd/extend-ignore-re.in/file.ignore
@ -0,0 +1 @@
+hello `hello`
--- a/crates/typos-cli/tests/cmd/extend-ignore-re.toml
+++ b/crates/typos-cli/tests/cmd/extend-ignore-re.toml
@ -0,0 +1,12 @@
+bin.name = "typos"
+stdin = ""
+stdout = """
+error: `hello` should be `goodbye`
+  --> ./file.ignore:1:1
+  |
+1 | hello `hello`
+  | ^^^^^
+  |
+"""
+stderr = ""
+status.code = 2
--- a/crates/typos/src/check.rs
+++ b/crates/typos/src/check.rs
@ -86,6 +86,12 @@ impl<'m> Typo<'m> {
            corrections: self.corrections.borrow(),
        }
    }
+
+    pub fn span(&self) -> std::ops::Range<usize> { // half-open range byte_offset..byte_offset + typo.len()
+        let start = self.byte_offset;
+        let end = start + self.typo.len(); // exclusive end
+        start..end
+    }
 }

 impl<'m> Default for Typo<'m> {
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@ -634,6 +634,13 @@ impl<'t> Identifier<'t> {
        self.offset
    }

+    #[inline]
+    pub fn span(&self) -> std::ops::Range<usize> { // half-open range offset..offset + token.len()
+        let start = self.offset;
+        let end = start + self.token.len(); // exclusive end
+        start..end
+    }
+
    /// Split into individual Words.
    #[inline]
    pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
@ -702,6 +709,13 @@ impl<'t> Word<'t> {
    pub fn offset(&self) -> usize {
        self.offset
    }
+
+    #[inline]
+    pub fn span(&self) -> std::ops::Range<usize> { // half-open range offset..offset + token.len()
+        let start = self.offset;
+        let end = start + self.token.len(); // exclusive end
+        start..end
+    }
 }

 struct SplitIdent<'s> {
--- a/docs/reference.md
+++ b/docs/reference.md
@ -27,6 +27,7 @@ Configuration is read from the following (in precedence order)
 | default.check-file     | \-                | bool   | Verifying spelling in files. |
 | default.unicode        | --unicode         | bool   | Allow unicode characters in identifiers (and not just ASCII) |
 | default.locale         | --locale          | en, en-us, en-gb, en-ca, en-au   | English dialect to correct to. |
+| default.extend-ignore-re   | \-            | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Custom uncorrectable sections (e.g. markdown code fences, PGP signatures, etc) |
 | default.extend-identifiers | \-            | table of strings | Corrections for [identifiers](./design.md#identifiers-and-words). When the correction is blank, the identifier is never valid. When the correction is the key, the identifier is always valid. |
 | default.extend-ignore-identifiers-re | \-            | list of [regexes](https://docs.rs/regex/latest/regex/index.html#syntax) | Pattern-match always-valid identifiers |
 | default.extend-words       | \-            | table of strings | Corrections for [words](./design.md#identifiers-and-words). When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |