fix(parser)!: Defer to Unicode XID for identifiers

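Identifier parsing now follows Unicode XID rules instead of user-supplied character sets, which saves us from having to offer configuration for every detail. This is a breaking change: the `identifier-leading-chars`, `identifier-include-digits`, and `identifier-include-chars` options are removed. If people need more control, we can offer it later. Fixes #225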
2024-11-28 12:01:06 -05:00 · 2021-04-21 20:30:32 -05:00 · 2021-04-21 20:30:32 -05:00 · 9cbc7410a4
commit 9cbc7410a4
parent f15cc58f71
6 changed files with 57 additions and 98 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1032,7 +1032,7 @@ version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
 dependencies = [
- "unicode-xid 0.2.1",
+ "unicode-xid 0.2.2",
 ]

 [[package]]
@ -1391,7 +1391,7 @@ checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
 dependencies = [
 "proc-macro2 1.0.24",
 "quote 1.0.9",
- "unicode-xid 0.2.1",
+ "unicode-xid 0.2.2",
 ]

 [[package]]
@ -1499,12 +1499,13 @@ dependencies = [
 "anyhow",
 "itertools 0.10.0",
 "log",
+ "nom",
 "once_cell",
- "regex",
 "serde",
 "simdutf8",
 "thiserror",
 "unicode-segmentation",
+ "unicode-xid 0.2.2",
 ]

 [[package]]
@ -1638,9 +1639,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"

 [[package]]
 name = "unicode-xid"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
+checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"

 [[package]]
 name = "uuid"
--- a/crates/typos/Cargo.toml
+++ b/crates/typos/Cargo.toml
@ -17,7 +17,8 @@ codecov = { repository = "crate-ci/typos" }
 [dependencies]
 anyhow = "1.0"
 thiserror = "1.0"
-regex = "1.3"
+nom = "6.0"
+unicode-xid = "0.2.2"
 once_cell = "1.2.0"
 serde = { version = "1.0", features = ["derive"] }
 simdutf8 = "0.1.1"
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@ -3,9 +3,6 @@
 pub struct TokenizerBuilder {
    ignore_hex: bool,
    leading_digits: bool,
-    leading_chars: String,
-    include_digits: bool,
-    include_chars: String,
 }

 impl TokenizerBuilder {
@ -25,60 +22,23 @@ impl TokenizerBuilder {
        self
    }

-    /// Extend accepted leading characters for Identifiers.
-    pub fn leading_chars(&mut self, chars: String) -> &mut Self {
-        self.leading_chars = chars;
-        self
-    }
-
-    /// Specify that digits can be included in Identifiers.
-    pub fn include_digits(&mut self, yes: bool) -> &mut Self {
-        self.include_digits = yes;
-        self
-    }
-
-    /// Extend accepted characters for Identifiers.
-    pub fn include_chars(&mut self, chars: String) -> &mut Self {
-        self.include_chars = chars;
-        self
-    }
-
    pub fn build(&self) -> Tokenizer {
-        let mut pattern = r#"\b("#.to_owned();
-        Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
-        Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
-        pattern.push_str(r#"*)\b"#);
-
-        let words_str = regex::Regex::new(&pattern).unwrap();
-
+        let TokenizerBuilder {
+            leading_digits,
+            ignore_hex,
+        } = self.clone();
        Tokenizer {
-            words_str,
-            leading_digits: self.leading_digits,
-            ignore_hex: self.ignore_hex,
+            leading_digits,
+            ignore_hex,
        }
    }
-
-    fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
-        pattern.push_str(r#"(\p{Alphabetic}"#);
-        if digits {
-            pattern.push_str(r#"|\d"#);
-        }
-        for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
-            let escaped = regex::escape(&grapheme);
-            pattern.push_str(&format!("|{}", escaped));
-        }
-        pattern.push(')');
-    }
 }

 impl Default for TokenizerBuilder {
    fn default() -> Self {
        Self {
-            ignore_hex: true,
            leading_digits: false,
-            leading_chars: "_".to_owned(),
-            include_digits: true,
-            include_chars: "_'".to_owned(),
+            ignore_hex: true,
        }
    }
 }
@ -86,7 +46,6 @@ impl Default for TokenizerBuilder {
 /// Extract Identifiers from a buffer.
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
-    words_str: regex::Regex,
    leading_digits: bool,
    ignore_hex: bool,
 }
@ -97,10 +56,15 @@ impl Tokenizer {
    }

    pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
-        self.words_str
-            .find_iter(content)
-            .filter(move |m| self.accept(m.as_str()))
-            .map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start()))
+        parser::iter_literals(content).filter_map(move |identifier| {
+            let case = Case::None;
+            let offset = offset(content.as_bytes(), identifier.as_bytes());
+            if self.accept(identifier) {
+                Some(Identifier::new_unchecked(identifier, case, offset))
+            } else {
+                None
+            }
+        })
    }

    pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
@ -216,6 +180,39 @@ fn is_hex_digit(chr: u8) -> bool {
    chr.is_ascii_hexdigit()
 }

+mod parser {
+    use nom::bytes::complete::*;
+    use nom::sequence::*;
+    use nom::IResult;
+
+    pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
+        std::iter::from_fn(move || match next_literal(input) {
+            Ok((i, o)) => {
+                input = i;
+                assert_ne!(o, "");
+                Some(o)
+            }
+            _ => None,
+        })
+    }
+
+    fn next_literal(input: &str) -> IResult<&str, &str> {
+        preceded(literal_sep, identifier)(input)
+    }
+
+    fn literal_sep(input: &str) -> IResult<&str, &str> {
+        take_till(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
+    }
+
+    fn identifier(input: &str) -> IResult<&str, &str> {
+        // Generally a language would be `{XID_Start}{XID_Continue}*`, but we go with only
+        // `{XID_Continue}+`: XID_Continue is a superset of XID_Start, and we would rather catch
+        // odd or unexpected cases than strip leading characters off a word, since we aren't
+        // doing a proper word-boundary parse.
+        take_while1(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
+    }
+}
+
 /// A term composed of Words.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
--- a/docs/reference.md
+++ b/docs/reference.md
@ -25,9 +25,6 @@ Configuration is read from the following (in precedence order)
 | default.check-file     | \-                | bool   | Verifying spelling in files. |
 | default.ignore-hex     | \-                | bool   | Do not check identifiers that appear to be hexadecimal values. |
 | default.identifier-leading-digits   | \-   | bool   | Allow identifiers to start with digits, in addition to letters. |
-| default.identifier-include-digits   | \-   | bool   | Allow identifiers to include digits, in addition to letters. |
-| default.identifier-leading-chars    | \-   | string | Allow identifiers to start with one of these characters. |
-| default.identifier-include-chars    | \-   | string | Allow identifiers to include these characters. |
 | default.locale         | --locale          | en, en-us, en-gb, en-ca, en-au   | English dialect to correct to. |
 | default.extend-identifiers | \-            | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
 | default.extend-words       | \-            | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
--- a/src/config.rs
+++ b/src/config.rs
@ -234,12 +234,6 @@ pub struct TokenizerConfig {
    pub ignore_hex: Option<bool>,
    /// Allow identifiers to start with digits, in addition to letters.
    pub identifier_leading_digits: Option<bool>,
-    /// Allow identifiers to start with one of these characters.
-    pub identifier_leading_chars: Option<kstring::KString>,
-    /// Allow identifiers to include digits, in addition to letters.
-    pub identifier_include_digits: Option<bool>,
-    /// Allow identifiers to include these characters.
-    pub identifier_include_chars: Option<kstring::KString>,
 }

 impl TokenizerConfig {
@ -248,13 +242,6 @@ impl TokenizerConfig {
        Self {
            ignore_hex: Some(empty.ignore_hex()),
            identifier_leading_digits: Some(empty.identifier_leading_digits()),
-            identifier_leading_chars: Some(kstring::KString::from_ref(
-                empty.identifier_leading_chars(),
-            )),
-            identifier_include_digits: Some(empty.identifier_include_digits()),
-            identifier_include_chars: Some(kstring::KString::from_ref(
-                empty.identifier_include_chars(),
-            )),
        }
    }

@ -265,15 +252,6 @@ impl TokenizerConfig {
        if let Some(source) = source.identifier_leading_digits {
            self.identifier_leading_digits = Some(source);
        }
-        if let Some(source) = source.identifier_leading_chars.as_ref() {
-            self.identifier_leading_chars = Some(source.clone());
-        }
-        if let Some(source) = source.identifier_include_digits {
-            self.identifier_include_digits = Some(source);
-        }
-        if let Some(source) = source.identifier_include_chars.as_ref() {
-            self.identifier_include_chars = Some(source.clone());
-        }
    }

    pub fn ignore_hex(&self) -> bool {
@ -283,18 +261,6 @@ impl TokenizerConfig {
    pub fn identifier_leading_digits(&self) -> bool {
        self.identifier_leading_digits.unwrap_or(false)
    }
-
-    pub fn identifier_leading_chars(&self) -> &str {
-        self.identifier_leading_chars.as_deref().unwrap_or("_")
-    }
-
-    pub fn identifier_include_digits(&self) -> bool {
-        self.identifier_include_digits.unwrap_or(true)
-    }
-
-    pub fn identifier_include_chars(&self) -> &str {
-        self.identifier_include_chars.as_deref().unwrap_or("_'")
-    }
 }

 #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
--- a/src/policy.rs
+++ b/src/policy.rs
@ -219,9 +219,6 @@ impl<'s> ConfigEngine<'s> {
        let tokenizer = typos::tokens::TokenizerBuilder::new()
            .ignore_hex(tokenizer_config.ignore_hex())
            .leading_digits(tokenizer_config.identifier_leading_digits())
-            .leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
-            .include_digits(tokenizer_config.identifier_include_digits())
-            .include_chars(tokenizer_config.identifier_include_chars().to_owned())
            .build();

        let dict = crate::dict::BuiltIn::new(dict_config.locale());