diff --git a/Cargo.lock b/Cargo.lock
index e088a4c..c7103e0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1032,7 +1032,7 @@ version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
 dependencies = [
- "unicode-xid 0.2.1",
+ "unicode-xid 0.2.2",
 ]

 [[package]]
@@ -1391,7 +1391,7 @@ checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
 dependencies = [
  "proc-macro2 1.0.24",
  "quote 1.0.9",
- "unicode-xid 0.2.1",
+ "unicode-xid 0.2.2",
 ]

 [[package]]
@@ -1499,12 +1499,13 @@ dependencies = [
  "anyhow",
  "itertools 0.10.0",
  "log",
+ "nom",
  "once_cell",
- "regex",
  "serde",
  "simdutf8",
  "thiserror",
  "unicode-segmentation",
+ "unicode-xid 0.2.2",
 ]

 [[package]]
@@ -1638,9 +1639,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"

 [[package]]
 name = "unicode-xid"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
+checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"

 [[package]]
 name = "uuid"
diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml
index ae8a385..d44573f 100644
--- a/crates/typos/Cargo.toml
+++ b/crates/typos/Cargo.toml
@@ -17,7 +17,8 @@ codecov = { repository = "crate-ci/typos" }
 [dependencies]
 anyhow = "1.0"
 thiserror = "1.0"
-regex = "1.3"
+nom = "6.0"
+unicode-xid = "0.2.2"
 once_cell = "1.2.0"
 serde = { version = "1.0", features = ["derive"] }
 simdutf8 = "0.1.1"
diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 0c80410..ae78716 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -3,9 +3,6 @@
 pub struct TokenizerBuilder {
     ignore_hex: bool,
     leading_digits: bool,
-    leading_chars: String,
-    include_digits: bool,
-    include_chars: String,
 }

 impl TokenizerBuilder {
@@ -25,60 +22,23 @@ impl TokenizerBuilder {
         self
     }

-    /// Extend accepted leading characters for Identifiers.
-    pub fn leading_chars(&mut self, chars: String) -> &mut Self {
-        self.leading_chars = chars;
-        self
-    }
-
-    /// Specify that digits can be included in Identifiers.
-    pub fn include_digits(&mut self, yes: bool) -> &mut Self {
-        self.include_digits = yes;
-        self
-    }
-
-    /// Extend accepted characters for Identifiers.
-    pub fn include_chars(&mut self, chars: String) -> &mut Self {
-        self.include_chars = chars;
-        self
-    }
-
     pub fn build(&self) -> Tokenizer {
-        let mut pattern = r#"\b("#.to_owned();
-        Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
-        Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
-        pattern.push_str(r#"*)\b"#);
-
-        let words_str = regex::Regex::new(&pattern).unwrap();
-
+        let TokenizerBuilder {
+            leading_digits,
+            ignore_hex,
+        } = self.clone();
         Tokenizer {
-            words_str,
-            leading_digits: self.leading_digits,
-            ignore_hex: self.ignore_hex,
+            leading_digits,
+            ignore_hex,
         }
     }
-
-    fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
-        pattern.push_str(r#"(\p{Alphabetic}"#);
-        if digits {
-            pattern.push_str(r#"|\d"#);
-        }
-        for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
-            let escaped = regex::escape(&grapheme);
-            pattern.push_str(&format!("|{}", escaped));
-        }
-        pattern.push(')');
-    }
 }

 impl Default for TokenizerBuilder {
     fn default() -> Self {
         Self {
-            ignore_hex: true,
             leading_digits: false,
-            leading_chars: "_".to_owned(),
-            include_digits: true,
-            include_chars: "_'".to_owned(),
+            ignore_hex: true,
         }
     }
 }
@@ -86,7 +46,6 @@ impl Default for TokenizerBuilder {
 /// Extract Identifiers from a buffer.
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
-    words_str: regex::Regex,
     leading_digits: bool,
     ignore_hex: bool,
 }
@@ -97,10 +56,15 @@ impl Tokenizer {
     }

     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
-        self.words_str
-            .find_iter(content)
-            .filter(move |m| self.accept(m.as_str()))
-            .map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start()))
+        parser::iter_literals(content).filter_map(move |identifier| {
+            let case = Case::None;
+            let offset = offset(content.as_bytes(), identifier.as_bytes());
+            if self.accept(identifier) {
+                Some(Identifier::new_unchecked(identifier, case, offset))
+            } else {
+                None
+            }
+        })
     }

     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
@@ -216,6 +180,39 @@ fn is_hex_digit(chr: u8) -> bool {
     chr.is_ascii_hexdigit()
 }

+mod parser {
+    use nom::bytes::complete::*;
+    use nom::sequence::*;
+    use nom::IResult;
+
+    pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
+        std::iter::from_fn(move || match next_literal(input) {
+            Ok((i, o)) => {
+                input = i;
+                assert_ne!(o, "");
+                Some(o)
+            }
+            _ => None,
+        })
+    }
+
+    fn next_literal(input: &str) -> IResult<&str, &str> {
+        preceded(literal_sep, identifier)(input)
+    }
+
+    fn literal_sep(input: &str) -> IResult<&str, &str> {
+        take_till(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
+    }
+
+    fn identifier(input: &str) -> IResult<&str, &str> {
+        // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
+        // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
+        // or unexpected cases than strip off start characters to a word since we aren't doing a
+        // proper word boundary parse
+        take_while1(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
+    }
+}
+
 /// A term composed of Words.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
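The core of the tokenizer change above: rather than assembling a regex from the configured character sets at runtime, identifiers are now scanned as maximal runs of `XID_Continue` characters. A minimal standalone sketch of that scan, using the same nom 6 and unicode-xid calls as the patch (the `main` harness is illustrative only, not part of the change):

```rust
use nom::bytes::complete::{take_till, take_while1};
use nom::sequence::preceded;
use nom::IResult;

/// Skip anything that cannot continue an identifier.
fn literal_sep(input: &str) -> IResult<&str, &str> {
    take_till(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
}

/// A maximal `{XID_Continue}+` run, as in the patch's `identifier` parser.
fn identifier(input: &str) -> IResult<&str, &str> {
    take_while1(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
}

/// Skip separators, then take the next identifier.
fn next_literal(input: &str) -> IResult<&str, &str> {
    preceded(literal_sep, identifier)(input)
}

fn main() {
    let mut input = "let _x2 = y + z1; // done";
    while let Ok((rest, ident)) = next_literal(input) {
        println!("{}", ident); // let, _x2, y, z1, done
        input = rest;
    }
}
```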
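`parse_str` also calls an `offset` helper that this diff does not show. Presumably it recovers the byte position of the borrowed identifier within the original buffer, along these lines (an assumption about the surrounding file, not code from the patch):

```rust
/// Hypothetical sketch of the `offset` helper `parse_str` relies on: the nom
/// parsers hand back subslices of the original buffer, so the identifier's
/// position falls out of plain pointer arithmetic.
fn offset(base: &[u8], needle: &[u8]) -> usize {
    debug_assert!(base.as_ptr() <= needle.as_ptr());
    needle.as_ptr() as usize - base.as_ptr() as usize
}
```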
diff --git a/docs/reference.md b/docs/reference.md
index 1feda8a..ccfa36a 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -25,9 +25,6 @@ Configuration is read from the following (in precedence order)
 | default.check-file | \- | bool | Verifying spelling in files. |
 | default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
 | default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
-| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
-| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
-| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
 | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
 | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
 | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
diff --git a/src/config.rs b/src/config.rs
index 285009e..b8ca31a 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -234,12 +234,6 @@ pub struct TokenizerConfig {
     pub ignore_hex: Option<bool>,
     /// Allow identifiers to start with digits, in addition to letters.
     pub identifier_leading_digits: Option<bool>,
-    /// Allow identifiers to start with one of these characters.
-    pub identifier_leading_chars: Option<kstring::KString>,
-    /// Allow identifiers to include digits, in addition to letters.
-    pub identifier_include_digits: Option<bool>,
-    /// Allow identifiers to include these characters.
-    pub identifier_include_chars: Option<kstring::KString>,
 }

 impl TokenizerConfig {
@@ -248,13 +242,6 @@
         Self {
             ignore_hex: Some(empty.ignore_hex()),
             identifier_leading_digits: Some(empty.identifier_leading_digits()),
-            identifier_leading_chars: Some(kstring::KString::from_ref(
-                empty.identifier_leading_chars(),
-            )),
-            identifier_include_digits: Some(empty.identifier_include_digits()),
-            identifier_include_chars: Some(kstring::KString::from_ref(
-                empty.identifier_include_chars(),
-            )),
         }
     }

@@ -265,15 +252,6 @@
         if let Some(source) = source.identifier_leading_digits {
             self.identifier_leading_digits = Some(source);
         }
-        if let Some(source) = source.identifier_leading_chars.as_ref() {
-            self.identifier_leading_chars = Some(source.clone());
-        }
-        if let Some(source) = source.identifier_include_digits {
-            self.identifier_include_digits = Some(source);
-        }
-        if let Some(source) = source.identifier_include_chars.as_ref() {
-            self.identifier_include_chars = Some(source.clone());
-        }
     }

     pub fn ignore_hex(&self) -> bool {
         self.ignore_hex.unwrap_or(true)
     }

     pub fn identifier_leading_digits(&self) -> bool {
         self.identifier_leading_digits.unwrap_or(false)
     }
-
-    pub fn identifier_leading_chars(&self) -> &str {
-        self.identifier_leading_chars.as_deref().unwrap_or("_")
-    }
-
-    pub fn identifier_include_digits(&self) -> bool {
-        self.identifier_include_digits.unwrap_or(true)
-    }
-
-    pub fn identifier_include_chars(&self) -> &str {
-        self.identifier_include_chars.as_deref().unwrap_or("_'")
-    }
 }

 #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
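The merge rule for the two surviving options is unchanged by this patch; a self-contained rendition of it for clarity (field and method names as in the patch, serde attributes dropped for brevity):

```rust
#[derive(Debug, Clone, Default)]
pub struct TokenizerConfig {
    pub ignore_hex: Option<bool>,
    pub identifier_leading_digits: Option<bool>,
}

impl TokenizerConfig {
    /// A later layer only overrides the fields it explicitly sets.
    pub fn update(&mut self, source: &TokenizerConfig) {
        if let Some(source) = source.ignore_hex {
            self.ignore_hex = Some(source);
        }
        if let Some(source) = source.identifier_leading_digits {
            self.identifier_leading_digits = Some(source);
        }
    }

    pub fn ignore_hex(&self) -> bool {
        self.ignore_hex.unwrap_or(true)
    }

    pub fn identifier_leading_digits(&self) -> bool {
        self.identifier_leading_digits.unwrap_or(false)
    }
}

fn main() {
    let mut config = TokenizerConfig::default();
    let overlay = TokenizerConfig {
        ignore_hex: Some(false),
        ..Default::default()
    };
    config.update(&overlay);
    assert!(!config.ignore_hex()); // explicitly overridden
    assert!(!config.identifier_leading_digits()); // still the built-in default
}
```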
diff --git a/src/policy.rs b/src/policy.rs
index 7393be0..725a8eb 100644
--- a/src/policy.rs
+++ b/src/policy.rs
@@ -219,9 +219,6 @@ impl<'s> ConfigEngine<'s> {
         let tokenizer = typos::tokens::TokenizerBuilder::new()
             .ignore_hex(tokenizer_config.ignore_hex())
             .leading_digits(tokenizer_config.identifier_leading_digits())
-            .leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
-            .include_digits(tokenizer_config.identifier_include_digits())
-            .include_chars(tokenizer_config.identifier_include_chars().to_owned())
             .build();

         let dict = crate::dict::BuiltIn::new(dict_config.locale());
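End to end, the wiring above boils down to usage like this (a sketch against the API as it stands after the patch; the sample input is arbitrary):

```rust
fn main() {
    let tokenizer = typos::tokens::TokenizerBuilder::new()
        .ignore_hex(true)       // the two knobs that remain
        .leading_digits(false)
        .build();
    // The nom scanner yields XID_Continue runs; `accept` then filters them
    // according to the flags above (e.g. dropping digit-leading tokens here).
    for identifier in tokenizer.parse_str("0xDEADBEEF readme_ 2fast") {
        println!("{:?}", identifier);
    }
}
```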