Merge pull request #66 from epage/digits

perf: Use standard identifier rules to avoid doing number checks
2025-01-23 23:18:57 -05:00 · 2019-11-02 19:55:34 -06:00 · 2019-11-02 19:55:34 -06:00 · 15210c928c
commit 15210c928c
parent c05ab4f9dc 19321d9e48
4 changed files with 95 additions and 27 deletions
--- a/benches/file.rs
+++ b/benches/file.rs
@ -52,32 +52,32 @@ fn bench_split_lines(data: &str, b: &mut test::Bencher) {
 }

 #[bench]
-fn parse_words_lines_empty(b: &mut test::Bencher) {
+fn parse_lines_empty(b: &mut test::Bencher) {
    bench_split_lines(data::EMPTY, b);
 }

 #[bench]
-fn parse_words_lines_no_tokens(b: &mut test::Bencher) {
+fn parse_lines_no_tokens(b: &mut test::Bencher) {
    bench_split_lines(data::NO_TOKENS, b);
 }

 #[bench]
-fn parse_words_lines_single_token(b: &mut test::Bencher) {
+fn parse_lines_single_token(b: &mut test::Bencher) {
    bench_split_lines(data::SINGLE_TOKEN, b);
 }

 #[bench]
-fn parse_words_lines_sherlock(b: &mut test::Bencher) {
+fn parse_lines_sherlock(b: &mut test::Bencher) {
    bench_split_lines(data::SHERLOCK, b);
 }

 #[bench]
-fn parse_words_lines_code(b: &mut test::Bencher) {
+fn parse_lines_code(b: &mut test::Bencher) {
    bench_split_lines(data::CODE, b);
 }

 #[bench]
-fn parse_words_lines_corpus(b: &mut test::Bencher) {
+fn parse_lines_corpus(b: &mut test::Bencher) {
    bench_split_lines(data::CORPUS, b);
 }

--- a/src/config.rs
+++ b/src/config.rs
@ -53,22 +53,32 @@ pub trait FileSource {
        None
    }

-    /// Verifying spelling in filess.
+    /// Verifying spelling in files.
    fn check_file(&self) -> Option<bool> {
        None
    }

-    /// Do not check identifiers that appear to be hexadecimal values
+    /// Do not check identifiers that appear to be hexadecimal values.
    fn ignore_hex(&self) -> Option<bool> {
        None
    }

-    /// Allow identifiers to include digits, in addition to letters
+    /// Allow identifiers to start with digits, in addition to letters.
+    fn identifier_leading_digits(&self) -> Option<bool> {
+        None
+    }
+
+    /// Allow identifiers to start with one of these characters.
+    fn identifier_leading_chars(&self) -> Option<&str> {
+        None
+    }
+
+    /// Allow identifiers to include digits, in addition to letters.
    fn identifier_include_digits(&self) -> Option<bool> {
        None
    }

-    /// Specify additional characters to be included in identifiers
+    /// Allow identifiers to include these characters.
    fn identifier_include_chars(&self) -> Option<&str> {
        None
    }
@ -233,6 +243,8 @@ pub struct FileConfig {
    pub check_filename: Option<bool>,
    pub check_file: Option<bool>,
    pub ignore_hex: Option<bool>,
+    pub identifier_leading_digits: Option<bool>,
+    pub identifier_leading_chars: Option<String>,
    pub identifier_include_digits: Option<bool>,
    pub identifier_include_chars: Option<String>,
 }
@ -248,6 +260,12 @@ impl FileConfig {
        if let Some(source) = source.ignore_hex() {
            self.ignore_hex = Some(source);
        }
+        if let Some(source) = source.identifier_leading_digits() {
+            self.identifier_leading_digits = Some(source);
+        }
+        if let Some(source) = source.identifier_leading_chars() {
+            self.identifier_leading_chars = Some(source.to_owned());
+        }
        if let Some(source) = source.identifier_include_digits() {
            self.identifier_include_digits = Some(source);
        }
@ -268,6 +286,17 @@ impl FileConfig {
        self.ignore_hex.unwrap_or(true)
    }

+    pub fn identifier_leading_digits(&self) -> bool {
+        self.identifier_leading_digits.unwrap_or(false)
+    }
+
+    pub fn identifier_leading_chars(&self) -> &str {
+        self.identifier_leading_chars
+            .as_ref()
+            .map(|s| s.as_str())
+            .unwrap_or("_")
+    }
+
    pub fn identifier_include_digits(&self) -> bool {
        self.identifier_include_digits.unwrap_or(true)
    }
@ -293,6 +322,14 @@ impl FileSource for FileConfig {
        self.ignore_hex
    }

+    fn identifier_leading_digits(&self) -> Option<bool> {
+        self.identifier_leading_digits
+    }
+
+    fn identifier_leading_chars(&self) -> Option<&str> {
+        self.identifier_leading_chars.as_ref().map(|s| s.as_str())
+    }
+
    fn identifier_include_digits(&self) -> Option<bool> {
        self.identifier_include_digits
    }
--- a/src/main.rs
+++ b/src/main.rs
@ -412,6 +412,8 @@ fn run() -> Result<i32, anyhow::Error> {

        let parser = typos::tokens::ParserBuilder::new()
            .ignore_hex(config.default.ignore_hex())
+            .leading_digits(config.default.identifier_leading_digits())
+            .leading_chars(config.default.identifier_leading_chars().to_owned())
            .include_digits(config.default.identifier_include_digits())
            .include_chars(config.default.identifier_include_chars().to_owned())
            .build();
--- a/typos/src/tokens.rs
+++ b/typos/src/tokens.rs
@ -9,6 +9,8 @@ pub enum Case {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParserBuilder {
    ignore_hex: bool,
+    leading_digits: bool,
+    leading_chars: String,
    include_digits: bool,
    include_chars: String,
 }
@ -23,6 +25,16 @@ impl ParserBuilder {
        self
    }

+    pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
+        self.leading_digits = yes;
+        self
+    }
+
+    pub fn leading_chars(&mut self, chars: String) -> &mut Self {
+        self.leading_chars = chars;
+        self
+    }
+
    pub fn include_digits(&mut self, yes: bool) -> &mut Self {
        self.include_digits = yes;
        self
@ -34,31 +46,44 @@ impl ParserBuilder {
    }

    pub fn build(&self) -> Parser {
-        let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
-        if self.include_digits {
-            pattern.push_str(r#"|\d"#);
-        }
-        for grapheme in
-            unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
-        {
-            let escaped = regex::escape(&grapheme);
-            pattern.push_str(&format!("|{}", escaped));
-        }
-        pattern.push_str(r#")+\b"#);
+        let mut pattern = r#"\b("#.to_owned();
+        Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
+        Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
+        pattern.push_str(r#"*)\b"#);
+        let pattern = dbg!(pattern);
+
        let words_str = regex::Regex::new(&pattern).unwrap();
        let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
+
        Parser {
            words_str,
            words_bytes,
-            ignore_hex: self.ignore_hex && self.include_digits,
+            // `leading_digits` lets us bypass the regexes since you can't have a decimal or
+            // hexadecimal number without a leading digit.
+            ignore_numbers: self.leading_digits,
+            ignore_hex: self.ignore_hex && self.leading_digits,
        }
    }
+
+    fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
+        pattern.push_str(r#"(\p{Alphabetic}"#);
+        if digits {
+            pattern.push_str(r#"|\d"#);
+        }
+        for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
+            let escaped = regex::escape(&grapheme);
+            pattern.push_str(&format!("|{}", escaped));
+        }
+        pattern.push_str(r#")"#);
+    }
 }

 impl Default for ParserBuilder {
    fn default() -> Self {
        Self {
            ignore_hex: true,
+            leading_digits: false,
+            leading_chars: "_".to_owned(),
            include_digits: true,
            include_chars: "_'".to_owned(),
        }
@ -69,6 +94,7 @@ impl Default for ParserBuilder {
 pub struct Parser {
    words_str: regex::Regex,
    words_bytes: regex::bytes::Regex,
+    ignore_numbers: bool,
    ignore_hex: bool,
 }

@ -95,12 +121,12 @@ impl Parser {
    }

    fn accept(&self, contents: &[u8]) -> bool {
-        if is_number(contents) {
+        if self.ignore_numbers && is_number(contents) {
            return false;
-        };
+        }

-        if self.ignore_hex {
-            return !is_hex(contents);
+        if self.ignore_hex && is_hex(contents) {
+            return false;
        }

        true
@ -455,7 +481,10 @@ mod test {

    #[test]
    fn tokenize_ignore_hex_disabled() {
-        let parser = ParserBuilder::new().ignore_hex(false).build();
+        let parser = ParserBuilder::new()
+            .ignore_hex(false)
+            .leading_digits(true)
+            .build();

        let input = "Hello 0xDEADBEEF World";
        let expected: Vec<Identifier> = vec![