diff --git a/benches/file.rs b/benches/file.rs
index ab9e21b..b35831e 100644
--- a/benches/file.rs
+++ b/benches/file.rs
@@ -52,32 +52,32 @@ fn bench_split_lines(data: &str, b: &mut test::Bencher) {
 }
 
 #[bench]
-fn parse_words_lines_empty(b: &mut test::Bencher) {
+fn parse_lines_empty(b: &mut test::Bencher) {
     bench_split_lines(data::EMPTY, b);
 }
 
 #[bench]
-fn parse_words_lines_no_tokens(b: &mut test::Bencher) {
+fn parse_lines_no_tokens(b: &mut test::Bencher) {
     bench_split_lines(data::NO_TOKENS, b);
 }
 
 #[bench]
-fn parse_words_lines_single_token(b: &mut test::Bencher) {
+fn parse_lines_single_token(b: &mut test::Bencher) {
     bench_split_lines(data::SINGLE_TOKEN, b);
 }
 
 #[bench]
-fn parse_words_lines_sherlock(b: &mut test::Bencher) {
+fn parse_lines_sherlock(b: &mut test::Bencher) {
     bench_split_lines(data::SHERLOCK, b);
 }
 
 #[bench]
-fn parse_words_lines_code(b: &mut test::Bencher) {
+fn parse_lines_code(b: &mut test::Bencher) {
     bench_split_lines(data::CODE, b);
 }
 
 #[bench]
-fn parse_words_lines_corpus(b: &mut test::Bencher) {
+fn parse_lines_corpus(b: &mut test::Bencher) {
     bench_split_lines(data::CORPUS, b);
 }
 
diff --git a/src/config.rs b/src/config.rs
index 2c73aba..9880df0 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -53,22 +53,32 @@ pub trait FileSource {
         None
     }
 
-    /// Verifying spelling in filess.
+    /// Verifying spelling in files.
     fn check_file(&self) -> Option {
         None
     }
 
-    /// Do not check identifiers that appear to be hexadecimal values
+    /// Do not check identifiers that appear to be hexadecimal values.
     fn ignore_hex(&self) -> Option {
         None
     }
 
-    /// Allow identifiers to include digits, in addition to letters
+    /// Allow identifiers to start with digits, in addition to letters.
+    fn identifier_leading_digits(&self) -> Option {
+        None
+    }
+
+    /// Allow identifiers to start with one of these characters.
+    fn identifier_leading_chars(&self) -> Option<&str> {
+        None
+    }
+
+    /// Allow identifiers to include digits, in addition to letters.
     fn identifier_include_digits(&self) -> Option {
         None
     }
 
-    /// Specify additional characters to be included in identifiers
+    /// Allow identifiers to include these characters.
     fn identifier_include_chars(&self) -> Option<&str> {
         None
     }
@@ -233,6 +243,8 @@ pub struct FileConfig {
     pub check_filename: Option,
     pub check_file: Option,
     pub ignore_hex: Option,
+    pub identifier_leading_digits: Option,
+    pub identifier_leading_chars: Option,
     pub identifier_include_digits: Option,
     pub identifier_include_chars: Option,
 }
@@ -248,6 +260,12 @@ impl FileConfig {
         if let Some(source) = source.ignore_hex() {
             self.ignore_hex = Some(source);
         }
+        if let Some(source) = source.identifier_leading_digits() {
+            self.identifier_leading_digits = Some(source);
+        }
+        if let Some(source) = source.identifier_leading_chars() {
+            self.identifier_leading_chars = Some(source.to_owned());
+        }
         if let Some(source) = source.identifier_include_digits() {
             self.identifier_include_digits = Some(source);
         }
@@ -268,6 +286,17 @@ impl FileConfig {
         self.ignore_hex.unwrap_or(true)
     }
 
+    pub fn identifier_leading_digits(&self) -> bool {
+        self.identifier_leading_digits.unwrap_or(false)
+    }
+
+    pub fn identifier_leading_chars(&self) -> &str {
+        self.identifier_leading_chars
+            .as_ref()
+            .map(|s| s.as_str())
+            .unwrap_or("_")
+    }
+
     pub fn identifier_include_digits(&self) -> bool {
         self.identifier_include_digits.unwrap_or(true)
     }
@@ -293,6 +322,14 @@ impl FileSource for FileConfig {
         self.ignore_hex
     }
 
+    fn identifier_leading_digits(&self) -> Option {
+        self.identifier_leading_digits
+    }
+
+    fn identifier_leading_chars(&self) -> Option<&str> {
+        self.identifier_leading_chars.as_ref().map(|s| s.as_str())
+    }
+
     fn identifier_include_digits(&self) -> Option {
         self.identifier_include_digits
     }
diff --git a/src/main.rs b/src/main.rs
index 86fdf9a..d942aa4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -412,6 +412,8 @@ fn run() -> Result {
 
     let parser = typos::tokens::ParserBuilder::new()
         .ignore_hex(config.default.ignore_hex())
+        .leading_digits(config.default.identifier_leading_digits())
+        .leading_chars(config.default.identifier_leading_chars().to_owned())
         .include_digits(config.default.identifier_include_digits())
         .include_chars(config.default.identifier_include_chars().to_owned())
         .build();
diff --git a/typos/src/tokens.rs b/typos/src/tokens.rs
index 883f1a5..91b6784 100644
--- a/typos/src/tokens.rs
+++ b/typos/src/tokens.rs
@@ -9,6 +9,8 @@ pub enum Case {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParserBuilder {
     ignore_hex: bool,
+    leading_digits: bool,
+    leading_chars: String,
     include_digits: bool,
     include_chars: String,
 }
@@ -23,6 +25,16 @@ impl ParserBuilder {
         self
     }
 
+    pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
+        self.leading_digits = yes;
+        self
+    }
+
+    pub fn leading_chars(&mut self, chars: String) -> &mut Self {
+        self.leading_chars = chars;
+        self
+    }
+
     pub fn include_digits(&mut self, yes: bool) -> &mut Self {
         self.include_digits = yes;
         self
@@ -34,31 +46,44 @@ impl ParserBuilder {
     }
 
     pub fn build(&self) -> Parser {
-        let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
-        if self.include_digits {
-            pattern.push_str(r#"|\d"#);
-        }
-        for grapheme in
-            unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
-        {
-            let escaped = regex::escape(&grapheme);
-            pattern.push_str(&format!("|{}", escaped));
-        }
-        pattern.push_str(r#")+\b"#);
+        let mut pattern = r#"\b("#.to_owned();
+        Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
+        Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
+        pattern.push_str(r#"*)\b"#);
+
         let words_str = regex::Regex::new(&pattern).unwrap();
         let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
+
         Parser {
             words_str,
             words_bytes,
-            ignore_hex: self.ignore_hex && self.include_digits,
+            // `leading_digits` lets us bypass the regexes since you can't have a decimal or
+            // hexadecimal number without a leading digit.
+            ignore_numbers: self.leading_digits,
+            ignore_hex: self.ignore_hex && self.leading_digits,
         }
     }
+
+    /// Push a group matching an alphabetic char, a digit (if `digits`), or a grapheme of `chars`.
+    fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
+        pattern.push_str(r#"(\p{Alphabetic}"#);
+        if digits {
+            pattern.push_str(r#"|\d"#);
+        }
+        for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
+            let escaped = regex::escape(&grapheme);
+            pattern.push_str(&format!("|{}", escaped));
+        }
+        pattern.push_str(r#")"#);
+    }
 }
 
 impl Default for ParserBuilder {
     fn default() -> Self {
         Self {
             ignore_hex: true,
+            leading_digits: false,
+            leading_chars: "_".to_owned(),
             include_digits: true,
             include_chars: "_'".to_owned(),
         }
@@ -69,6 +94,7 @@ impl Default for ParserBuilder {
 pub struct Parser {
     words_str: regex::Regex,
     words_bytes: regex::bytes::Regex,
+    ignore_numbers: bool,
     ignore_hex: bool,
 }
 
@@ -95,12 +121,12 @@ impl Parser {
     }
 
     fn accept(&self, contents: &[u8]) -> bool {
-        if is_number(contents) {
+        if self.ignore_numbers && is_number(contents) {
             return false;
-        };
+        }
 
-        if self.ignore_hex {
-            return !is_hex(contents);
+        if self.ignore_hex && is_hex(contents) {
+            return false;
         }
 
         true
@@ -455,7 +481,10 @@ mod test {
 
     #[test]
     fn tokenize_ignore_hex_disabled() {
-        let parser = ParserBuilder::new().ignore_hex(false).build();
+        let parser = ParserBuilder::new()
+            .ignore_hex(false)
+            .leading_digits(true)
+            .build();
 
         let input = "Hello 0xDEADBEEF World";
         let expected: Vec = vec![