From f15cc58f719ece195f7b24d5d24190f7c14ffa01 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 27 Apr 2021 10:17:00 -0500 Subject: [PATCH] fix(parser): Flip leading digits to work correctly --- crates/typos/src/tokens.rs | 64 ++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 26ef2cc..0c80410 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -53,10 +53,8 @@ impl TokenizerBuilder { Tokenizer { words_str, - // `leading_digits` let's us bypass the regexes since you can't have a decimal or - // hexadecimal number without a leading digit. - ignore_numbers: self.leading_digits, - ignore_hex: self.ignore_hex && self.leading_digits, + leading_digits: self.leading_digits, + ignore_hex: self.ignore_hex, } } @@ -89,7 +87,7 @@ impl Default for TokenizerBuilder { #[derive(Debug, Clone)] pub struct Tokenizer { words_str: regex::Regex, - ignore_numbers: bool, + leading_digits: bool, ignore_hex: bool, } @@ -115,11 +113,16 @@ impl Tokenizer { } fn accept(&self, contents: &str) -> bool { - if self.ignore_numbers && is_number(contents.as_bytes()) { - return false; - } + debug_assert!(!contents.is_empty()); + if self.leading_digits { + if is_number(contents.as_bytes()) { + return false; + } - if self.ignore_hex && is_hex(contents.as_bytes()) { + if self.ignore_hex && is_hex(contents.as_bytes()) { + return false; + } + } else if is_digit(contents.as_bytes()[0]) { return false; } @@ -543,7 +546,10 @@ mod test { #[test] fn tokenize_ignore_hex_enabled() { - let parser = TokenizerBuilder::new().ignore_hex(true).build(); + let parser = TokenizerBuilder::new() + .ignore_hex(true) + .leading_digits(true) + .build(); let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ @@ -575,6 +581,44 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_leading_digits_enabled() { + let parser = TokenizerBuilder::new() + .ignore_hex(false) + .leading_digits(true) + .build(); + + let input = "Hello 0Hello 124 0xDEADBEEF World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("0Hello", Case::None, 6), + Identifier::new_unchecked("0xDEADBEEF", Case::None, 17), + Identifier::new_unchecked("World", Case::None, 28), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn tokenize_leading_digits_disabled() { + let parser = TokenizerBuilder::new() + .ignore_hex(false) + .leading_digits(false) + .build(); + + let input = "Hello 0Hello 124 0xDEADBEEF World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 28), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn split_ident() { let cases = [