fix(parser): Flip leading digits to work correctly

This commit is contained in:
Ed Page 2021-04-27 10:17:00 -05:00
parent 4b94352b7a
commit f15cc58f71

View file

@ -53,10 +53,8 @@ impl TokenizerBuilder {
Tokenizer { Tokenizer {
words_str, words_str,
// `leading_digits` lets us bypass the regexes since you can't have a decimal or leading_digits: self.leading_digits,
// hexadecimal number without a leading digit. ignore_hex: self.ignore_hex,
ignore_numbers: self.leading_digits,
ignore_hex: self.ignore_hex && self.leading_digits,
} }
} }
@ -89,7 +87,7 @@ impl Default for TokenizerBuilder {
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Tokenizer { pub struct Tokenizer {
words_str: regex::Regex, words_str: regex::Regex,
ignore_numbers: bool, leading_digits: bool,
ignore_hex: bool, ignore_hex: bool,
} }
@ -115,13 +113,18 @@ impl Tokenizer {
} }
fn accept(&self, contents: &str) -> bool { fn accept(&self, contents: &str) -> bool {
if self.ignore_numbers && is_number(contents.as_bytes()) { debug_assert!(!contents.is_empty());
if self.leading_digits {
if is_number(contents.as_bytes()) {
return false; return false;
} }
if self.ignore_hex && is_hex(contents.as_bytes()) { if self.ignore_hex && is_hex(contents.as_bytes()) {
return false; return false;
} }
} else if is_digit(contents.as_bytes()[0]) {
return false;
}
true true
} }
@ -543,7 +546,10 @@ mod test {
#[test] #[test]
fn tokenize_ignore_hex_enabled() { fn tokenize_ignore_hex_enabled() {
let parser = TokenizerBuilder::new().ignore_hex(true).build(); let parser = TokenizerBuilder::new()
.ignore_hex(true)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World"; let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
@ -575,6 +581,44 @@ mod test {
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test]
fn tokenize_leading_digits_enabled() {
    // With `leading_digits(true)` and `ignore_hex(false)`, this test expects
    // digit-led tokens such as `0Hello` and `0xDEADBEEF` to be yielded, while
    // the purely numeric `124` is not.
    let tokenizer = TokenizerBuilder::new()
        .ignore_hex(false)
        .leading_digits(true)
        .build();

    let text = "Hello 0Hello 124 0xDEADBEEF World";

    // (token, byte offset within `text`) for every identifier expected back.
    let want: Vec<Identifier> = [
        ("Hello", 0),
        ("0Hello", 6),
        ("0xDEADBEEF", 17),
        ("World", 28),
    ]
    .iter()
    .map(|&(token, offset)| Identifier::new_unchecked(token, Case::None, offset))
    .collect();

    // The byte-oriented and str-oriented entry points must produce the same result.
    let from_bytes: Vec<_> = tokenizer.parse_bytes(text.as_bytes()).collect();
    assert_eq!(want, from_bytes);

    let from_str: Vec<_> = tokenizer.parse_str(text).collect();
    assert_eq!(want, from_str);
}
#[test]
fn tokenize_leading_digits_disabled() {
    // With `leading_digits(false)`, this test expects every token that starts
    // with a digit (`0Hello`, `124`, `0xDEADBEEF`) to be dropped, leaving only
    // the two plain words.
    let tokenizer = TokenizerBuilder::new()
        .ignore_hex(false)
        .leading_digits(false)
        .build();

    let text = "Hello 0Hello 124 0xDEADBEEF World";

    // (token, byte offset within `text`) for every identifier expected back.
    let want: Vec<Identifier> = [("Hello", 0), ("World", 28)]
        .iter()
        .map(|&(token, offset)| Identifier::new_unchecked(token, Case::None, offset))
        .collect();

    // The byte-oriented and str-oriented entry points must produce the same result.
    let from_bytes: Vec<_> = tokenizer.parse_bytes(text.as_bytes()).collect();
    assert_eq!(want, from_bytes);

    let from_str: Vec<_> = tokenizer.parse_str(text).collect();
    assert_eq!(want, from_str);
}
#[test] #[test]
fn split_ident() { fn split_ident() {
let cases = [ let cases = [