From 32f5e6c682d0668981fa8ab3c07caad559daf0c3 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 10:40:58 -0500 Subject: [PATCH] refactor(typos)!: Bake ignores into parser This is prep for other items to be ignored BREAKING CHANGE: `TokenizerBuilder` no longer takes config for ignoring tokens. Related, we now ignore token-ignore config flags. --- crates/typos/src/tokens.rs | 311 +++++++++++++++++-------------------- docs/reference.md | 2 - src/policy.rs | 9 +- 3 files changed, 150 insertions(+), 172 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 0d8f7a2..941f95d 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -4,8 +4,6 @@ use bstr::ByteSlice; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TokenizerBuilder { unicode: bool, - ignore_hex: bool, - leading_digits: bool, } impl TokenizerBuilder { @@ -19,39 +17,15 @@ impl TokenizerBuilder { self } - /// Specify that hexadecimal numbers should be ignored. - pub fn ignore_hex(&mut self, yes: bool) -> &mut Self { - self.ignore_hex = yes; - self - } - - /// Specify that leading digits are allowed for Identifiers. 
- pub fn leading_digits(&mut self, yes: bool) -> &mut Self { - self.leading_digits = yes; - self - } - pub fn build(&self) -> Tokenizer { - let TokenizerBuilder { - unicode, - leading_digits, - ignore_hex, - } = self.clone(); - Tokenizer { - unicode, - leading_digits, - ignore_hex, - } + let TokenizerBuilder { unicode } = self.clone(); + Tokenizer { unicode } } } impl Default for TokenizerBuilder { fn default() -> Self { - Self { - unicode: true, - leading_digits: false, - ignore_hex: true, - } + Self { unicode: true } } } @@ -59,8 +33,6 @@ impl Default for TokenizerBuilder { #[derive(Debug, Clone)] pub struct Tokenizer { unicode: bool, - leading_digits: bool, - ignore_hex: bool, } impl Tokenizer { @@ -70,9 +42,9 @@ impl Tokenizer { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator> { let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) { - itertools::Either::Left(unicode_parser::iter_literals(content)) + itertools::Either::Left(unicode_parser::iter_identifiers(content)) } else { - itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) + itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes())) }; iter.filter_map(move |identifier| { let offset = offset(content.as_bytes(), identifier.as_bytes()); @@ -82,10 +54,11 @@ impl Tokenizer { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator> { let iter = if self.unicode && !ByteSlice::is_ascii(content) { - let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); + let iter = + Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c)); itertools::Either::Left(iter) } else { - itertools::Either::Right(ascii_parser::iter_literals(content)) + itertools::Either::Right(ascii_parser::iter_identifiers(content)) }; iter.filter_map(move |identifier| { let offset = offset(content, identifier.as_bytes()); @@ -95,17 +68,6 @@ impl Tokenizer { fn transform<'i>(&self, identifier: &'i str, offset: 
usize) -> Option> { debug_assert!(!identifier.is_empty()); - if self.leading_digits { - if is_number(identifier.as_bytes()) { - return None; - } - - if self.ignore_hex && is_hex(identifier.as_bytes()) { - return None; - } - } else if is_digit(identifier.as_bytes()[0]) { - return None; - } let case = Case::None; Some(Identifier::new_unchecked(identifier, case, offset)) @@ -164,98 +126,155 @@ impl<'s> Iterator for Utf8Chunks<'s> { } } -fn is_number(ident: &[u8]) -> bool { - ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b)) -} - -fn is_hex(ident: &[u8]) -> bool { - if ident.len() < 3 { - false - } else { - ident[0] == b'0' - && ident[1] == b'x' - && ident[2..] - .iter() - .all(|b| is_hex_digit(*b) || is_digit_sep(*b)) - } -} - -#[inline] -fn is_digit(chr: u8) -> bool { - chr.is_ascii_digit() -} - -#[inline] -fn is_digit_sep(chr: u8) -> bool { - // `_`: number literal separator in Rust and other languages - // `'`: number literal separator in C++ - chr == b'_' || chr == b'\'' -} - -#[inline] -fn is_hex_digit(chr: u8) -> bool { - chr.is_ascii_hexdigit() -} - mod parser { + use nom::branch::*; use nom::bytes::complete::*; + use nom::character::complete::*; use nom::sequence::*; - use nom::IResult; + use nom::{AsChar, IResult}; - pub(crate) trait AsChar: nom::AsChar { - #[allow(clippy::wrong_self_convention)] - fn is_xid_continue(self) -> bool; - } - - impl AsChar for u8 { - fn is_xid_continue(self) -> bool { - (b'a'..=b'z').contains(&self) - || (b'A'..=b'Z').contains(&self) - || (b'0'..=b'9').contains(&self) - || self == b'_' - } - } - - impl AsChar for char { - fn is_xid_continue(self) -> bool { - unicode_xid::UnicodeXID::is_xid_continue(self) - } - } - - pub(crate) fn next_literal(input: T) -> IResult + pub(crate) fn next_identifier(input: T) -> IResult where - T: nom::InputTakeAtPosition, - ::Item: AsChar, + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + nom::Offset + + Clone + + PartialEq + + 
std::fmt::Debug, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, { - preceded(literal_sep, identifier)(input) - } - - fn literal_sep(input: T) -> IResult - where - T: nom::InputTakeAtPosition, - ::Item: AsChar, - { - take_till(AsChar::is_xid_continue)(input) + preceded(ignore, identifier)(input) } fn identifier(input: T) -> IResult where T: nom::InputTakeAtPosition, - ::Item: AsChar, + ::Item: AsChar + Copy, { // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd // or unexpected cases than strip off start characters to a word since we aren't doing a // proper word boundary parse - take_while1(AsChar::is_xid_continue)(input) + take_while1(is_xid_continue)(input) + } + + fn ignore(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + nom::Offset + + Clone + + PartialEq + + std::fmt::Debug, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + take_many0(alt(( + sep1, + terminated(hex_literal, sep1), + terminated(dec_literal, sep1), + )))(input) + } + + fn sep1(input: T) -> IResult + where + T: nom::InputTakeAtPosition, + ::Item: AsChar + Copy, + { + take_till1(is_xid_continue)(input) + } + + fn dec_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition, + ::Item: AsChar + Copy, + { + take_while1(is_dec_digit)(input) + } + + fn hex_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + preceded( + pair(char('0'), alt((char('x'), char('X')))), + take_while1(is_hex_digit), + )(input) + } + + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult + where + I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, + F: nom::Parser, + E: nom::error::ParseError, + { + move |i: I| { + let mut 
current = i.clone(); + loop { + match f.parse(current.clone()) { + Err(nom::Err::Error(_)) => { + let offset = i.offset(&current); + let (after, before) = i.take_split(offset); + return Ok((after, before)); + } + Err(e) => { + return Err(e); + } + Ok((next, _)) => { + if next == current { + return Err(nom::Err::Error(E::from_error_kind( + i, + nom::error::ErrorKind::Many0, + ))); + } + + current = next; + } + } + } + } + } + + fn is_dec_digit(i: impl AsChar + Copy) -> bool { + i.is_dec_digit() || is_digit_sep(i.as_char()) + } + + fn is_hex_digit(i: impl AsChar + Copy) -> bool { + i.is_hex_digit() || is_digit_sep(i.as_char()) + } + + fn is_xid_continue(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + unicode_xid::UnicodeXID::is_xid_continue(c) + } + + #[inline] + fn is_digit_sep(chr: char) -> bool { + // `_`: number literal separator in Rust and other languages + // `'`: number literal separator in C++ + chr == '_' || chr == '\'' + } } mod unicode_parser { - use super::parser::next_literal; + use super::parser::next_identifier; - pub(crate) fn iter_literals(mut input: &str) -> impl Iterator { - std::iter::from_fn(move || match next_literal(input) { + pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator { + std::iter::from_fn(move || match next_identifier(input) { Ok((i, o)) => { input = i; debug_assert_ne!(o, ""); @@ -267,10 +286,10 @@ mod unicode_parser { } mod ascii_parser { - use super::parser::next_literal; + use super::parser::next_identifier; - pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator { - std::iter::from_fn(move || match next_literal(input) { + pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator { + std::iter::from_fn(move || match next_identifier(input) { Ok((i, o)) => { input = i; debug_assert_ne!(o, b""); @@ -613,11 +632,8 @@ mod test { } #[test] - fn tokenize_ignore_hex_enabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(true) - .leading_digits(true) - .build(); + fn tokenize_ignore_hex() { + 
let parser = TokenizerBuilder::new().build(); let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ @@ -631,54 +647,13 @@ mod test { } #[test] - fn tokenize_ignore_hex_disabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(true) - .build(); - - let input = "Hello 0xDEADBEEF World"; - let expected: Vec = vec![ - Identifier::new_unchecked("Hello", Case::None, 0), - Identifier::new_unchecked("0xDEADBEEF", Case::None, 6), - Identifier::new_unchecked("World", Case::None, 17), - ]; - let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); - assert_eq!(expected, actual); - let actual: Vec<_> = parser.parse_str(input).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn tokenize_leading_digits_enabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(true) - .build(); + fn tokenize_leading_digits() { + let parser = TokenizerBuilder::new().build(); let input = "Hello 0Hello 124 0xDEADBEEF World"; let expected: Vec = vec![ Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("0Hello", Case::None, 6), - Identifier::new_unchecked("0xDEADBEEF", Case::None, 17), - Identifier::new_unchecked("World", Case::None, 28), - ]; - let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); - assert_eq!(expected, actual); - let actual: Vec<_> = parser.parse_str(input).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn tokenize_leading_digits_disabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(false) - .build(); - - let input = "Hello 0Hello 124 0xDEADBEEF World"; - let expected: Vec = vec![ - Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("World", Case::None, 28), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); diff --git a/docs/reference.md b/docs/reference.md index 13e1625..67247bb 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -26,8 +26,6 
@@ Configuration is read from the following (in precedence order) | default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-file | \- | bool | Verifying spelling in files. | | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | -| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. | -| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | diff --git a/src/policy.rs b/src/policy.rs index 86233d7..ede2fa6 100644 --- a/src/policy.rs +++ b/src/policy.rs @@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> { tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); + if !tokenizer_config.ignore_hex() { + log::warn!("`ignore-hex` is deprecated"); + if !tokenizer_config.identifier_leading_digits() { + log::warn!("`identifier-leading-digits` is deprecated"); + } + } + let tokenizer = typos::tokens::TokenizerBuilder::new() .unicode(tokenizer_config.unicode()) - .ignore_hex(tokenizer_config.ignore_hex()) - .leading_digits(tokenizer_config.identifier_leading_digits()) .build(); let dict = crate::dict::BuiltIn::new(dict_config.locale());