refactor(typos)!: Bake ignores into parser

This is prep for other items to be ignored.

BREAKING CHANGE: `TokenizerBuilder` no longer takes config for ignoring tokens. Relatedly, the token-ignore config flags are now ignored.
2024-11-28 20:11:05 -05:00 · 2021-06-29 10:40:58 -05:00 · 2021-06-29 10:40:58 -05:00 · 32f5e6c682
commit 32f5e6c682
parent a46cc76bae
3 changed files with 150 additions and 172 deletions
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@ -4,8 +4,6 @@ use bstr::ByteSlice;
 /// Collects the settings that `build` copies into a [`Tokenizer`].
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TokenizerBuilder {
    /// Forwarded unchanged to the `unicode` field of the built `Tokenizer`.
    unicode: bool,
    /// Whether hexadecimal numbers should be ignored (set via `ignore_hex`).
    ignore_hex: bool,
    /// Whether identifiers may start with digits (set via `leading_digits`).
    leading_digits: bool,
 }
 impl TokenizerBuilder {
@ -19,39 +17,15 @@ impl TokenizerBuilder {
        self
    }
    /// Specify that hexadecimal numbers should be ignored.
    ///
    /// Stores `yes` on the builder and returns the builder again so that
    /// further setters can be chained onto the call.
    pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
        self.ignore_hex = yes;
        self
    }
    /// Specify that leading digits are allowed for Identifiers.
    ///
    /// Stores `yes` on the builder and returns the builder again so that
    /// further setters can be chained onto the call.
    pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
        self.leading_digits = yes;
        self
    }
    pub fn build(&self) -> Tokenizer {
-        let TokenizerBuilder {
+        let TokenizerBuilder { unicode } = self.clone();
-            unicode,
+        Tokenizer { unicode }
            leading_digits,
            ignore_hex,
        } = self.clone();
        Tokenizer {
            unicode,
            leading_digits,
            ignore_hex,
        }
    }
 }
 impl Default for TokenizerBuilder {
    fn default() -> Self {
-        Self {
+        Self { unicode: true }
            unicode: true,
            leading_digits: false,
            ignore_hex: true,
        }
    }
 }
@ -59,8 +33,6 @@ impl Default for TokenizerBuilder {
 /// Splits content into `Identifier`s through `parse_str` and `parse_bytes`.
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
    /// When set, content that is not pure ASCII is handed to `unicode_parser`;
    /// in every other case `ascii_parser` is used.
    unicode: bool,
    /// When set, `transform` keeps identifiers that begin with a digit unless
    /// they are plain numbers (or hex, see `ignore_hex`); when unset, any
    /// identifier whose first byte is a digit is dropped.
    leading_digits: bool,
    /// Only consulted when `leading_digits` is set: drop identifiers that
    /// `is_hex` recognises as hexadecimal.
    ignore_hex: bool,
 }
 impl Tokenizer {
@ -70,9 +42,9 @@ impl Tokenizer {
    pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
        let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
-            itertools::Either::Left(unicode_parser::iter_literals(content))
+            itertools::Either::Left(unicode_parser::iter_identifiers(content))
        } else {
-            itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
+            itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes()))
        };
        iter.filter_map(move |identifier| {
            let offset = offset(content.as_bytes(), identifier.as_bytes());
@ -82,10 +54,11 @@ impl Tokenizer {
    pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
        let iter = if self.unicode && !ByteSlice::is_ascii(content) {
-            let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
+            let iter =
                Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c));
            itertools::Either::Left(iter)
        } else {
-            itertools::Either::Right(ascii_parser::iter_literals(content))
+            itertools::Either::Right(ascii_parser::iter_identifiers(content))
        };
        iter.filter_map(move |identifier| {
            let offset = offset(content, identifier.as_bytes());
@ -95,17 +68,6 @@ impl Tokenizer {
    fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
        debug_assert!(!identifier.is_empty());
        if self.leading_digits {
            if is_number(identifier.as_bytes()) {
                return None;
            }
            if self.ignore_hex && is_hex(identifier.as_bytes()) {
                return None;
            }
        } else if is_digit(identifier.as_bytes()[0]) {
            return None;
        }
        let case = Case::None;
        Some(Identifier::new_unchecked(identifier, case, offset))
@ -164,98 +126,155 @@ impl<'s> Iterator for Utf8Chunks<'s> {
    }
 }
 /// Reports whether every byte of `ident` is a digit or a digit separator.
 ///
 /// An empty slice has no offending byte and therefore yields `true`.
 fn is_number(ident: &[u8]) -> bool {
    for &byte in ident {
        if !(is_digit(byte) || is_digit_sep(byte)) {
            return false;
        }
    }
    true
 }
 /// Reports whether `ident` looks like a `0x`-prefixed hexadecimal literal.
 ///
 /// The prefix must be exactly `0x` (lowercase `x`) and be followed by at
 /// least one byte; every byte after the prefix has to be a hex digit or a
 /// digit separator.
 fn is_hex(ident: &[u8]) -> bool {
    match ident {
        [b'0', b'x', rest @ ..] if !rest.is_empty() => rest
            .iter()
            .copied()
            .all(|byte| is_hex_digit(byte) || is_digit_sep(byte)),
        _ => false,
    }
 }
 /// Reports whether `chr` is an ASCII decimal digit (`0` through `9`).
 #[inline]
 fn is_digit(chr: u8) -> bool {
    matches!(chr, b'0'..=b'9')
 }
 /// Reports whether `chr` is a separator allowed inside number literals.
 ///
 /// Two separators are recognised:
 /// - `_`, used by Rust and other languages
 /// - `'`, used by C++
 #[inline]
 fn is_digit_sep(chr: u8) -> bool {
    matches!(chr, b'_' | b'\'')
 }
 /// Reports whether `chr` is an ASCII hexadecimal digit
 /// (`0`-`9`, `a`-`f` or `A`-`F`).
 #[inline]
 fn is_hex_digit(chr: u8) -> bool {
    matches!(chr, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
 }
 mod parser {
    use nom::branch::*;
    use nom::bytes::complete::*;
    use nom::character::complete::*;
    use nom::sequence::*;
-    use nom::IResult;
+    use nom::{AsChar, IResult};
-    pub(crate) trait AsChar: nom::AsChar {
+    pub(crate) fn next_identifier<T>(input: T) -> IResult<T, T>
        #[allow(clippy::wrong_self_convention)]
        fn is_xid_continue(self) -> bool;
    }
    impl AsChar for u8 {
        fn is_xid_continue(self) -> bool {
            (b'a'..=b'z').contains(&self)
                || (b'A'..=b'Z').contains(&self)
                || (b'0'..=b'9').contains(&self)
                || self == b'_'
        }
    }
    impl AsChar for char {
        fn is_xid_continue(self) -> bool {
            unicode_xid::UnicodeXID::is_xid_continue(self)
        }
    }
    pub(crate) fn next_literal<T>(input: T) -> IResult<T, T>
    where
-        T: nom::InputTakeAtPosition,
+        T: nom::InputTakeAtPosition
-        <T as nom::InputTakeAtPosition>::Item: AsChar,
+            + nom::InputTake
            + nom::InputIter
            + nom::InputLength
            + nom::Slice<std::ops::RangeFrom<usize>>
            + nom::Offset
            + Clone
            + PartialEq
            + std::fmt::Debug,
        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
        <T as nom::InputIter>::Item: AsChar + Copy,
    {
-        preceded(literal_sep, identifier)(input)
+        preceded(ignore, identifier)(input)
    }
    /// Matches the (possibly empty) run of leading items that are not
    /// XID_Continue, i.e. everything in front of the next identifier character.
    fn literal_sep<T>(input: T) -> IResult<T, T>
    where
        T: nom::InputTakeAtPosition,
        <T as nom::InputTakeAtPosition>::Item: AsChar,
    {
        // `take_till` stops at the first item for which the predicate holds.
        take_till(AsChar::is_xid_continue)(input)
    }
    fn identifier<T>(input: T) -> IResult<T, T>
    where
        T: nom::InputTakeAtPosition,
-        <T as nom::InputTakeAtPosition>::Item: AsChar,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
    {
        // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
        // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
        // or unexpected cases than strip off start characters to a word since we aren't doing a
        // proper word boundary parse
-        take_while1(AsChar::is_xid_continue)(input)
+        take_while1(is_xid_continue)(input)
    }
    /// Consumes the leading part of `input` that should not be reported as an
    /// identifier.
    ///
    /// Repeatedly matches, via `take_many0`, one of:
    /// - a run of separator items (`sep1`),
    /// - a hex literal followed by a separator,
    /// - a decimal literal followed by a separator.
    fn ignore<T>(input: T) -> IResult<T, T>
    where
        T: nom::InputTakeAtPosition
            + nom::InputTake
            + nom::InputIter
            + nom::InputLength
            + nom::Slice<std::ops::RangeFrom<usize>>
            + nom::Offset
            + Clone
            + PartialEq
            + std::fmt::Debug,
        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
        <T as nom::InputIter>::Item: AsChar + Copy,
    {
        // Literals are wrapped in `terminated(.., sep1)`, so they are only
        // consumed when at least one separator item follows them.
        take_many0(alt((
            sep1,
            terminated(hex_literal, sep1),
            terminated(dec_literal, sep1),
        )))(input)
    }
    /// Matches one or more leading items that are not XID_Continue
    /// (see `is_xid_continue`); fails when there is no such item.
    fn sep1<T>(input: T) -> IResult<T, T>
    where
        T: nom::InputTakeAtPosition,
        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
    {
        take_till1(is_xid_continue)(input)
    }
    /// Matches one or more leading items accepted by `is_dec_digit`, i.e.
    /// decimal digits and digit separators.
    fn dec_literal<T>(input: T) -> IResult<T, T>
    where
        T: nom::InputTakeAtPosition,
        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
    {
        take_while1(is_dec_digit)(input)
    }
    /// Matches a hexadecimal literal: a `0x` or `0X` prefix followed by one or
    /// more items accepted by `is_hex_digit` (hex digits and digit separators).
    ///
    /// Because of `preceded`, the matched output is the part after the prefix.
    fn hex_literal<T>(input: T) -> IResult<T, T>
    where
        T: nom::InputTakeAtPosition
            + nom::InputTake
            + nom::InputIter
            + nom::InputLength
            + nom::Slice<std::ops::RangeFrom<usize>>
            + Clone,
        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
        <T as nom::InputIter>::Item: AsChar + Copy,
    {
        preceded(
            // Prefix: `0` followed by a lowercase or uppercase `x`.
            pair(char('0'), alt((char('x'), char('X')))),
            take_while1(is_hex_digit),
        )(input)
    }
    /// Builds a parser that applies `f` zero or more times and yields all the
    /// input consumed by those applications as a single slice.
    ///
    /// The returned parser:
    /// - stops at the first recoverable `nom::Err::Error` from `f` and returns
    ///   `(remaining, consumed)`, where `consumed` may be empty;
    /// - passes any other error from `f` through unchanged;
    /// - fails with `ErrorKind::Many0` if `f` succeeds without consuming
    ///   anything, since looping on would never terminate.
    fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
    where
        I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
        F: nom::Parser<I, I, E>,
        E: nom::error::ParseError<I>,
    {
        move |i: I| {
            // `current` tracks how far the repeated applications of `f` got.
            let mut current = i.clone();
            loop {
                match f.parse(current.clone()) {
                    Err(nom::Err::Error(_)) => {
                        // `f` no longer matches: split the original input at
                        // the position reached so far.
                        let offset = i.offset(&current);
                        let (after, before) = i.take_split(offset);
                        return Ok((after, before));
                    }
                    Err(e) => {
                        return Err(e);
                    }
                    Ok((next, _)) => {
                        // No progress was made; bail out instead of spinning.
                        if next == current {
                            return Err(nom::Err::Error(E::from_error_kind(
                                i,
                                nom::error::ErrorKind::Many0,
                            )));
                        }
                        current = next;
                    }
                }
            }
        }
    }
    /// Reports whether `i` may appear inside a decimal literal: either a
    /// decimal digit or a digit separator.
    fn is_dec_digit(i: impl AsChar + Copy) -> bool {
        if i.is_dec_digit() {
            return true;
        }
        is_digit_sep(i.as_char())
    }
    /// Reports whether `i` may appear inside a hexadecimal literal: either a
    /// hex digit or a digit separator.
    fn is_hex_digit(i: impl AsChar + Copy) -> bool {
        if i.is_hex_digit() {
            return true;
        }
        is_digit_sep(i.as_char())
    }
    /// Reports whether `i`, converted to a `char`, has the Unicode
    /// XID_Continue property.
    fn is_xid_continue(i: impl AsChar + Copy) -> bool {
        unicode_xid::UnicodeXID::is_xid_continue(i.as_char())
    }
    /// Reports whether `chr` is a separator allowed inside number literals.
    ///
    /// Two separators are recognised:
    /// - `_`, used by Rust and other languages
    /// - `'`, used by C++
    #[inline]
    fn is_digit_sep(chr: char) -> bool {
        matches!(chr, '_' | '\'')
    }
 }
 mod unicode_parser {
-    use super::parser::next_literal;
+    use super::parser::next_identifier;
-    pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
+    pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator<Item = &str> {
-        std::iter::from_fn(move || match next_literal(input) {
+        std::iter::from_fn(move || match next_identifier(input) {
            Ok((i, o)) => {
                input = i;
                debug_assert_ne!(o, "");
@ -267,10 +286,10 @@ mod unicode_parser {
 }
 mod ascii_parser {
-    use super::parser::next_literal;
+    use super::parser::next_identifier;
-    pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
+    pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator<Item = &str> {
-        std::iter::from_fn(move || match next_literal(input) {
+        std::iter::from_fn(move || match next_identifier(input) {
            Ok((i, o)) => {
                input = i;
                debug_assert_ne!(o, b"");
@ -613,11 +632,8 @@ mod test {
    }
    #[test]
-    fn tokenize_ignore_hex_enabled() {
+    fn tokenize_ignore_hex() {
-        let parser = TokenizerBuilder::new()
+        let parser = TokenizerBuilder::new().build();
            .ignore_hex(true)
            .leading_digits(true)
            .build();
        let input = "Hello 0xDEADBEEF World";
        let expected: Vec<Identifier> = vec![
@ -631,54 +647,13 @@ mod test {
    }
    #[test]
-    fn tokenize_ignore_hex_disabled() {
+    fn tokenize_leading_digits() {
-        let parser = TokenizerBuilder::new()
+        let parser = TokenizerBuilder::new().build();
            .ignore_hex(false)
            .leading_digits(true)
            .build();
        let input = "Hello 0xDEADBEEF World";
        let expected: Vec<Identifier> = vec![
            Identifier::new_unchecked("Hello", Case::None, 0),
            Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
            Identifier::new_unchecked("World", Case::None, 17),
        ];
        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = parser.parse_str(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_leading_digits_enabled() {
        let parser = TokenizerBuilder::new()
            .ignore_hex(false)
            .leading_digits(true)
            .build();
        let input = "Hello 0Hello 124 0xDEADBEEF World";
        let expected: Vec<Identifier> = vec![
            Identifier::new_unchecked("Hello", Case::None, 0),
            Identifier::new_unchecked("0Hello", Case::None, 6),
            Identifier::new_unchecked("0xDEADBEEF", Case::None, 17),
            Identifier::new_unchecked("World", Case::None, 28),
        ];
        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = parser.parse_str(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_leading_digits_disabled() {
        let parser = TokenizerBuilder::new()
            .ignore_hex(false)
            .leading_digits(false)
            .build();
        let input = "Hello 0Hello 124 0xDEADBEEF World";
        let expected: Vec<Identifier> = vec![
            Identifier::new_unchecked("Hello", Case::None, 0),
            Identifier::new_unchecked("World", Case::None, 28),
        ];
        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
--- a/docs/reference.md
+++ b/docs/reference.md
@ -26,8 +26,6 @@ Configuration is read from the following (in precedence order)
 | default.check-filename | \-                | bool   | Verifying spelling in file names. |
 | default.check-file     | \-                | bool   | Verifying spelling in files. |
 | default.unicode        | --unicode         | bool   | Allow unicode characters in identifiers (and not just ASCII) |
 | default.ignore-hex     | \-                | bool   | Do not check identifiers that appear to be hexadecimal values. |
 | default.identifier-leading-digits   | \-   | bool   | Allow identifiers to start with digits, in addition to letters. |
 | default.locale         | --locale          | en, en-us, en-gb, en-ca, en-au   | English dialect to correct to. |
 | default.extend-identifiers | \-            | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
 | default.extend-words       | \-            | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
--- a/src/policy.rs
+++ b/src/policy.rs
@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> {
            tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults);
        let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
        if !tokenizer_config.ignore_hex() {
            log::warn!("`ignore-hex` is deprecated");
            if !tokenizer_config.identifier_leading_digits() {
                log::warn!("`identifier-leading-digits` is deprecated");
            }
        }
        let tokenizer = typos::tokens::TokenizerBuilder::new()
            .unicode(tokenizer_config.unicode())
            .ignore_hex(tokenizer_config.ignore_hex())
            .leading_digits(tokenizer_config.identifier_leading_digits())
            .build();
        let dict = crate::dict::BuiltIn::new(dict_config.locale());