From 32f5e6c682d0668981fa8ab3c07caad559daf0c3 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 10:40:58 -0500 Subject: [PATCH] refactor(typos)!: Bake ignores into parser This is prep for other items to be ignored BREAKING CHANGE: `TokenizerBuilder` no longer takes config for ignoring tokens. Related, we now ignore token-ignore config flags. --- crates/typos/src/tokens.rs | 311 +++++++++++++++++-------------------- docs/reference.md | 2 - src/policy.rs | 9 +- 3 files changed, 150 insertions(+), 172 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 0d8f7a2..941f95d 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -4,8 +4,6 @@ use bstr::ByteSlice; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TokenizerBuilder { unicode: bool, - ignore_hex: bool, - leading_digits: bool, } impl TokenizerBuilder { @@ -19,39 +17,15 @@ impl TokenizerBuilder { self } - /// Specify that hexadecimal numbers should be ignored. - pub fn ignore_hex(&mut self, yes: bool) -> &mut Self { - self.ignore_hex = yes; - self - } - - /// Specify that leading digits are allowed for Identifiers. 
- pub fn leading_digits(&mut self, yes: bool) -> &mut Self { - self.leading_digits = yes; - self - } - pub fn build(&self) -> Tokenizer { - let TokenizerBuilder { - unicode, - leading_digits, - ignore_hex, - } = self.clone(); - Tokenizer { - unicode, - leading_digits, - ignore_hex, - } + let TokenizerBuilder { unicode } = self.clone(); + Tokenizer { unicode } } } impl Default for TokenizerBuilder { fn default() -> Self { - Self { - unicode: true, - leading_digits: false, - ignore_hex: true, - } + Self { unicode: true } } } @@ -59,8 +33,6 @@ impl Default for TokenizerBuilder { #[derive(Debug, Clone)] pub struct Tokenizer { unicode: bool, - leading_digits: bool, - ignore_hex: bool, } impl Tokenizer { @@ -70,9 +42,9 @@ impl Tokenizer { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator> { let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) { - itertools::Either::Left(unicode_parser::iter_literals(content)) + itertools::Either::Left(unicode_parser::iter_identifiers(content)) } else { - itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) + itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes())) }; iter.filter_map(move |identifier| { let offset = offset(content.as_bytes(), identifier.as_bytes()); @@ -82,10 +54,11 @@ impl Tokenizer { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator> { let iter = if self.unicode && !ByteSlice::is_ascii(content) { - let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); + let iter = + Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c)); itertools::Either::Left(iter) } else { - itertools::Either::Right(ascii_parser::iter_literals(content)) + itertools::Either::Right(ascii_parser::iter_identifiers(content)) }; iter.filter_map(move |identifier| { let offset = offset(content, identifier.as_bytes()); @@ -95,17 +68,6 @@ impl Tokenizer { fn transform<'i>(&self, identifier: &'i str, offset: 
usize) -> Option> { debug_assert!(!identifier.is_empty()); - if self.leading_digits { - if is_number(identifier.as_bytes()) { - return None; - } - - if self.ignore_hex && is_hex(identifier.as_bytes()) { - return None; - } - } else if is_digit(identifier.as_bytes()[0]) { - return None; - } let case = Case::None; Some(Identifier::new_unchecked(identifier, case, offset)) @@ -164,98 +126,155 @@ impl<'s> Iterator for Utf8Chunks<'s> { } } -fn is_number(ident: &[u8]) -> bool { - ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b)) -} - -fn is_hex(ident: &[u8]) -> bool { - if ident.len() < 3 { - false - } else { - ident[0] == b'0' - && ident[1] == b'x' - && ident[2..] - .iter() - .all(|b| is_hex_digit(*b) || is_digit_sep(*b)) - } -} - -#[inline] -fn is_digit(chr: u8) -> bool { - chr.is_ascii_digit() -} - -#[inline] -fn is_digit_sep(chr: u8) -> bool { - // `_`: number literal separator in Rust and other languages - // `'`: number literal separator in C++ - chr == b'_' || chr == b'\'' -} - -#[inline] -fn is_hex_digit(chr: u8) -> bool { - chr.is_ascii_hexdigit() -} - mod parser { + use nom::branch::*; use nom::bytes::complete::*; + use nom::character::complete::*; use nom::sequence::*; - use nom::IResult; + use nom::{AsChar, IResult}; - pub(crate) trait AsChar: nom::AsChar { - #[allow(clippy::wrong_self_convention)] - fn is_xid_continue(self) -> bool; - } - - impl AsChar for u8 { - fn is_xid_continue(self) -> bool { - (b'a'..=b'z').contains(&self) - || (b'A'..=b'Z').contains(&self) - || (b'0'..=b'9').contains(&self) - || self == b'_' - } - } - - impl AsChar for char { - fn is_xid_continue(self) -> bool { - unicode_xid::UnicodeXID::is_xid_continue(self) - } - } - - pub(crate) fn next_literal(input: T) -> IResult + pub(crate) fn next_identifier(input: T) -> IResult where - T: nom::InputTakeAtPosition, - ::Item: AsChar, + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + nom::Offset + + Clone + + PartialEq + + 
std::fmt::Debug, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, { - preceded(literal_sep, identifier)(input) - } - - fn literal_sep(input: T) -> IResult - where - T: nom::InputTakeAtPosition, - ::Item: AsChar, - { - take_till(AsChar::is_xid_continue)(input) + preceded(ignore, identifier)(input) } fn identifier(input: T) -> IResult where T: nom::InputTakeAtPosition, - ::Item: AsChar, + ::Item: AsChar + Copy, { // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd // or unexpected cases than strip off start characters to a word since we aren't doing a // proper word boundary parse - take_while1(AsChar::is_xid_continue)(input) + take_while1(is_xid_continue)(input) + } + + fn ignore(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + nom::Offset + + Clone + + PartialEq + + std::fmt::Debug, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + take_many0(alt(( + sep1, + terminated(hex_literal, sep1), + terminated(dec_literal, sep1), + )))(input) + } + + fn sep1(input: T) -> IResult + where + T: nom::InputTakeAtPosition, + ::Item: AsChar + Copy, + { + take_till1(is_xid_continue)(input) + } + + fn dec_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition, + ::Item: AsChar + Copy, + { + take_while1(is_dec_digit)(input) + } + + fn hex_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + preceded( + pair(char('0'), alt((char('x'), char('X')))), + take_while1(is_hex_digit), + )(input) + } + + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult + where + I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, + F: nom::Parser, + E: nom::error::ParseError, + { + move |i: I| { + let mut 
current = i.clone(); + loop { + match f.parse(current.clone()) { + Err(nom::Err::Error(_)) => { + let offset = i.offset(&current); + let (after, before) = i.take_split(offset); + return Ok((after, before)); + } + Err(e) => { + return Err(e); + } + Ok((next, _)) => { + if next == current { + return Err(nom::Err::Error(E::from_error_kind( + i, + nom::error::ErrorKind::Many0, + ))); + } + + current = next; + } + } + } + } + } + + fn is_dec_digit(i: impl AsChar + Copy) -> bool { + i.is_dec_digit() || is_digit_sep(i.as_char()) + } + + fn is_hex_digit(i: impl AsChar + Copy) -> bool { + i.is_hex_digit() || is_digit_sep(i.as_char()) + } + + fn is_xid_continue(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + unicode_xid::UnicodeXID::is_xid_continue(c) + } + + #[inline] + fn is_digit_sep(chr: char) -> bool { + // `_`: number literal separator in Rust and other languages + // `'`: number literal separator in C++ + chr == '_' || chr == '\'' + } } mod unicode_parser { - use super::parser::next_literal; + use super::parser::next_identifier; - pub(crate) fn iter_literals(mut input: &str) -> impl Iterator { - std::iter::from_fn(move || match next_literal(input) { + pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator { + std::iter::from_fn(move || match next_identifier(input) { Ok((i, o)) => { input = i; debug_assert_ne!(o, ""); @@ -267,10 +286,10 @@ mod unicode_parser { } mod ascii_parser { - use super::parser::next_literal; + use super::parser::next_identifier; - pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator { - std::iter::from_fn(move || match next_literal(input) { + pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator { + std::iter::from_fn(move || match next_identifier(input) { Ok((i, o)) => { input = i; debug_assert_ne!(o, b""); @@ -613,11 +632,8 @@ mod test { } #[test] - fn tokenize_ignore_hex_enabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(true) - .leading_digits(true) - .build(); + fn tokenize_ignore_hex() { + 
let parser = TokenizerBuilder::new().build(); let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ @@ -631,54 +647,13 @@ mod test { } #[test] - fn tokenize_ignore_hex_disabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(true) - .build(); - - let input = "Hello 0xDEADBEEF World"; - let expected: Vec = vec![ - Identifier::new_unchecked("Hello", Case::None, 0), - Identifier::new_unchecked("0xDEADBEEF", Case::None, 6), - Identifier::new_unchecked("World", Case::None, 17), - ]; - let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); - assert_eq!(expected, actual); - let actual: Vec<_> = parser.parse_str(input).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn tokenize_leading_digits_enabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(true) - .build(); + fn tokenize_leading_digits() { + let parser = TokenizerBuilder::new().build(); let input = "Hello 0Hello 124 0xDEADBEEF World"; let expected: Vec = vec![ Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("0Hello", Case::None, 6), - Identifier::new_unchecked("0xDEADBEEF", Case::None, 17), - Identifier::new_unchecked("World", Case::None, 28), - ]; - let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); - assert_eq!(expected, actual); - let actual: Vec<_> = parser.parse_str(input).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn tokenize_leading_digits_disabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(false) - .build(); - - let input = "Hello 0Hello 124 0xDEADBEEF World"; - let expected: Vec = vec![ - Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("World", Case::None, 28), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); diff --git a/docs/reference.md b/docs/reference.md index 13e1625..67247bb 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -26,8 +26,6 
@@ Configuration is read from the following (in precedence order) | default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-file | \- | bool | Verifying spelling in files. | | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | -| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. | -| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | diff --git a/src/policy.rs b/src/policy.rs index 86233d7..ede2fa6 100644 --- a/src/policy.rs +++ b/src/policy.rs @@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> { tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); + if !tokenizer_config.ignore_hex() { + log::warn!("`ignore-hex` is deprecated"); + if !tokenizer_config.identifier_leading_digits() { + log::warn!("`identifier-leading-digits` is deprecated"); + } + } + let tokenizer = typos::tokens::TokenizerBuilder::new() .unicode(tokenizer_config.unicode()) - .ignore_hex(tokenizer_config.ignore_hex()) - .leading_digits(tokenizer_config.identifier_leading_digits()) .build(); let dict = crate::dict::BuiltIn::new(dict_config.locale());