refactor(typos)!: Bake ignores into parser

This is prep for other items to be ignored

BREAKING CHANGE: `TokenizerBuilder` no longer takes config for ignoring
tokens.  Related, we now ignore token-ignore config flags.
This commit is contained in:
Ed Page 2021-06-29 10:40:58 -05:00
parent a46cc76bae
commit 32f5e6c682
3 changed files with 150 additions and 172 deletions

View file

@@ -4,8 +4,6 @@ use bstr::ByteSlice;
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder { pub struct TokenizerBuilder {
unicode: bool, unicode: bool,
ignore_hex: bool,
leading_digits: bool,
} }
impl TokenizerBuilder { impl TokenizerBuilder {
@@ -19,39 +17,15 @@ impl TokenizerBuilder {
self self
} }
/// Specify that hexadecimal numbers should be ignored.
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
self.ignore_hex = yes;
self
}
/// Specify that leading digits are allowed for Identifiers.
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
self.leading_digits = yes;
self
}
pub fn build(&self) -> Tokenizer { pub fn build(&self) -> Tokenizer {
let TokenizerBuilder { let TokenizerBuilder { unicode } = self.clone();
unicode, Tokenizer { unicode }
leading_digits,
ignore_hex,
} = self.clone();
Tokenizer {
unicode,
leading_digits,
ignore_hex,
}
} }
} }
impl Default for TokenizerBuilder { impl Default for TokenizerBuilder {
fn default() -> Self { fn default() -> Self {
Self { Self { unicode: true }
unicode: true,
leading_digits: false,
ignore_hex: true,
}
} }
} }
@@ -59,8 +33,6 @@ impl Default for TokenizerBuilder {
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Tokenizer { pub struct Tokenizer {
unicode: bool, unicode: bool,
leading_digits: bool,
ignore_hex: bool,
} }
impl Tokenizer { impl Tokenizer {
@@ -70,9 +42,9 @@ impl Tokenizer {
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) { let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
itertools::Either::Left(unicode_parser::iter_literals(content)) itertools::Either::Left(unicode_parser::iter_identifiers(content))
} else { } else {
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes()))
}; };
iter.filter_map(move |identifier| { iter.filter_map(move |identifier| {
let offset = offset(content.as_bytes(), identifier.as_bytes()); let offset = offset(content.as_bytes(), identifier.as_bytes());
@@ -82,10 +54,11 @@ impl Tokenizer {
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode && !ByteSlice::is_ascii(content) { let iter = if self.unicode && !ByteSlice::is_ascii(content) {
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); let iter =
Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c));
itertools::Either::Left(iter) itertools::Either::Left(iter)
} else { } else {
itertools::Either::Right(ascii_parser::iter_literals(content)) itertools::Either::Right(ascii_parser::iter_identifiers(content))
}; };
iter.filter_map(move |identifier| { iter.filter_map(move |identifier| {
let offset = offset(content, identifier.as_bytes()); let offset = offset(content, identifier.as_bytes());
@@ -95,17 +68,6 @@ impl Tokenizer {
fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> { fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
debug_assert!(!identifier.is_empty()); debug_assert!(!identifier.is_empty());
if self.leading_digits {
if is_number(identifier.as_bytes()) {
return None;
}
if self.ignore_hex && is_hex(identifier.as_bytes()) {
return None;
}
} else if is_digit(identifier.as_bytes()[0]) {
return None;
}
let case = Case::None; let case = Case::None;
Some(Identifier::new_unchecked(identifier, case, offset)) Some(Identifier::new_unchecked(identifier, case, offset))
@@ -164,98 +126,155 @@ impl<'s> Iterator for Utf8Chunks<'s> {
} }
} }
fn is_number(ident: &[u8]) -> bool {
ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b))
}
fn is_hex(ident: &[u8]) -> bool {
if ident.len() < 3 {
false
} else {
ident[0] == b'0'
&& ident[1] == b'x'
&& ident[2..]
.iter()
.all(|b| is_hex_digit(*b) || is_digit_sep(*b))
}
}
#[inline]
fn is_digit(chr: u8) -> bool {
chr.is_ascii_digit()
}
#[inline]
fn is_digit_sep(chr: u8) -> bool {
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
chr == b'_' || chr == b'\''
}
#[inline]
fn is_hex_digit(chr: u8) -> bool {
chr.is_ascii_hexdigit()
}
mod parser { mod parser {
use nom::branch::*;
use nom::bytes::complete::*; use nom::bytes::complete::*;
use nom::character::complete::*;
use nom::sequence::*; use nom::sequence::*;
use nom::IResult; use nom::{AsChar, IResult};
pub(crate) trait AsChar: nom::AsChar { pub(crate) fn next_identifier<T>(input: T) -> IResult<T, T>
#[allow(clippy::wrong_self_convention)]
fn is_xid_continue(self) -> bool;
}
impl AsChar for u8 {
fn is_xid_continue(self) -> bool {
(b'a'..=b'z').contains(&self)
|| (b'A'..=b'Z').contains(&self)
|| (b'0'..=b'9').contains(&self)
|| self == b'_'
}
}
impl AsChar for char {
fn is_xid_continue(self) -> bool {
unicode_xid::UnicodeXID::is_xid_continue(self)
}
}
pub(crate) fn next_literal<T>(input: T) -> IResult<T, T>
where where
T: nom::InputTakeAtPosition, T: nom::InputTakeAtPosition
<T as nom::InputTakeAtPosition>::Item: AsChar, + nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ nom::Offset
+ Clone
+ PartialEq
+ std::fmt::Debug,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{ {
preceded(literal_sep, identifier)(input) preceded(ignore, identifier)(input)
}
fn literal_sep<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar,
{
take_till(AsChar::is_xid_continue)(input)
} }
fn identifier<T>(input: T) -> IResult<T, T> fn identifier<T>(input: T) -> IResult<T, T>
where where
T: nom::InputTakeAtPosition, T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar, <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{ {
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a // or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse // proper word boundary parse
take_while1(AsChar::is_xid_continue)(input) take_while1(is_xid_continue)(input)
}
fn ignore<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ nom::Offset
+ Clone
+ PartialEq
+ std::fmt::Debug,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
take_many0(alt((
sep1,
terminated(hex_literal, sep1),
terminated(dec_literal, sep1),
)))(input)
}
fn sep1<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{
take_till1(is_xid_continue)(input)
}
fn dec_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{
take_while1(is_dec_digit)(input)
}
fn hex_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
preceded(
pair(char('0'), alt((char('x'), char('X')))),
take_while1(is_hex_digit),
)(input)
}
fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
where
I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
F: nom::Parser<I, I, E>,
E: nom::error::ParseError<I>,
{
move |i: I| {
let mut current = i.clone();
loop {
match f.parse(current.clone()) {
Err(nom::Err::Error(_)) => {
let offset = i.offset(&current);
let (after, before) = i.take_split(offset);
return Ok((after, before));
}
Err(e) => {
return Err(e);
}
Ok((next, _)) => {
if next == current {
return Err(nom::Err::Error(E::from_error_kind(
i,
nom::error::ErrorKind::Many0,
)));
}
current = next;
}
}
}
}
}
fn is_dec_digit(i: impl AsChar + Copy) -> bool {
i.is_dec_digit() || is_digit_sep(i.as_char())
}
fn is_hex_digit(i: impl AsChar + Copy) -> bool {
i.is_hex_digit() || is_digit_sep(i.as_char())
}
fn is_xid_continue(i: impl AsChar + Copy) -> bool {
let c = i.as_char();
unicode_xid::UnicodeXID::is_xid_continue(c)
}
#[inline]
fn is_digit_sep(chr: char) -> bool {
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
chr == '_' || chr == '\''
} }
} }
mod unicode_parser { mod unicode_parser {
use super::parser::next_literal; use super::parser::next_identifier;
pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> { pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) { std::iter::from_fn(move || match next_identifier(input) {
Ok((i, o)) => { Ok((i, o)) => {
input = i; input = i;
debug_assert_ne!(o, ""); debug_assert_ne!(o, "");
@@ -267,10 +286,10 @@ mod unicode_parser {
} }
mod ascii_parser { mod ascii_parser {
use super::parser::next_literal; use super::parser::next_identifier;
pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> { pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) { std::iter::from_fn(move || match next_identifier(input) {
Ok((i, o)) => { Ok((i, o)) => {
input = i; input = i;
debug_assert_ne!(o, b""); debug_assert_ne!(o, b"");
@@ -613,11 +632,8 @@ mod test {
} }
#[test] #[test]
fn tokenize_ignore_hex_enabled() { fn tokenize_ignore_hex() {
let parser = TokenizerBuilder::new() let parser = TokenizerBuilder::new().build();
.ignore_hex(true)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World"; let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
@@ -631,54 +647,13 @@ mod test {
} }
#[test] #[test]
fn tokenize_ignore_hex_disabled() { fn tokenize_leading_digits() {
let parser = TokenizerBuilder::new() let parser = TokenizerBuilder::new().build();
.ignore_hex(false)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
Identifier::new_unchecked("World", Case::None, 17),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_leading_digits_enabled() {
let parser = TokenizerBuilder::new()
.ignore_hex(false)
.leading_digits(true)
.build();
let input = "Hello 0Hello 124 0xDEADBEEF World"; let input = "Hello 0Hello 124 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("0Hello", Case::None, 6), Identifier::new_unchecked("0Hello", Case::None, 6),
Identifier::new_unchecked("0xDEADBEEF", Case::None, 17),
Identifier::new_unchecked("World", Case::None, 28),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_leading_digits_disabled() {
let parser = TokenizerBuilder::new()
.ignore_hex(false)
.leading_digits(false)
.build();
let input = "Hello 0Hello 124 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", Case::None, 28), Identifier::new_unchecked("World", Case::None, 28),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();

View file

@@ -26,8 +26,6 @@ Configuration is read from the following (in precedence order)
| default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-filename | \- | bool | Verifying spelling in file names. |
| default.check-file | \- | bool | Verifying spelling in files. | | default.check-file | \- | bool | Verifying spelling in files. |
| default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
| default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |

View file

@@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> {
tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults);
let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
if !tokenizer_config.ignore_hex() {
log::warn!("`ignore-hex` is deprecated");
if !tokenizer_config.identifier_leading_digits() {
log::warn!("`identifier-leading-digits` is deprecated");
}
}
let tokenizer = typos::tokens::TokenizerBuilder::new() let tokenizer = typos::tokens::TokenizerBuilder::new()
.unicode(tokenizer_config.unicode()) .unicode(tokenizer_config.unicode())
.ignore_hex(tokenizer_config.ignore_hex())
.leading_digits(tokenizer_config.identifier_leading_digits())
.build(); .build();
let dict = crate::dict::BuiltIn::new(dict_config.locale()); let dict = crate::dict::BuiltIn::new(dict_config.locale());