Merge pull request #293 from epage/parse

Detect non-identifiers to ignore
This commit is contained in:
Ed Page 2021-06-29 15:03:56 -05:00 committed by GitHub
commit effc21ed10
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 447 additions and 161 deletions

View file

@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
<!-- next-header --> <!-- next-header -->
## [Unreleased] - ReleaseDate ## [Unreleased] - ReleaseDate
#### Change of Behavior
- `ignore-hex` and `identifier-leading-digit` are deprecated and `typos` acts as
if `ignore-hex=true` and `identifier-leading-digit=false`.
#### Features
- Automatically ignore
- UUIDs
- SHAs
- base64 encoded data (must be at least 90 bytes)
- emails
- URLs
#### Performance
- Due to new literal detection, finding identifiers takes 10x longer.
Combined with word splitting, it only takes 3x longer. The majority of the
time is spent in dictionary lookups, so we don't expect this to have too much impact in the end.
## [1.0.10] - 2021-06-28 ## [1.0.10] - 2021-06-28
#### Bug Fixes #### Bug Fixes

View file

@ -4,8 +4,6 @@ use bstr::ByteSlice;
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder { pub struct TokenizerBuilder {
unicode: bool, unicode: bool,
ignore_hex: bool,
leading_digits: bool,
} }
impl TokenizerBuilder { impl TokenizerBuilder {
@ -19,39 +17,15 @@ impl TokenizerBuilder {
self self
} }
/// Specify that hexadecimal numbers should be ignored.
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
self.ignore_hex = yes;
self
}
/// Specify that leading digits are allowed for Identifiers.
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
self.leading_digits = yes;
self
}
pub fn build(&self) -> Tokenizer { pub fn build(&self) -> Tokenizer {
let TokenizerBuilder { let TokenizerBuilder { unicode } = self.clone();
unicode, Tokenizer { unicode }
leading_digits,
ignore_hex,
} = self.clone();
Tokenizer {
unicode,
leading_digits,
ignore_hex,
}
} }
} }
impl Default for TokenizerBuilder { impl Default for TokenizerBuilder {
fn default() -> Self { fn default() -> Self {
Self { Self { unicode: true }
unicode: true,
leading_digits: false,
ignore_hex: true,
}
} }
} }
@ -59,8 +33,6 @@ impl Default for TokenizerBuilder {
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Tokenizer { pub struct Tokenizer {
unicode: bool, unicode: bool,
leading_digits: bool,
ignore_hex: bool,
} }
impl Tokenizer { impl Tokenizer {
@ -70,9 +42,9 @@ impl Tokenizer {
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) { let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
itertools::Either::Left(unicode_parser::iter_literals(content)) itertools::Either::Left(unicode_parser::iter_identifiers(content))
} else { } else {
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes()))
}; };
iter.filter_map(move |identifier| { iter.filter_map(move |identifier| {
let offset = offset(content.as_bytes(), identifier.as_bytes()); let offset = offset(content.as_bytes(), identifier.as_bytes());
@ -82,10 +54,11 @@ impl Tokenizer {
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode && !ByteSlice::is_ascii(content) { let iter = if self.unicode && !ByteSlice::is_ascii(content) {
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); let iter =
Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c));
itertools::Either::Left(iter) itertools::Either::Left(iter)
} else { } else {
itertools::Either::Right(ascii_parser::iter_literals(content)) itertools::Either::Right(ascii_parser::iter_identifiers(content))
}; };
iter.filter_map(move |identifier| { iter.filter_map(move |identifier| {
let offset = offset(content, identifier.as_bytes()); let offset = offset(content, identifier.as_bytes());
@ -95,17 +68,6 @@ impl Tokenizer {
fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> { fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
debug_assert!(!identifier.is_empty()); debug_assert!(!identifier.is_empty());
if self.leading_digits {
if is_number(identifier.as_bytes()) {
return None;
}
if self.ignore_hex && is_hex(identifier.as_bytes()) {
return None;
}
} else if is_digit(identifier.as_bytes()[0]) {
return None;
}
let case = Case::None; let case = Case::None;
Some(Identifier::new_unchecked(identifier, case, offset)) Some(Identifier::new_unchecked(identifier, case, offset))
@ -164,98 +126,348 @@ impl<'s> Iterator for Utf8Chunks<'s> {
} }
} }
fn is_number(ident: &[u8]) -> bool {
ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b))
}
fn is_hex(ident: &[u8]) -> bool {
if ident.len() < 3 {
false
} else {
ident[0] == b'0'
&& ident[1] == b'x'
&& ident[2..]
.iter()
.all(|b| is_hex_digit(*b) || is_digit_sep(*b))
}
}
#[inline]
fn is_digit(chr: u8) -> bool {
chr.is_ascii_digit()
}
#[inline]
fn is_digit_sep(chr: u8) -> bool {
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
chr == b'_' || chr == b'\''
}
#[inline]
fn is_hex_digit(chr: u8) -> bool {
chr.is_ascii_hexdigit()
}
mod parser { mod parser {
use nom::branch::*;
use nom::bytes::complete::*; use nom::bytes::complete::*;
use nom::character::complete::*;
use nom::combinator::*;
use nom::sequence::*; use nom::sequence::*;
use nom::IResult; use nom::{AsChar, IResult};
pub(crate) trait AsChar: nom::AsChar { pub(crate) fn next_identifier<T>(input: T) -> IResult<T, T>
#[allow(clippy::wrong_self_convention)]
fn is_xid_continue(self) -> bool;
}
impl AsChar for u8 {
fn is_xid_continue(self) -> bool {
(b'a'..=b'z').contains(&self)
|| (b'A'..=b'Z').contains(&self)
|| (b'0'..=b'9').contains(&self)
|| self == b'_'
}
}
impl AsChar for char {
fn is_xid_continue(self) -> bool {
unicode_xid::UnicodeXID::is_xid_continue(self)
}
}
pub(crate) fn next_literal<T>(input: T) -> IResult<T, T>
where where
T: nom::InputTakeAtPosition, T: nom::InputTakeAtPosition
<T as nom::InputTakeAtPosition>::Item: AsChar, + nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Offset
+ Clone
+ PartialEq
+ std::fmt::Debug,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{ {
preceded(literal_sep, identifier)(input) preceded(ignore, identifier)(input)
}
fn literal_sep<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar,
{
take_till(AsChar::is_xid_continue)(input)
} }
fn identifier<T>(input: T) -> IResult<T, T> fn identifier<T>(input: T) -> IResult<T, T>
where where
T: nom::InputTakeAtPosition, T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar, <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{ {
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a // or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse // proper word boundary parse
take_while1(AsChar::is_xid_continue)(input) take_while1(is_xid_continue)(input)
}
/// Consume leading input that should be skipped before the next identifier:
/// separator characters interleaved with literals we deliberately ignore
/// (UUIDs, SHAs, hex/decimal numbers, base64 blobs, emails, URLs).
fn ignore<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Offset
+ Clone
+ PartialEq
+ std::fmt::Debug,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
// `alt` tries branches in order, so the more specific literals (uuid, hash)
// come before the shorter numeric forms. Each literal must be followed by a
// separator (`sep1`) so a literal-looking prefix of a longer identifier is
// not eaten.
take_many0(alt((
terminated(uuid_literal, sep1),
terminated(hash_literal, sep1),
terminated(hex_literal, sep1),
terminated(dec_literal, sep1),
terminated(base64_literal, sep1),
terminated(email_literal, sep1),
terminated(url_literal, sep1),
sep1,
)))(input)
}
/// Consume one or more non-identifier characters (anything outside
/// `XID_Continue`); fails on empty input or at an identifier character.
fn sep1<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{
take_till1(is_xid_continue)(input)
}
/// Match a run of decimal digits, allowing the digit separators `_` and `'`
/// used in numeric literals (e.g. `1_000_000`).
fn dec_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
{
take_while1(is_dec_digit_with_sep)(input)
}
/// Match a hexadecimal literal: `0x`/`0X` followed by hex digits (with
/// optional `_`/`'` separators). Because of `preceded`, the returned span
/// excludes the prefix, but the prefix is still consumed from the input.
fn hex_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Slice<std::ops::RangeFrom<usize>>
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
preceded(
// Accept both `0x` and `0X` prefixes.
pair(char('0'), alt((char('x'), char('X')))),
take_while1(is_hex_digit_with_sep),
)(input)
}
/// Match a hyphenated UUID in canonical `8-4-4-4-12` form.
/// Only lowercase hex digits are accepted — presumably because lowercase is
/// the canonical rendering; uppercase UUIDs will not match (TODO confirm
/// that is intended).
fn uuid_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Offset
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Slice<std::ops::RangeFrom<usize>>
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
// `recognize` returns the whole matched span (digits and hyphens) as one slice.
recognize(tuple((
take_while_m_n(8, 8, is_lower_hex_digit),
char('-'),
take_while_m_n(4, 4, is_lower_hex_digit),
char('-'),
take_while_m_n(4, 4, is_lower_hex_digit),
char('-'),
take_while_m_n(4, 4, is_lower_hex_digit),
char('-'),
take_while_m_n(12, 12, is_lower_hex_digit),
)))(input)
}
/// Match a long run of lowercase hex digits (40–64 chars) that looks like a
/// git SHA-1 or SHA-256 hash.
fn hash_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Offset
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Slice<std::ops::RangeFrom<usize>>
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
// Size considerations:
// - sha-1 is git's original hash
// - sha-256 is git's new hash
// - Git hashes can be abbreviated but we need a good abbreviation that won't be mistaken
// for a variable name
const SHA_1_MAX: usize = 40;
const SHA_256_MAX: usize = 64;
take_while_m_n(SHA_1_MAX, SHA_256_MAX, is_lower_hex_digit)(input)
}
/// Match a base64-encoded blob, including its trailing `=` padding.
/// Requires at least 90 encoded characters so ordinary long identifiers are
/// not mistaken for base64 (matches the changelog: "must be at least 90 bytes").
fn base64_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Offset
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Slice<std::ops::RangeFrom<usize>>
+ std::fmt::Debug
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
let (padding, captured) = take_while1(is_base64_digit)(input.clone())?;
// Too short to be confidently base64 — reject so the run is parsed as identifiers.
if captured.input_len() < 90 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::LengthValue,
)));
}
// Base64 encodes in 4-character chunks and completes the final chunk with
// `=` padding. Require exactly the number of `=` needed to round the
// captured length up to a multiple of CHUNK (zero when already aligned).
const CHUNK: usize = 4;
let padding_offset = input.offset(&padding);
let mut padding_len = CHUNK - padding_offset % CHUNK;
if padding_len == CHUNK {
padding_len = 0;
}
let (after, _) = take_while_m_n(padding_len, padding_len, is_base64_padding)(padding)?;
// Split the original input at the end of the digits-plus-padding span.
let after_offset = input.offset(&after);
Ok(input.take_split(after_offset))
}
/// Match an email-like token: `local@domain`. This is a loose approximation,
/// not a full RFC 5322 parse.
fn email_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Offset
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Slice<std::ops::RangeFrom<usize>>
+ std::fmt::Debug
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
recognize(tuple((
take_while1(is_localport_char),
char('@'),
take_while1(is_domain_char),
)))(input)
}
/// Match a URL-like token: optional `scheme://`, optional `user@`, a domain,
/// optional `:port`, then a required `/` and the rest of the path/query.
/// The `/` is required, so a bare domain alone does not count as a URL.
fn url_literal<T>(input: T) -> IResult<T, T>
where
T: nom::InputTakeAtPosition
+ nom::InputTake
+ nom::InputIter
+ nom::InputLength
+ nom::Offset
+ nom::Slice<std::ops::RangeTo<usize>>
+ nom::Slice<std::ops::RangeFrom<usize>>
+ std::fmt::Debug
+ Clone,
<T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
<T as nom::InputIter>::Item: AsChar + Copy,
{
recognize(tuple((
opt(terminated(
take_while1(is_scheme_char),
// HACK: Technically you can skip `//` if you don't have a domain but that would
// get messy to support.
tuple((char(':'), char('/'), char('/'))),
)),
tuple((
// Optional userinfo (`user@`) before the host.
opt(terminated(take_while1(is_localport_char), char('@'))),
take_while1(is_domain_char),
// Optional `:port` suffix.
opt(preceded(char(':'), take_while1(AsChar::is_dec_digit))),
)),
char('/'),
// HACK: Too lazy to enumerate
take_while(is_localport_char),
)))(input)
}
/// Like nom's `many0`, but returns the entire consumed span as one slice of
/// the input instead of collecting each match into a `Vec` (no allocation).
fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
where
I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
F: nom::Parser<I, I, E>,
E: nom::error::ParseError<I>,
{
move |i: I| {
let mut current = i.clone();
loop {
match f.parse(current.clone()) {
// `f` no longer matches: succeed with everything consumed so far
// (possibly an empty span — this combinator never fails on no match).
Err(nom::Err::Error(_)) => {
let offset = i.offset(&current);
let (after, before) = i.take_split(offset);
return Ok((after, before));
}
// Hard failures / incomplete input are propagated unchanged.
Err(e) => {
return Err(e);
}
Ok((next, _)) => {
// Guard against an infinite loop when `f` succeeds without
// consuming anything (same protection nom's `many0` has).
if next == current {
return Err(nom::Err::Error(E::from_error_kind(
i,
nom::error::ErrorKind::Many0,
)));
}
current = next;
}
}
}
}
}
/// Is `i` a decimal digit or a digit separator (`_`, `'`) embedded in a
/// numeric literal?
#[inline]
fn is_dec_digit_with_sep(i: impl AsChar + Copy) -> bool {
    let c = i.as_char();
    c.is_ascii_digit() || is_digit_sep(c)
}
/// Is `i` a hex digit (either case) or an embedded digit separator?
#[inline]
fn is_hex_digit_with_sep(i: impl AsChar + Copy) -> bool {
    let c = i.as_char();
    c.is_ascii_hexdigit() || is_digit_sep(c)
}
/// Is `i` a lowercase hexadecimal digit (`0-9` or `a-f`)?
#[inline]
fn is_lower_hex_digit(i: impl AsChar + Copy) -> bool {
    matches!(i.as_char(), 'a'..='f' | '0'..='9')
}
/// Is `i` a character of the standard base64 alphabet
/// (`A-Z`, `a-z`, `0-9`, `+`, `/`)?
#[inline]
fn is_base64_digit(i: impl AsChar + Copy) -> bool {
    let c = i.as_char();
    c.is_ascii_alphanumeric() || c == '+' || c == '/'
}
/// Is `i` the base64 padding character `=`?
#[inline]
fn is_base64_padding(i: impl AsChar + Copy) -> bool {
    i.as_char() == '='
}
/// Is `i` allowed in the local part of an email address?
/// Loose approximation of the RFC 5321 atext set, plus `(`, `)`, `.`.
#[inline]
fn is_localport_char(i: impl AsChar + Copy) -> bool {
    let c = i.as_char();
    c.is_ascii_alphanumeric() || "!#$%&'*+-/=?^_`{|}~().".contains(c)
}
/// Is `i` allowed in a domain name token?
/// Letters, digits, `-`, `.` — plus `(`/`)` (loose approximation).
#[inline]
fn is_domain_char(i: impl AsChar + Copy) -> bool {
    let c = i.as_char();
    c.is_ascii_alphanumeric() || "-().".contains(c)
}
/// Is `i` allowed in a URI scheme?
///
/// RFC 3986: `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`, and
/// schemes are case-insensitive. The original only accepted lowercase
/// letters, so URLs written as `HTTP://…` were not ignored; accept ASCII
/// uppercase as well.
#[inline]
fn is_scheme_char(i: impl AsChar + Copy) -> bool {
    let c = i.as_char();
    c.is_ascii_alphanumeric() || "+.-".contains(c)
}
/// Is `i` an identifier character per Unicode `XID_Continue`
/// (a superset of `XID_Start`)?
#[inline]
fn is_xid_continue(i: impl AsChar + Copy) -> bool {
    unicode_xid::UnicodeXID::is_xid_continue(i.as_char())
}
/// Is `chr` a digit-group separator allowed inside numeric literals?
#[inline]
fn is_digit_sep(chr: char) -> bool {
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
chr == '_' || chr == '\''
} }
} }
mod unicode_parser { mod unicode_parser {
use super::parser::next_literal; use super::parser::next_identifier;
pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> { pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) { std::iter::from_fn(move || match next_identifier(input) {
Ok((i, o)) => { Ok((i, o)) => {
input = i; input = i;
debug_assert_ne!(o, ""); debug_assert_ne!(o, "");
@ -267,10 +479,10 @@ mod unicode_parser {
} }
mod ascii_parser { mod ascii_parser {
use super::parser::next_literal; use super::parser::next_identifier;
pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> { pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) { std::iter::from_fn(move || match next_identifier(input) {
Ok((i, o)) => { Ok((i, o)) => {
input = i; input = i;
debug_assert_ne!(o, b""); debug_assert_ne!(o, b"");
@ -613,11 +825,8 @@ mod test {
} }
#[test] #[test]
fn tokenize_ignore_hex_enabled() { fn tokenize_ignore_hex() {
let parser = TokenizerBuilder::new() let parser = TokenizerBuilder::new().build();
.ignore_hex(true)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World"; let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
@ -631,17 +840,13 @@ mod test {
} }
#[test] #[test]
fn tokenize_ignore_hex_disabled() { fn tokenize_ignore_uuid() {
let parser = TokenizerBuilder::new() let parser = TokenizerBuilder::new().build();
.ignore_hex(false)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World"; let input = "Hello 123e4567-e89b-12d3-a456-426652340000 World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("0xDEADBEEF", Case::None, 6), Identifier::new_unchecked("World", Case::None, 43),
Identifier::new_unchecked("World", Case::None, 17),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
@ -650,35 +855,88 @@ mod test {
} }
#[test] #[test]
fn tokenize_leading_digits_enabled() { fn tokenize_ignore_hash() {
let parser = TokenizerBuilder::new() let parser = TokenizerBuilder::new().build();
.ignore_hex(false)
.leading_digits(true) let input = "Hello 485865fd0412e40d041e861506bb3ac11a3a91e3 World";
.build(); let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", Case::None, 47),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_ignore_base64() {
let parser = TokenizerBuilder::new().build();
let input = "Good Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X1Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X122Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X12== Bye";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Good", Case::None, 0),
Identifier::new_unchecked("Bye", Case::None, 134),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_ignore_email() {
let parser = TokenizerBuilder::new().build();
let input = "Good example@example.com Bye";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Good", Case::None, 0),
Identifier::new_unchecked("Bye", Case::None, 25),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_ignore_min_url() {
let parser = TokenizerBuilder::new().build();
let input = "Good example.com/hello Bye";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Good", Case::None, 0),
Identifier::new_unchecked("Bye", Case::None, 23),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_ignore_max_url() {
let parser = TokenizerBuilder::new().build();
let input = "Good http://user@example.com:3142/hello?query=value&extra=two#fragment Bye";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Good", Case::None, 0),
Identifier::new_unchecked("Bye", Case::None, 71),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_leading_digits() {
let parser = TokenizerBuilder::new().build();
let input = "Hello 0Hello 124 0xDEADBEEF World"; let input = "Hello 0Hello 124 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("0Hello", Case::None, 6), Identifier::new_unchecked("0Hello", Case::None, 6),
Identifier::new_unchecked("0xDEADBEEF", Case::None, 17),
Identifier::new_unchecked("World", Case::None, 28),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_leading_digits_disabled() {
let parser = TokenizerBuilder::new()
.ignore_hex(false)
.leading_digits(false)
.build();
let input = "Hello 0Hello 124 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", Case::None, 28), Identifier::new_unchecked("World", Case::None, 28),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();

View file

@ -8,7 +8,12 @@
| Per-Lang Dict | Yes | ? | No | Yes | | Per-Lang Dict | Yes | ? | No | Yes |
| CamelCase | Yes | ? | No | Yes | | CamelCase | Yes | ? | No | Yes |
| snake_case | Yes | ? | No | Yes | | snake_case | Yes | ? | No | Yes |
| Ignore email | Yes | Yes | No | No |
| Ignore url | Yes | Yes | No | No |
| Ignore Hex | Yes | ? | No | Yes | | Ignore Hex | Yes | ? | No | Yes |
| Ignore UUID | Yes | ? | No | No |
| Ignore base64 | Yes | ? | No | No |
| Ignore SHAs | Yes | ? | No | No |
| C-Escapes | No ([#20][def-3]) | ? | No | Yes | | C-Escapes | No ([#20][def-3]) | ? | No | Yes |
| Encodings | UTF-8 / UTF-16 | ? | Auto | Auto | | Encodings | UTF-8 / UTF-16 | ? | Auto | Auto |
| Whole-project | Yes | Yes | Yes | No | | Whole-project | Yes | Yes | Yes | No |

View file

@ -26,8 +26,6 @@ Configuration is read from the following (in precedence order)
| default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-filename | \- | bool | Verifying spelling in file names. |
| default.check-file | \- | bool | Verifying spelling in files. | | default.check-file | \- | bool | Verifying spelling in files. |
| default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
| default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |

View file

@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> {
tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults);
let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
if !tokenizer_config.ignore_hex() {
log::warn!("`ignore-hex` is deprecated");
if !tokenizer_config.identifier_leading_digits() {
log::warn!("`identifier-leading-digits` is deprecated");
}
}
let tokenizer = typos::tokens::TokenizerBuilder::new() let tokenizer = typos::tokens::TokenizerBuilder::new()
.unicode(tokenizer_config.unicode()) .unicode(tokenizer_config.unicode())
.ignore_hex(tokenizer_config.ignore_hex())
.leading_digits(tokenizer_config.identifier_leading_digits())
.build(); .build();
let dict = crate::dict::BuiltIn::new(dict_config.locale()); let dict = crate::dict::BuiltIn::new(dict_config.locale());