mirror of https://github.com/crate-ci/typos.git
synced 2024-12-22 23:52:12 -05:00

Merge pull request #293 from epage/parse

Detect non-identifiers to ignore

This commit is contained in: commit effc21ed10

5 changed files with 447 additions and 161 deletions
CHANGELOG.md (20 lines changed)

@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

 <!-- next-header -->

 ## [Unreleased] - ReleaseDate

+#### Change of Behavior
+
+- `ignore-hex` and `identifier-leading-digit` are deprecated and `typos` acts as
+  if `ignore-hex=true` and `identifier-leading-digit=false`.
+
+#### Features
+
+- Automatically ignore
+  - UUIDs
+  - SHAs
+  - base64 encoded data (must be at least 90 bytes)
+  - emails
+  - URLs
+
+#### Performance
+
+- Due to new literal detection, finding identifiers takes 10x longer.
+  Combined with word splitting, it only takes 3x longer. The majority of the
+  time is spent in dictionary lookups, so we don't expect this to have too much impact in the end.

 ## [1.0.10] - 2021-06-28

 #### Bug Fixes
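In practice the new defaults mean literal-looking runs never reach the dictionary. A minimal sketch of the behavior described above, assuming the `parse_str`/`token()` API that appears in this diff (the exact input strings are illustrative):

    use typos::tokens::TokenizerBuilder;

    // Hex values, emails, etc. are consumed by the new `ignore` parser
    // before identifier parsing, so only real identifiers come back.
    let tokenizer = TokenizerBuilder::new().build();
    let ids: Vec<_> = tokenizer
        .parse_str("handle_request 0xDEADBEEF user@example.com")
        .map(|id| id.token())
        .collect();
    assert_eq!(ids, vec!["handle_request"]);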
@@ -4,8 +4,6 @@ use bstr::ByteSlice;

 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TokenizerBuilder {
     unicode: bool,
-    ignore_hex: bool,
-    leading_digits: bool,
 }

 impl TokenizerBuilder {
@@ -19,39 +17,15 @@ impl TokenizerBuilder {
         self
     }

-    /// Specify that hexadecimal numbers should be ignored.
-    pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
-        self.ignore_hex = yes;
-        self
-    }
-
-    /// Specify that leading digits are allowed for Identifiers.
-    pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
-        self.leading_digits = yes;
-        self
-    }
-
     pub fn build(&self) -> Tokenizer {
-        let TokenizerBuilder {
-            unicode,
-            leading_digits,
-            ignore_hex,
-        } = self.clone();
-        Tokenizer {
-            unicode,
-            leading_digits,
-            ignore_hex,
-        }
+        let TokenizerBuilder { unicode } = self.clone();
+        Tokenizer { unicode }
     }
 }

 impl Default for TokenizerBuilder {
     fn default() -> Self {
-        Self {
-            unicode: true,
-            leading_digits: false,
-            ignore_hex: true,
-        }
+        Self { unicode: true }
     }
 }
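With hex and digit handling hardwired, `unicode` is the only knob left on the builder; a minimal sketch of the surviving API:

    // `unicode(false)` would force the ASCII-only parser; the default
    // (true) picks a parser per input, as parse_str/parse_bytes show below.
    let tokenizer = TokenizerBuilder::new().unicode(true).build();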
@@ -59,8 +33,6 @@ impl Default for TokenizerBuilder {
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
     unicode: bool,
-    leading_digits: bool,
-    ignore_hex: bool,
 }

 impl Tokenizer {
@@ -70,9 +42,9 @@ impl Tokenizer {

     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
         let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
-            itertools::Either::Left(unicode_parser::iter_literals(content))
+            itertools::Either::Left(unicode_parser::iter_identifiers(content))
         } else {
-            itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
+            itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes()))
         };
         iter.filter_map(move |identifier| {
             let offset = offset(content.as_bytes(), identifier.as_bytes());
@@ -82,10 +54,11 @@ impl Tokenizer {

     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
         let iter = if self.unicode && !ByteSlice::is_ascii(content) {
-            let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
+            let iter =
+                Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c));
             itertools::Either::Left(iter)
         } else {
-            itertools::Either::Right(ascii_parser::iter_literals(content))
+            itertools::Either::Right(ascii_parser::iter_identifiers(content))
         };
         iter.filter_map(move |identifier| {
             let offset = offset(content, identifier.as_bytes());
@@ -95,17 +68,6 @@ impl Tokenizer {

     fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
         debug_assert!(!identifier.is_empty());
-        if self.leading_digits {
-            if is_number(identifier.as_bytes()) {
-                return None;
-            }
-
-            if self.ignore_hex && is_hex(identifier.as_bytes()) {
-                return None;
-            }
-        } else if is_digit(identifier.as_bytes()[0]) {
-            return None;
-        }
-
         let case = Case::None;
         Some(Identifier::new_unchecked(identifier, case, offset))
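With the flag checks gone, `transform` only stamps case and offset; number and hex filtering now happens up front in the `ignore` parser added below. The observable effect, sketched with the tokenizer API (inputs mirror the updated tests):

    // Leading-digit identifiers like `0Hello` now always survive, while
    // pure numbers (`124`) and hex literals (`0xBEEF`) are always skipped.
    let tokenizer = TokenizerBuilder::new().build();
    let ids: Vec<_> = tokenizer
        .parse_str("0Hello 124 0xBEEF")
        .map(|id| id.token())
        .collect();
    assert_eq!(ids, vec!["0Hello"]);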
@@ -164,98 +126,348 @@ impl<'s> Iterator for Utf8Chunks<'s> {
     }
 }

-fn is_number(ident: &[u8]) -> bool {
-    ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b))
-}
-
-fn is_hex(ident: &[u8]) -> bool {
-    if ident.len() < 3 {
-        false
-    } else {
-        ident[0] == b'0'
-            && ident[1] == b'x'
-            && ident[2..]
-                .iter()
-                .all(|b| is_hex_digit(*b) || is_digit_sep(*b))
-    }
-}
-
-#[inline]
-fn is_digit(chr: u8) -> bool {
-    chr.is_ascii_digit()
-}
-
-#[inline]
-fn is_digit_sep(chr: u8) -> bool {
-    // `_`: number literal separator in Rust and other languages
-    // `'`: number literal separator in C++
-    chr == b'_' || chr == b'\''
-}
-
-#[inline]
-fn is_hex_digit(chr: u8) -> bool {
-    chr.is_ascii_hexdigit()
-}
-
 mod parser {
+    use nom::branch::*;
     use nom::bytes::complete::*;
+    use nom::character::complete::*;
+    use nom::combinator::*;
     use nom::sequence::*;
-    use nom::IResult;
+    use nom::{AsChar, IResult};

-    pub(crate) trait AsChar: nom::AsChar {
-        #[allow(clippy::wrong_self_convention)]
-        fn is_xid_continue(self) -> bool;
-    }
-
-    impl AsChar for u8 {
-        fn is_xid_continue(self) -> bool {
-            (b'a'..=b'z').contains(&self)
-                || (b'A'..=b'Z').contains(&self)
-                || (b'0'..=b'9').contains(&self)
-                || self == b'_'
-        }
-    }
-
-    impl AsChar for char {
-        fn is_xid_continue(self) -> bool {
-            unicode_xid::UnicodeXID::is_xid_continue(self)
-        }
-    }
-
-    pub(crate) fn next_literal<T>(input: T) -> IResult<T, T>
+    pub(crate) fn next_identifier<T>(input: T) -> IResult<T, T>
     where
-        T: nom::InputTakeAtPosition,
-        <T as nom::InputTakeAtPosition>::Item: AsChar,
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Offset
+            + Clone
+            + PartialEq
+            + std::fmt::Debug,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
     {
-        preceded(literal_sep, identifier)(input)
+        preceded(ignore, identifier)(input)
     }

-    fn literal_sep<T>(input: T) -> IResult<T, T>
-    where
-        T: nom::InputTakeAtPosition,
-        <T as nom::InputTakeAtPosition>::Item: AsChar,
-    {
-        take_till(AsChar::is_xid_continue)(input)
-    }
-
     fn identifier<T>(input: T) -> IResult<T, T>
     where
         T: nom::InputTakeAtPosition,
-        <T as nom::InputTakeAtPosition>::Item: AsChar,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
     {
         // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
         // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
         // or unexpected cases than strip off start characters to a word since we aren't doing a
         // proper word boundary parse
-        take_while1(AsChar::is_xid_continue)(input)
+        take_while1(is_xid_continue)(input)
     }

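The comment above is the crux: `XID_Continue` is a superset of `XID_Start`, so `{XID_Continue}+` deliberately over-matches. A quick illustration with the `unicode-xid` crate this module already depends on:

    // XID_Start rejects a leading digit; XID_Continue accepts it, so
    // `0Hello` stays one candidate token instead of being split.
    assert!(!unicode_xid::UnicodeXID::is_xid_start('0'));
    assert!(unicode_xid::UnicodeXID::is_xid_continue('0'));
    assert!(unicode_xid::UnicodeXID::is_xid_start('H'));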
+    fn ignore<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Offset
+            + Clone
+            + PartialEq
+            + std::fmt::Debug,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        take_many0(alt((
+            terminated(uuid_literal, sep1),
+            terminated(hash_literal, sep1),
+            terminated(hex_literal, sep1),
+            terminated(dec_literal, sep1),
+            terminated(base64_literal, sep1),
+            terminated(email_literal, sep1),
+            terminated(url_literal, sep1),
+            sep1,
+        )))(input)
+    }

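Every branch is `terminated(..., sep1)`: a literal only counts if at least one non-identifier character follows, so a literal glued to a word (e.g. `0xBEEFy`) is not swallowed. A self-contained sketch of that shape, specialized to `&str` with simplified stand-ins for the generic parsers above:

    use nom::bytes::complete::{tag, take_while1};
    use nom::sequence::{preceded, terminated};
    use nom::IResult;

    // Hex literal: `0x` then hex digits (the real parser also allows
    // separators and `0X`; this is a trimmed-down stand-in).
    fn hex(input: &str) -> IResult<&str, &str> {
        preceded(tag("0x"), take_while1(|c: char| c.is_ascii_hexdigit()))(input)
    }

    // One or more characters that cannot continue an identifier.
    fn sep1(input: &str) -> IResult<&str, &str> {
        take_while1(|c: char| !c.is_alphanumeric() && c != '_')(input)
    }

    fn main() {
        // `0xBEEF ` is an ignorable literal followed by a separator...
        assert!(terminated(hex, sep1)("0xBEEF rest").is_ok());
        // ...but `0xBEEFy` is not: no separator follows the literal,
        // so the run falls through and is parsed as an identifier.
        assert!(terminated(hex, sep1)("0xBEEFy").is_err());
    }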
+    fn sep1<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+    {
+        take_till1(is_xid_continue)(input)
+    }
+
+    fn dec_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+    {
+        take_while1(is_dec_digit_with_sep)(input)
+    }

+    fn hex_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        preceded(
+            pair(char('0'), alt((char('x'), char('X')))),
+            take_while1(is_hex_digit_with_sep),
+        )(input)
+    }

+    fn uuid_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        recognize(tuple((
+            take_while_m_n(8, 8, is_lower_hex_digit),
+            char('-'),
+            take_while_m_n(4, 4, is_lower_hex_digit),
+            char('-'),
+            take_while_m_n(4, 4, is_lower_hex_digit),
+            char('-'),
+            take_while_m_n(4, 4, is_lower_hex_digit),
+            char('-'),
+            take_while_m_n(12, 12, is_lower_hex_digit),
+        )))(input)
+    }

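The five fixed-width groups match only the canonical lowercase 8-4-4-4-12 layout; a sanity check against the UUID used in the tests below:

    // 8 + 4 + 4 + 4 + 12 hex digits plus four dashes = 36 characters.
    let uuid = "123e4567-e89b-12d3-a456-426652340000";
    assert_eq!(uuid.len(), 36);
    assert!(uuid.split('-').map(str::len).eq([8, 4, 4, 4, 12]));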
+    fn hash_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        // Size considerations:
+        // - sha-1 is git's original hash
+        // - sha-256 is git's new hash
+        // - Git hashes can be abbreviated but we need a good abbreviation that won't be mistaken
+        //   for a variable name
+        const SHA_1_MAX: usize = 40;
+        const SHA_256_MAX: usize = 64;
+        take_while_m_n(SHA_1_MAX, SHA_256_MAX, is_lower_hex_digit)(input)
+    }

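The 40..=64 window means only full-length sha-1 and sha-256 style runs are skipped; short abbreviated hashes still parse as identifiers, so ordinary words spelled in hex letters are not silently excluded. A quick check with the hash from the tests below:

    // A full sha-1 is exactly 40 lowercase hex digits, so it falls inside
    // the 40..=64 window and is ignored; an 8-char abbreviation is not.
    assert_eq!("485865fd0412e40d041e861506bb3ac11a3a91e3".len(), 40);
    assert!("485865fd".len() < 40);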
+    fn base64_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + std::fmt::Debug
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        let (padding, captured) = take_while1(is_base64_digit)(input.clone())?;
+        if captured.input_len() < 90 {
+            return Err(nom::Err::Error(nom::error::Error::new(
+                input,
+                nom::error::ErrorKind::LengthValue,
+            )));
+        }
+
+        const CHUNK: usize = 4;
+        let padding_offset = input.offset(&padding);
+        let mut padding_len = CHUNK - padding_offset % CHUNK;
+        if padding_len == CHUNK {
+            padding_len = 0;
+        }
+
+        let (after, _) = take_while_m_n(padding_len, padding_len, is_base64_padding)(padding)?;
+        let after_offset = input.offset(&after);
+        Ok(input.take_split(after_offset))
+    }

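The padding arithmetic follows from base64 emitting 4-character output chunks. The same calculation on the blob in the test below, assuming it is 126 base64 digits followed by `==` (which matches the test's offsets):

    // 126 % 4 == 2, so 4 - 2 == 2 `=` characters are required to close
    // the final chunk; a run already a multiple of 4 would need none.
    const CHUNK: usize = 4;
    let captured_len = 126;
    let mut padding_len = CHUNK - captured_len % CHUNK;
    if padding_len == CHUNK {
        padding_len = 0;
    }
    assert_eq!(padding_len, 2);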
+    fn email_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + std::fmt::Debug
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        recognize(tuple((
+            take_while1(is_localport_char),
+            char('@'),
+            take_while1(is_domain_char),
+        )))(input)
+    }

+    fn url_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + std::fmt::Debug
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        recognize(tuple((
+            opt(terminated(
+                take_while1(is_scheme_char),
+                // HACK: Technically you can skip `//` if you don't have a domain but that would
+                // get messy to support.
+                tuple((char(':'), char('/'), char('/'))),
+            )),
+            tuple((
+                opt(terminated(take_while1(is_localport_char), char('@'))),
+                take_while1(is_domain_char),
+                opt(preceded(char(':'), take_while1(AsChar::is_dec_digit))),
+            )),
+            char('/'),
+            // HACK: Too lazy to enumerate
+            take_while(is_localport_char),
+        )))(input)
+    }

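For orientation, the parts `url_literal` recognizes, mapped onto the two test inputs used later (scheme, `user@`, and `:port` are optional; the domain and `/` are required):

    // Optional scheme (`http://`), optional user (`user@`), required
    // domain, optional port (`:3142`), required `/`, loose path/query.
    let _max = "http://user@example.com:3142/hello?query=value&extra=two#fragment";
    // Minimal accepted form: domain + `/` + path.
    let _min = "example.com/hello";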
+    fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
+    where
+        I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
+        F: nom::Parser<I, I, E>,
+        E: nom::error::ParseError<I>,
+    {
+        move |i: I| {
+            let mut current = i.clone();
+            loop {
+                match f.parse(current.clone()) {
+                    Err(nom::Err::Error(_)) => {
+                        let offset = i.offset(&current);
+                        let (after, before) = i.take_split(offset);
+                        return Ok((after, before));
+                    }
+                    Err(e) => {
+                        return Err(e);
+                    }
+                    Ok((next, _)) => {
+                        if next == current {
+                            return Err(nom::Err::Error(E::from_error_kind(
+                                i,
+                                nom::error::ErrorKind::Many0,
+                            )));
+                        }
+
+                        current = next;
+                    }
+                }
+            }
+        }
+    }

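`take_many0` exists because `nom::multi::many0` collects every match into a `Vec`; here the matches are only being skipped, so the combinator just walks the input and returns the consumed prefix as one slice. A minimal sketch of the same idea, specialized to `&str` (hypothetical name, simplified error handling):

    use nom::IResult;

    // Repeatedly apply `f`, then hand back everything it consumed as one
    // piece, allocation-free (cf. many0, which builds a Vec of results).
    fn take_many0_str<'a>(
        mut f: impl FnMut(&'a str) -> IResult<&'a str, &'a str>,
    ) -> impl FnMut(&'a str) -> IResult<&'a str, &'a str> {
        move |i: &'a str| {
            let mut current = i;
            while let Ok((next, _)) = f(current) {
                if next.len() == current.len() {
                    break; // zero-length match: stop instead of looping forever
                }
                current = next;
            }
            let consumed = i.len() - current.len();
            Ok((&i[consumed..], &i[..consumed]))
        }
    }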
+    #[inline]
+    fn is_dec_digit_with_sep(i: impl AsChar + Copy) -> bool {
+        i.is_dec_digit() || is_digit_sep(i.as_char())
+    }
+
+    #[inline]
+    fn is_hex_digit_with_sep(i: impl AsChar + Copy) -> bool {
+        i.is_hex_digit() || is_digit_sep(i.as_char())
+    }
+
+    #[inline]
+    fn is_lower_hex_digit(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='f').contains(&c) || ('0'..='9').contains(&c)
+    }
+
+    #[inline]
+    fn is_base64_digit(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='z').contains(&c)
+            || ('A'..='Z').contains(&c)
+            || ('0'..='9').contains(&c)
+            || c == '+'
+            || c == '/'
+    }
+
+    #[inline]
+    fn is_base64_padding(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        c == '='
+    }
+
+    #[inline]
+    fn is_localport_char(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='z').contains(&c)
+            || ('A'..='Z').contains(&c)
+            || ('0'..='9').contains(&c)
+            || "!#$%&'*+-/=?^_`{|}~().".find(c).is_some()
+    }
+
+    #[inline]
+    fn is_domain_char(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='z').contains(&c)
+            || ('A'..='Z').contains(&c)
+            || ('0'..='9').contains(&c)
+            || "-().".find(c).is_some()
+    }
+
+    #[inline]
+    fn is_scheme_char(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='z').contains(&c) || ('0'..='9').contains(&c) || "+.-".find(c).is_some()
+    }
+
+    #[inline]
+    fn is_xid_continue(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        unicode_xid::UnicodeXID::is_xid_continue(c)
+    }
+
+    #[inline]
+    fn is_digit_sep(chr: char) -> bool {
+        // `_`: number literal separator in Rust and other languages
+        // `'`: number literal separator in C++
+        chr == '_' || chr == '\''
+    }
 }

 mod unicode_parser {
-    use super::parser::next_literal;
+    use super::parser::next_identifier;

-    pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
-        std::iter::from_fn(move || match next_literal(input) {
+    pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator<Item = &str> {
+        std::iter::from_fn(move || match next_identifier(input) {
             Ok((i, o)) => {
                 input = i;
                 debug_assert_ne!(o, "");

@@ -267,10 +479,10 @@ mod unicode_parser {
 }

 mod ascii_parser {
-    use super::parser::next_literal;
+    use super::parser::next_identifier;

-    pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
-        std::iter::from_fn(move || match next_literal(input) {
+    pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator<Item = &str> {
+        std::iter::from_fn(move || match next_identifier(input) {
             Ok((i, o)) => {
                 input = i;
                 debug_assert_ne!(o, b"");
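Both modules use the `std::iter::from_fn` + captured-cursor pattern to turn the pull-style parser into an iterator with no intermediate collection; a self-contained sketch of the same pattern:

    // Walk a string word by word, advancing a captured cursor each call,
    // the same shape iter_identifiers uses around next_identifier.
    fn iter_words(mut input: &str) -> impl Iterator<Item = &str> {
        std::iter::from_fn(move || {
            let trimmed = input.trim_start();
            let end = trimmed.find(' ').unwrap_or(trimmed.len());
            let (word, rest) = trimmed.split_at(end);
            input = rest;
            (!word.is_empty()).then_some(word)
        })
    }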
@@ -613,11 +825,8 @@ mod test {
 }

 #[test]
-fn tokenize_ignore_hex_enabled() {
-    let parser = TokenizerBuilder::new()
-        .ignore_hex(true)
-        .leading_digits(true)
-        .build();
+fn tokenize_ignore_hex() {
+    let parser = TokenizerBuilder::new().build();

     let input = "Hello 0xDEADBEEF World";
     let expected: Vec<Identifier> = vec![

@@ -631,17 +840,13 @@ mod test {
 }

 #[test]
-fn tokenize_ignore_hex_disabled() {
-    let parser = TokenizerBuilder::new()
-        .ignore_hex(false)
-        .leading_digits(true)
-        .build();
+fn tokenize_ignore_uuid() {
+    let parser = TokenizerBuilder::new().build();

-    let input = "Hello 0xDEADBEEF World";
+    let input = "Hello 123e4567-e89b-12d3-a456-426652340000 World";
     let expected: Vec<Identifier> = vec![
         Identifier::new_unchecked("Hello", Case::None, 0),
-        Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
-        Identifier::new_unchecked("World", Case::None, 17),
+        Identifier::new_unchecked("World", Case::None, 43),
     ];
     let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
     assert_eq!(expected, actual);
@@ -650,35 +855,88 @@ mod test {
 }

 #[test]
-fn tokenize_leading_digits_enabled() {
-    let parser = TokenizerBuilder::new()
-        .ignore_hex(false)
-        .leading_digits(true)
-        .build();
+fn tokenize_ignore_hash() {
+    let parser = TokenizerBuilder::new().build();
+
+    let input = "Hello 485865fd0412e40d041e861506bb3ac11a3a91e3 World";
+    let expected: Vec<Identifier> = vec![
+        Identifier::new_unchecked("Hello", Case::None, 0),
+        Identifier::new_unchecked("World", Case::None, 47),
+    ];
+    let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+    assert_eq!(expected, actual);
+    let actual: Vec<_> = parser.parse_str(input).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn tokenize_ignore_base64() {
+    let parser = TokenizerBuilder::new().build();
+
+    let input = "Good Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X1Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X122Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X12== Bye";
+    let expected: Vec<Identifier> = vec![
+        Identifier::new_unchecked("Good", Case::None, 0),
+        Identifier::new_unchecked("Bye", Case::None, 134),
+    ];
+    let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+    assert_eq!(expected, actual);
+    let actual: Vec<_> = parser.parse_str(input).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn tokenize_ignore_email() {
+    let parser = TokenizerBuilder::new().build();
+
+    let input = "Good example@example.com Bye";
+    let expected: Vec<Identifier> = vec![
+        Identifier::new_unchecked("Good", Case::None, 0),
+        Identifier::new_unchecked("Bye", Case::None, 25),
+    ];
+    let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+    assert_eq!(expected, actual);
+    let actual: Vec<_> = parser.parse_str(input).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn tokenize_ignore_min_url() {
+    let parser = TokenizerBuilder::new().build();
+
+    let input = "Good example.com/hello Bye";
+    let expected: Vec<Identifier> = vec![
+        Identifier::new_unchecked("Good", Case::None, 0),
+        Identifier::new_unchecked("Bye", Case::None, 23),
+    ];
+    let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+    assert_eq!(expected, actual);
+    let actual: Vec<_> = parser.parse_str(input).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn tokenize_ignore_max_url() {
+    let parser = TokenizerBuilder::new().build();
+
+    let input = "Good http://user@example.com:3142/hello?query=value&extra=two#fragment Bye";
+    let expected: Vec<Identifier> = vec![
+        Identifier::new_unchecked("Good", Case::None, 0),
+        Identifier::new_unchecked("Bye", Case::None, 71),
+    ];
+    let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+    assert_eq!(expected, actual);
+    let actual: Vec<_> = parser.parse_str(input).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn tokenize_leading_digits() {
+    let parser = TokenizerBuilder::new().build();

     let input = "Hello 0Hello 124 0xDEADBEEF World";
     let expected: Vec<Identifier> = vec![
         Identifier::new_unchecked("Hello", Case::None, 0),
         Identifier::new_unchecked("0Hello", Case::None, 6),
-        Identifier::new_unchecked("0xDEADBEEF", Case::None, 17),
         Identifier::new_unchecked("World", Case::None, 28),
     ];
     let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
     assert_eq!(expected, actual);
+    let actual: Vec<_> = parser.parse_str(input).collect();
+    assert_eq!(expected, actual);
 }
-
-#[test]
-fn tokenize_leading_digits_disabled() {
-    let parser = TokenizerBuilder::new()
-        .ignore_hex(false)
-        .leading_digits(false)
-        .build();
-
-    let input = "Hello 0Hello 124 0xDEADBEEF World";
-    let expected: Vec<Identifier> = vec![
-        Identifier::new_unchecked("Hello", Case::None, 0),
-        Identifier::new_unchecked("World", Case::None, 28),
-    ];
-    let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
@@ -8,7 +8,12 @@
 | Per-Lang Dict | Yes | ? | No | Yes |
 | CamelCase | Yes | ? | No | Yes |
 | snake_case | Yes | ? | No | Yes |
+| Ignore email | Yes | yes | No | No |
+| Ignore url | Yes | yes | No | No |
 | Ignore Hex | Yes | ? | No | Yes |
+| Ignore UUID | Yes | ? | No | No |
+| Ignore base64 | Yes | ? | No | No |
+| Ignore SHAs | Yes | ? | No | No |
 | C-Escapes | No ([#20][def-3]) | ? | No | Yes |
 | Encodings | UTF-8 / UTF-16 | ? | Auto | Auto |
 | Whole-project | Yes | Yes | Yes | No |
@@ -26,8 +26,6 @@ Configuration is read from the following (in precedence order)
 | default.check-filename | \- | bool | Verifying spelling in file names. |
 | default.check-file | \- | bool | Verifying spelling in files. |
 | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
-| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
-| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
 | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
 | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
 | default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
@@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> {
             tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults);
         let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);

+        if !tokenizer_config.ignore_hex() {
+            log::warn!("`ignore-hex` is deprecated");
+            if !tokenizer_config.identifier_leading_digits() {
+                log::warn!("`identifier-leading-digits` is deprecated");
+            }
+        }
+
         let tokenizer = typos::tokens::TokenizerBuilder::new()
             .unicode(tokenizer_config.unicode())
-            .ignore_hex(tokenizer_config.ignore_hex())
-            .leading_digits(tokenizer_config.identifier_leading_digits())
             .build();

         let dict = crate::dict::BuiltIn::new(dict_config.locale());
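Since the hardwired behavior is `ignore-hex=true` and `identifier-leading-digit=false`, the warnings above only fire for configs that explicitly set the old knobs away from those values. A sketch of the triggering condition, with plain booleans standing in for the config accessors:

    // A config matching the new built-in behavior stays silent;
    // `ignore-hex = false` in a user's typos.toml (and, nested under it,
    // a false identifier-leading-digits) trips the deprecation warnings.
    let ignore_hex = false;
    let identifier_leading_digits = false;
    if !ignore_hex {
        eprintln!("warning: `ignore-hex` is deprecated");
        if !identifier_leading_digits {
            eprintln!("warning: `identifier-leading-digits` is deprecated");
        }
    }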