diff --git a/benches/tokenize.rs b/benches/tokenize.rs
index 1426ac8..9c7399d 100644
--- a/benches/tokenize.rs
+++ b/benches/tokenize.rs
@@ -1,40 +1,93 @@
 mod data;
 
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 
-fn bench_tokenize(c: &mut Criterion) {
-    let mut group = c.benchmark_group("tokenize");
+fn bench_parse_str(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_str");
     for (name, sample) in data::DATA {
         let len = sample.len();
-        group.bench_with_input(BenchmarkId::new("ident(bytes)", name), &len, |b, _| {
-            let parser = typos::tokens::Tokenizer::new();
-            b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
-        });
-        group.bench_with_input(BenchmarkId::new("ident(str)", name), &len, |b, _| {
-            let parser = typos::tokens::Tokenizer::new();
+        group.throughput(Throughput::Bytes(len as u64));
+        group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
             b.iter(|| parser.parse_str(sample).last());
         });
+        group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new()
+                .unicode(false)
+                .build();
+            b.iter(|| parser.parse_str(sample).last());
+        });
+    }
+    group.finish();
+}
+
+fn bench_parse_bytes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_bytes");
+    for (name, sample) in data::DATA {
+        let len = sample.len();
+        group.throughput(Throughput::Bytes(len as u64));
+        group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
+            b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
+        });
+        group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new()
+                .unicode(false)
+                .build();
+            b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
+        });
+    }
+    group.finish();
+}
+
+fn bench_split(c: &mut Criterion) {
+    let mut group = c.benchmark_group("split");
+    for (name, sample) in data::DATA {
+        let len = sample.len();
+        group.throughput(Throughput::Bytes(len as u64));
         group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
             let symbol =
                 typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0);
             b.iter(|| symbol.split().last());
         });
-        group.bench_with_input(
-            BenchmarkId::new("ident(bytes)+words", name),
-            &len,
-            |b, _| {
-                let parser = typos::tokens::Tokenizer::new();
-                b.iter(|| {
-                    parser
-                        .parse_bytes(sample.as_bytes())
-                        .flat_map(|i| i.split())
-                        .last()
-                });
-            },
-        );
     }
     group.finish();
 }
 
-criterion_group!(benches, bench_tokenize);
+fn bench_parse_split(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_bytes+split");
+    for (name, sample) in data::DATA {
+        let len = sample.len();
+        group.throughput(Throughput::Bytes(len as u64));
+        group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
+            b.iter(|| {
+                parser
+                    .parse_bytes(sample.as_bytes())
+                    .flat_map(|i| i.split())
+                    .last()
+            });
+        });
+        group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new()
+                .unicode(false)
+                .build();
+            b.iter(|| {
+                parser
+                    .parse_bytes(sample.as_bytes())
+                    .flat_map(|i| i.split())
+                    .last()
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_parse_str,
+    bench_parse_bytes,
+    bench_split,
+    bench_parse_split
+);
 criterion_main!(benches);
diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index dfb1939..6bcabd1 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -1,6 +1,7 @@
 /// Define rules for tokenizaing a buffer.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TokenizerBuilder {
+    unicode: bool,
     ignore_hex: bool,
     leading_digits: bool,
 }
@@ -10,6 +11,12 @@ impl TokenizerBuilder {
         Default::default()
     }
 
+    /// Specify that unicode Identifiers are allowed.
+    pub fn unicode(&mut self, yes: bool) -> &mut Self {
+        self.unicode = yes;
+        self
+    }
+
     /// Specify that hexadecimal numbers should be ignored.
     pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
         self.ignore_hex = yes;
@@ -24,10 +31,12 @@ impl TokenizerBuilder {
 
     pub fn build(&self) -> Tokenizer {
         let TokenizerBuilder {
+            unicode,
             leading_digits,
             ignore_hex,
         } = self.clone();
         Tokenizer {
+            unicode,
             leading_digits,
             ignore_hex,
         }
@@ -37,6 +46,7 @@ impl TokenizerBuilder {
 impl Default for TokenizerBuilder {
     fn default() -> Self {
         Self {
+            unicode: true,
             leading_digits: false,
             ignore_hex: true,
         }
@@ -46,6 +56,7 @@ impl Default for TokenizerBuilder {
 /// Extract Identifiers from a buffer.
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
+    unicode: bool,
     leading_digits: bool,
     ignore_hex: bool,
 }
@@ -56,18 +67,27 @@ impl Tokenizer {
     }
 
     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
-        parser::iter_literals(content).filter_map(move |identifier| {
+        let iter = if self.unicode {
+            itertools::Either::Left(unicode_parser::iter_literals(content))
+        } else {
+            itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
+        };
+        iter.filter_map(move |identifier| {
             let offset = offset(content.as_bytes(), identifier.as_bytes());
             self.transform(identifier, offset)
         })
     }
 
     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
-        Utf8Chunks::new(content).flat_map(move |c| {
-            let chunk_offset = offset(content, c.as_bytes());
-            self.parse_str(c).map(move |i| {
-                Identifier::new_unchecked(i.token(), i.case(), i.offset() + chunk_offset)
-            })
+        let iter = if self.unicode {
+            let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
+            itertools::Either::Left(iter)
+        } else {
+            itertools::Either::Right(ascii_parser::iter_literals(content))
+        };
+        iter.filter_map(move |identifier| {
+            let offset = offset(content, identifier.as_bytes());
+            self.transform(identifier, offset)
         })
     }
 
@@ -176,7 +196,7 @@ fn is_hex_digit(chr: u8) -> bool {
     chr.is_ascii_hexdigit()
 }
 
-mod parser {
+mod unicode_parser {
     use nom::bytes::complete::*;
     use nom::sequence::*;
     use nom::IResult;
@@ -209,6 +229,49 @@ mod parser {
     }
 }
 
+mod ascii_parser {
+    use nom::bytes::complete::*;
+    use nom::sequence::*;
+    use nom::IResult;
+
+    pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
+        std::iter::from_fn(move || match next_literal(input) {
+            Ok((i, o)) => {
+                input = i;
+                debug_assert_ne!(o, b"");
+                // This is safe because we've checked that the strings are a subset of ASCII
+                // characters.
+                let o = unsafe { std::str::from_utf8_unchecked(o) };
+                Some(o)
+            }
+            _ => None,
+        })
+    }
+
+    fn next_literal(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        preceded(literal_sep, identifier)(input)
+    }
+
+    fn literal_sep(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        take_till(|c: u8| is_continue(c))(input)
+    }
+
+    fn identifier(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
+        // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
+        // or unexpected cases than strip off start characters to a word since we aren't doing a
+        // proper word boundary parse
+        take_while1(|c: u8| is_continue(c))(input)
+    }
+
+    fn is_continue(c: u8) -> bool {
+        (b'a'..=b'z').contains(&c)
+            || (b'A'..=b'Z').contains(&c)
+            || (b'0'..=b'9').contains(&c)
+            || c == b'_'
+    }
+}
+
 /// A term composed of Words.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
diff --git a/docs/reference.md b/docs/reference.md
index ccfa36a..cfede19 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -23,6 +23,7 @@ Configuration is read from the following (in precedence order)
 | default.binary | --binary | bool | Check binary files as text |
 | default.check-filename | \- | bool | Verifying spelling in file names. |
 | default.check-file | \- | bool | Verifying spelling in files. |
+| default.unicode | \- | bool | Allow unicode characters in identifiers (and not just ASCII) |
 | default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
 | default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
 | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
diff --git a/src/config.rs b/src/config.rs
index b8ca31a..4e3ef1e 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -230,6 +230,8 @@ impl EngineConfig {
 #[serde(deny_unknown_fields, default)]
 #[serde(rename_all = "kebab-case")]
 pub struct TokenizerConfig {
+    /// Allow unicode characters in identifiers (and not just ASCII)
+    pub unicode: Option<bool>,
     /// Do not check identifiers that appear to be hexadecimal values.
     pub ignore_hex: Option<bool>,
     /// Allow identifiers to start with digits, in addition to letters.
@@ -240,12 +242,16 @@ impl TokenizerConfig {
     pub fn from_defaults() -> Self {
         let empty = Self::default();
         Self {
+            unicode: Some(empty.unicode()),
             ignore_hex: Some(empty.ignore_hex()),
             identifier_leading_digits: Some(empty.identifier_leading_digits()),
         }
     }
 
     pub fn update(&mut self, source: &TokenizerConfig) {
+        if let Some(source) = source.unicode {
+            self.unicode = Some(source);
+        }
         if let Some(source) = source.ignore_hex {
             self.ignore_hex = Some(source);
         }
@@ -254,6 +260,10 @@ impl TokenizerConfig {
         }
     }
 
+    pub fn unicode(&self) -> bool {
+        self.unicode.unwrap_or(true)
+    }
+
     pub fn ignore_hex(&self) -> bool {
         self.ignore_hex.unwrap_or(true)
     }
diff --git a/src/policy.rs b/src/policy.rs
index 725a8eb..020eeb0 100644
--- a/src/policy.rs
+++ b/src/policy.rs
@@ -217,6 +217,7 @@ impl<'s> ConfigEngine<'s> {
         let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
 
         let tokenizer = typos::tokens::TokenizerBuilder::new()
+            .unicode(tokenizer_config.unicode())
             .ignore_hex(tokenizer_config.ignore_hex())
             .leading_digits(tokenizer_config.identifier_leading_digits())
            .build();
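
The core of the patch is the new `unicode` toggle on `TokenizerBuilder`, which defaults to `true` so existing behavior is unchanged. A minimal usage sketch, assuming only the API visible in this diff (the input literal is made up, not a repository fixture):

    let parser = typos::tokens::TokenizerBuilder::new()
        .unicode(false) // opt out of unicode: takes the new ascii_parser path
        .build();
    // With unicode off, parse_bytes skips the Utf8Chunks walk entirely and
    // non-ASCII bytes simply act as identifier separators.
    for ident in parser.parse_bytes(b"se_quence gr\xc3\xa9y 0xDEADBEEF") {
        println!("{} @ {}", ident.token(), ident.offset());
    }

The benchmarks are reorganized to measure exactly this trade-off: each group now calls `group.throughput(Throughput::Bytes(len as u64))`, so Criterion reports bytes/second alongside time per iteration, and the `unicode` and `ascii` variants can be compared directly as parser bandwidth.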
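
In `parse_str` and `parse_bytes`, the two branches of `if self.unicode` produce iterators of different anonymous types, so neither can be returned as-is from a function with a single `impl Iterator` return type. `itertools::Either` (a re-export of the `either` crate) bridges that: it implements `Iterator` whenever both of its variants do. A self-contained sketch of the same trick, with made-up names:

    use itertools::Either;

    fn letters(s: &str, ascii_only: bool) -> impl Iterator<Item = char> + '_ {
        // Each `filter` closure has its own type; wrapping the branches in
        // `Either::Left`/`Either::Right` gives them one concrete type.
        if ascii_only {
            Either::Left(s.chars().filter(|c| c.is_ascii_alphabetic()))
        } else {
            Either::Right(s.chars().filter(|c| c.is_alphabetic()))
        }
    }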
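
The one `unsafe` block introduced here, in `ascii_parser::iter_literals`, is sound only because `is_continue` admits nothing outside `[A-Za-z0-9_]`: every matched slice is pure ASCII, and ASCII is a strict subset of UTF-8. A defensive variant (an illustration, not what the patch ships) would keep that invariant checked in debug builds:

    fn ascii_to_str(bytes: &[u8]) -> &str {
        debug_assert!(bytes.is_ascii(), "is_continue invariant violated");
        // SAFETY: all-ASCII byte sequences are valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }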
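
Both parse paths now compute positions through the pre-existing `offset` helper rather than re-basing per UTF-8 chunk as the old `parse_bytes` did. That helper is not shown in the diff, but given how it is called it is presumably the usual pointer subtraction over a sub-slice, along these lines (a guess at unshown code, not part of the patch):

    fn offset(base: &[u8], needle: &[u8]) -> usize {
        // `needle` must be a sub-slice of `base`; the distance between the
        // start pointers is then the identifier's byte offset in the buffer.
        let start = base.as_ptr() as usize;
        let pos = needle.as_ptr() as usize;
        debug_assert!(start <= pos && pos + needle.len() <= start + base.len());
        pos - start
    }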
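
On the configuration side, `default.unicode` follows the same `Option<bool>`-plus-accessor pattern as `ignore_hex`: unset means `true`, and an explicit value wins when layered configs are merged through `update`. A sketch of that merge behavior, using only the items from `src/config.rs` above (in the TOML config file this would be `unicode = false` under the `default` table):

    let mut merged = TokenizerConfig::from_defaults(); // unicode: Some(true)
    let user = TokenizerConfig {
        unicode: Some(false),
        ..Default::default()
    };
    merged.update(&user);
    assert!(!merged.unicode()); // the explicit `false` overrides the default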