mirror of https://github.com/crate-ci/typos.git
synced 2024-11-25 10:31:02 -05:00

perf(parser): Allow people to bypass unicode cost

parent 09d2124d0f
commit 517da7ecd2

5 changed files with 158 additions and 30 deletions
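In short: `TokenizerBuilder` gains a `unicode` toggle (default `true`) so callers can skip the Unicode (XID-based) identifier parser and take a cheaper ASCII-only path. A minimal usage sketch of the new opt-out, based only on the API added in the diff below:

    fn main() {
        // Defaults preserve the old behavior; `unicode(false)` is the new bypass.
        let ascii_only = typos::tokens::TokenizerBuilder::new()
            .unicode(false)
            .build();

        // Identifier extraction is unchanged, just routed through the ASCII parser.
        for ident in ascii_only.parse_str("example_input_123") {
            println!("{} @ {}", ident.token(), ident.offset());
        }
    }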
@@ -1,40 +1,93 @@
 mod data;
 
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 
-fn bench_tokenize(c: &mut Criterion) {
-    let mut group = c.benchmark_group("tokenize");
+fn bench_parse_str(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_str");
     for (name, sample) in data::DATA {
         let len = sample.len();
-        group.bench_with_input(BenchmarkId::new("ident(bytes)", name), &len, |b, _| {
-            let parser = typos::tokens::Tokenizer::new();
-            b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
-        });
-        group.bench_with_input(BenchmarkId::new("ident(str)", name), &len, |b, _| {
-            let parser = typos::tokens::Tokenizer::new();
+        group.throughput(Throughput::Bytes(len as u64));
+        group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
             b.iter(|| parser.parse_str(sample).last());
         });
+        group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new()
+                .unicode(false)
+                .build();
+            b.iter(|| parser.parse_str(sample).last());
+        });
+    }
+    group.finish();
+}
+
+fn bench_parse_bytes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_bytes");
+    for (name, sample) in data::DATA {
+        let len = sample.len();
+        group.throughput(Throughput::Bytes(len as u64));
+        group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
+            b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
+        });
+        group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new()
+                .unicode(false)
+                .build();
+            b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
+        });
+    }
+    group.finish();
+}
+
+fn bench_split(c: &mut Criterion) {
+    let mut group = c.benchmark_group("split");
+    for (name, sample) in data::DATA {
+        let len = sample.len();
+        group.throughput(Throughput::Bytes(len as u64));
         group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
             let symbol =
                 typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0);
             b.iter(|| symbol.split().last());
         });
-        group.bench_with_input(
-            BenchmarkId::new("ident(bytes)+words", name),
-            &len,
-            |b, _| {
-                let parser = typos::tokens::Tokenizer::new();
-                b.iter(|| {
-                    parser
-                        .parse_bytes(sample.as_bytes())
-                        .flat_map(|i| i.split())
-                        .last()
-                });
-            },
-        );
     }
     group.finish();
 }
 
-criterion_group!(benches, bench_tokenize);
+fn bench_parse_split(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parse_bytes+split");
+    for (name, sample) in data::DATA {
+        let len = sample.len();
+        group.throughput(Throughput::Bytes(len as u64));
+        group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
+            b.iter(|| {
+                parser
+                    .parse_bytes(sample.as_bytes())
+                    .flat_map(|i| i.split())
+                    .last()
+            });
+        });
+        group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
+            let parser = typos::tokens::TokenizerBuilder::new()
+                .unicode(false)
+                .build();
+            b.iter(|| {
+                parser
+                    .parse_bytes(sample.as_bytes())
+                    .flat_map(|i| i.split())
+                    .last()
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_parse_str,
+    bench_parse_bytes,
+    bench_split,
+    bench_parse_split
+);
 criterion_main!(benches);
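Note: every benchmark group above now calls `group.throughput(Throughput::Bytes(len as u64))`, so Criterion reports bytes-per-second alongside wall time, letting the `unicode` and `ascii` variants be compared directly across samples of different sizes (presumably run with `cargo bench` as usual).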
@@ -1,6 +1,7 @@
 /// Define rules for tokenizaing a buffer.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TokenizerBuilder {
+    unicode: bool,
     ignore_hex: bool,
     leading_digits: bool,
 }
@@ -10,6 +11,12 @@ impl TokenizerBuilder {
         Default::default()
     }
 
+    /// Specify that unicode Identifiers are allowed.
+    pub fn unicode(&mut self, yes: bool) -> &mut Self {
+        self.unicode = yes;
+        self
+    }
+
     /// Specify that hexadecimal numbers should be ignored.
     pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
         self.ignore_hex = yes;
@@ -24,10 +31,12 @@ impl TokenizerBuilder {
 
     pub fn build(&self) -> Tokenizer {
         let TokenizerBuilder {
+            unicode,
             leading_digits,
             ignore_hex,
         } = self.clone();
         Tokenizer {
+            unicode,
             leading_digits,
             ignore_hex,
         }
@@ -37,6 +46,7 @@ impl TokenizerBuilder {
 impl Default for TokenizerBuilder {
     fn default() -> Self {
         Self {
+            unicode: true,
             leading_digits: false,
             ignore_hex: true,
         }
@@ -46,6 +56,7 @@ impl Default for TokenizerBuilder {
 /// Extract Identifiers from a buffer.
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
+    unicode: bool,
     leading_digits: bool,
     ignore_hex: bool,
 }
@@ -56,18 +67,27 @@ impl Tokenizer {
     }
 
     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
-        parser::iter_literals(content).filter_map(move |identifier| {
+        let iter = if self.unicode {
+            itertools::Either::Left(unicode_parser::iter_literals(content))
+        } else {
+            itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
+        };
+        iter.filter_map(move |identifier| {
             let offset = offset(content.as_bytes(), identifier.as_bytes());
             self.transform(identifier, offset)
         })
     }
 
     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
-        Utf8Chunks::new(content).flat_map(move |c| {
-            let chunk_offset = offset(content, c.as_bytes());
-            self.parse_str(c).map(move |i| {
-                Identifier::new_unchecked(i.token(), i.case(), i.offset() + chunk_offset)
-            })
+        let iter = if self.unicode {
+            let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
+            itertools::Either::Left(iter)
+        } else {
+            itertools::Either::Right(ascii_parser::iter_literals(content))
+        };
+        iter.filter_map(move |identifier| {
+            let offset = offset(content, identifier.as_bytes());
+            self.transform(identifier, offset)
         })
     }
 
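The two branches in each method above produce different concrete iterator types, which a single `impl Iterator` return can't unify on its own; `itertools::Either` bridges that, since an `Either` of two iterators is itself an iterator. A standalone sketch of the pattern (hypothetical function, not from this codebase):

    use itertools::Either;

    // Pick one of two iterator pipelines at runtime while still
    // returning a single concrete type to the caller.
    fn select_chars(s: &str, digits_only: bool) -> impl Iterator<Item = char> + '_ {
        if digits_only {
            Either::Left(s.chars().filter(char::is_ascii_digit))
        } else {
            Either::Right(s.chars())
        }
    }

    fn main() {
        let picked: String = select_chars("abc123", true).collect();
        assert_eq!(picked, "123");
    }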
@@ -176,7 +196,7 @@ fn is_hex_digit(chr: u8) -> bool {
     chr.is_ascii_hexdigit()
 }
 
-mod parser {
+mod unicode_parser {
     use nom::bytes::complete::*;
     use nom::sequence::*;
     use nom::IResult;
@@ -209,6 +229,49 @@ mod parser {
     }
 }
 
+mod ascii_parser {
+    use nom::bytes::complete::*;
+    use nom::sequence::*;
+    use nom::IResult;
+
+    pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
+        std::iter::from_fn(move || match next_literal(input) {
+            Ok((i, o)) => {
+                input = i;
+                debug_assert_ne!(o, b"");
+                // This is safe because we've checked that the strings are a subset of ASCII
+                // characters.
+                let o = unsafe { std::str::from_utf8_unchecked(o) };
+                Some(o)
+            }
+            _ => None,
+        })
+    }
+
+    fn next_literal(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        preceded(literal_sep, identifier)(input)
+    }
+
+    fn literal_sep(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        take_till(|c: u8| is_continue(c))(input)
+    }
+
+    fn identifier(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
+        // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
+        // or unexpected cases than strip off start characters to a word since we aren't doing a
+        // proper word boundary parse
+        take_while1(|c: u8| is_continue(c))(input)
+    }
+
+    fn is_continue(c: u8) -> bool {
+        (b'a'..=b'z').contains(&c)
+            || (b'A'..=b'Z').contains(&c)
+            || (b'0'..=b'9').contains(&c)
+            || c == b'_'
+    }
+}
+
 /// A term composed of Words.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
@@ -23,6 +23,7 @@ Configuration is read from the following (in precedence order)
 | default.binary | --binary | bool | Check binary files as text |
 | default.check-filename | \- | bool | Verifying spelling in file names. |
 | default.check-file | \- | bool | Verifying spelling in files. |
+| default.unicode | \- | bool | Allow unicode characters in identifiers (and not just ASCII) |
 | default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
 | default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
 | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
@@ -230,6 +230,8 @@ impl EngineConfig {
 #[serde(deny_unknown_fields, default)]
 #[serde(rename_all = "kebab-case")]
 pub struct TokenizerConfig {
+    /// Allow unicode characters in identifiers (and not just ASCII)
+    pub unicode: Option<bool>,
     /// Do not check identifiers that appear to be hexadecimal values.
     pub ignore_hex: Option<bool>,
     /// Allow identifiers to start with digits, in addition to letters.
@@ -240,12 +242,16 @@ impl TokenizerConfig {
     pub fn from_defaults() -> Self {
         let empty = Self::default();
         Self {
+            unicode: Some(empty.unicode()),
             ignore_hex: Some(empty.ignore_hex()),
             identifier_leading_digits: Some(empty.identifier_leading_digits()),
         }
     }
 
     pub fn update(&mut self, source: &TokenizerConfig) {
+        if let Some(source) = source.unicode {
+            self.unicode = Some(source);
+        }
         if let Some(source) = source.ignore_hex {
             self.ignore_hex = Some(source);
         }
@@ -254,6 +260,10 @@ impl TokenizerConfig {
         }
     }
 
+    pub fn unicode(&self) -> bool {
+        self.unicode.unwrap_or(true)
+    }
+
     pub fn ignore_hex(&self) -> bool {
         self.ignore_hex.unwrap_or(true)
     }
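The config plumbing follows the crate's existing Option-layering convention: `None` means "unset", `update` lets a later config source override an earlier one, and the getter supplies the backward-compatible default. A trimmed-down sketch of just the new field's behavior (struct reduced to one field for illustration):

    #[derive(Default)]
    struct TokenizerConfig {
        unicode: Option<bool>,
    }

    impl TokenizerConfig {
        fn update(&mut self, source: &TokenizerConfig) {
            if let Some(source) = source.unicode {
                self.unicode = Some(source);
            }
        }

        fn unicode(&self) -> bool {
            // Unicode support stays on unless some config layer opts out.
            self.unicode.unwrap_or(true)
        }
    }

    fn main() {
        let mut cfg = TokenizerConfig::default();
        assert!(cfg.unicode()); // default: true
        cfg.update(&TokenizerConfig { unicode: Some(false) });
        assert!(!cfg.unicode()); // overridden by a later layer
    }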
@@ -217,6 +217,7 @@ impl<'s> ConfigEngine<'s> {
         let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
 
         let tokenizer = typos::tokens::TokenizerBuilder::new()
+            .unicode(tokenizer_config.unicode())
             .ignore_hex(tokenizer_config.ignore_hex())
             .leading_digits(tokenizer_config.identifier_leading_digits())
             .build();