perf(parser): Auto-detect unicode

For smaller, ASCII-only content, this seems to take ~30% less time
for parsing.
This commit is contained in:
Ed Page 2021-06-29 04:25:52 -05:00
parent 21231bfc4d
commit ded90f2387
3 changed files with 6 additions and 2 deletions

1
Cargo.lock generated
View file

@ -1479,6 +1479,7 @@ name = "typos"
version = "0.6.0"
dependencies = [
"anyhow",
"bstr",
"itertools 0.10.0",
"log",
"nom",

View file

@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
itertools = "0.10"
log = "0.4"
unicode-segmentation = "1.7.1"
bstr = "0.2"

View file

@ -1,3 +1,5 @@
use bstr::ByteSlice;
/// Define rules for tokenizing a buffer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder {
@ -67,7 +69,7 @@ impl Tokenizer {
}
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode {
let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
itertools::Either::Left(unicode_parser::iter_literals(content))
} else {
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
@ -79,7 +81,7 @@ impl Tokenizer {
}
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode {
let iter = if self.unicode && !ByteSlice::is_ascii(content) {
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
itertools::Either::Left(iter)
} else {