perf(parser): Auto-detect unicode

For smaller, ASCII-only content, parsing seems to take ~30% less time,
because such content is now routed to the ASCII parser even when Unicode support is enabled.
This commit is contained in:
Ed Page 2021-06-29 04:25:52 -05:00
parent 21231bfc4d
commit ded90f2387
3 changed files with 6 additions and 2 deletions

1
Cargo.lock generated
View file

@ -1479,6 +1479,7 @@ name = "typos"
version = "0.6.0" version = "0.6.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bstr",
"itertools 0.10.0", "itertools 0.10.0",
"log", "log",
"nom", "nom",

View file

@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
itertools = "0.10" itertools = "0.10"
log = "0.4" log = "0.4"
unicode-segmentation = "1.7.1" unicode-segmentation = "1.7.1"
bstr = "0.2"

View file

@ -1,3 +1,5 @@
use bstr::ByteSlice;
/// Define rules for tokenizing a buffer. /// Define rules for tokenizing a buffer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder { pub struct TokenizerBuilder {
@ -67,7 +69,7 @@ impl Tokenizer {
} }
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode { let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
itertools::Either::Left(unicode_parser::iter_literals(content)) itertools::Either::Left(unicode_parser::iter_literals(content))
} else { } else {
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
@ -79,7 +81,7 @@ impl Tokenizer {
} }
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
let iter = if self.unicode { let iter = if self.unicode && !ByteSlice::is_ascii(content) {
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
itertools::Either::Left(iter) itertools::Either::Left(iter)
} else { } else {