From ded90f23874a93330516226e8498d3e6909c9342 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 04:25:52 -0500 Subject: [PATCH] perf(parser): Auto-detect unicode For smaller, ascii-only content, this seems to be taking ~30% less time for parsing. --- Cargo.lock | 1 + crates/typos/Cargo.toml | 1 + crates/typos/src/tokens.rs | 6 ++++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25af32a..2118300 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1479,6 +1479,7 @@ name = "typos" version = "0.6.0" dependencies = [ "anyhow", + "bstr", "itertools 0.10.0", "log", "nom", diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml index 35bc687..7f7343f 100644 --- a/crates/typos/Cargo.toml +++ b/crates/typos/Cargo.toml @@ -25,3 +25,4 @@ simdutf8 = "0.1.1" itertools = "0.10" log = "0.4" unicode-segmentation = "1.7.1" +bstr = "0.2" diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index a31c5d4..0d8f7a2 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -1,3 +1,5 @@ +use bstr::ByteSlice; + /// Define rules for tokenizaing a buffer. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TokenizerBuilder { @@ -67,7 +69,7 @@ impl Tokenizer { } pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { - let iter = if self.unicode { + let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) { itertools::Either::Left(unicode_parser::iter_literals(content)) } else { itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) @@ -79,7 +81,7 @@ impl Tokenizer { } pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { - let iter = if self.unicode { + let iter = if self.unicode && !ByteSlice::is_ascii(content) { let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); itertools::Either::Left(iter) } else {