perf(parser): Auto-detect unicode

For smaller, ASCII-only content, parsing seems to take ~30% less time.
2025-01-26 16:39:07 -05:00 · 2021-06-29 04:25:52 -05:00 · 2021-06-29 04:25:52 -05:00 · ded90f2387
commit ded90f2387
parent 21231bfc4d
3 changed files with 6 additions and 2 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1479,6 +1479,7 @@ name = "typos"
 version = "0.6.0"
 dependencies = [
 "anyhow",
+ "bstr",
 "itertools 0.10.0",
 "log",
 "nom",
--- a/crates/typos/Cargo.toml
+++ b/crates/typos/Cargo.toml
@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
 itertools = "0.10"
 log = "0.4"
 unicode-segmentation = "1.7.1"
+bstr = "0.2"
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@ -1,3 +1,5 @@
+use bstr::ByteSlice;
+
 /// Define rules for tokenizing a buffer.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TokenizerBuilder {
@ -67,7 +69,7 @@ impl Tokenizer {
    }

    pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
-        let iter = if self.unicode {
+        let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
            itertools::Either::Left(unicode_parser::iter_literals(content))
        } else {
            itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
@ -79,7 +81,7 @@ impl Tokenizer {
    }

    pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
-        let iter = if self.unicode {
+        let iter = if self.unicode && !ByteSlice::is_ascii(content) {
            let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
            itertools::Either::Left(iter)
        } else {