From ded90f23874a93330516226e8498d3e6909c9342 Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Tue, 29 Jun 2021 04:25:52 -0500
Subject: [PATCH] perf(parser): Auto-detect unicode

For smaller, ASCII-only content, falling back to the ASCII parser even when
unicode is enabled seems to take ~30% less time for parsing.
---
 Cargo.lock                 | 1 +
 crates/typos/Cargo.toml    | 1 +
 crates/typos/src/tokens.rs | 6 ++++--
 3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 25af32a..2118300 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1479,6 +1479,7 @@ name = "typos"
 version = "0.6.0"
 dependencies = [
  "anyhow",
+ "bstr",
  "itertools 0.10.0",
  "log",
  "nom",
diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml
index 35bc687..7f7343f 100644
--- a/crates/typos/Cargo.toml
+++ b/crates/typos/Cargo.toml
@@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
 itertools = "0.10"
 log = "0.4"
 unicode-segmentation = "1.7.1"
+bstr = "0.2"
diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index a31c5d4..0d8f7a2 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -1,3 +1,5 @@
+use bstr::ByteSlice;
+
 /// Define rules for tokenizaing a buffer.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct TokenizerBuilder {
@@ -67,7 +69,7 @@ impl Tokenizer {
     }
 
     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
-        let iter = if self.unicode {
+        let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
             itertools::Either::Left(unicode_parser::iter_literals(content))
         } else {
             itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
@@ -79,7 +81,7 @@ impl Tokenizer {
     }
 
     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
-        let iter = if self.unicode {
+        let iter = if self.unicode && !ByteSlice::is_ascii(content) {
             let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
             itertools::Either::Left(iter)
         } else {