From 6b92e345cc893eba5dd2561093d367ff4b7a88a3 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Mon, 26 Apr 2021 18:08:52 -0500 Subject: [PATCH] perf(parser): Speed up UTF-8 validation --- Cargo.lock | 7 +++++++ crates/typos/Cargo.toml | 1 + crates/typos/src/tokens.rs | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 4a91983..e088a4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1318,6 +1318,12 @@ dependencies = [ "serde", ] +[[package]] +name = "simdutf8" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4f3d445e9015cf5e72cec4a3b3a84f8d54f34207afee609fd152de1c0212b1" + [[package]] name = "siphasher" version = "0.3.3" @@ -1496,6 +1502,7 @@ dependencies = [ "once_cell", "regex", "serde", + "simdutf8", "thiserror", "unicode-segmentation", ] diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml index 0611d78..ae8a385 100644 --- a/crates/typos/Cargo.toml +++ b/crates/typos/Cargo.toml @@ -20,6 +20,7 @@ thiserror = "1.0" regex = "1.3" once_cell = "1.2.0" serde = { version = "1.0", features = ["derive"] } +simdutf8 = "0.1.1" itertools = "0.10" log = "0.4" unicode-segmentation = "1.7.1" diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 0286e68..20488ef 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -158,7 +158,7 @@ impl<'s> Iterator for Utf8Chunks<'s> { if self.source.is_empty() { return None; } - match std::str::from_utf8(self.source) { + match simdutf8::compat::from_utf8(self.source) { Ok(valid) => { self.source = b""; return Some(valid);