perf(parser): Speed up UTF-8 validation

This commit is contained in:
Ed Page 2021-04-26 18:08:52 -05:00
parent 819702c82f
commit 6b92e345cc
3 changed files with 9 additions and 1 deletion

7
Cargo.lock generated
View file

@ -1318,6 +1318,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "simdutf8"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f4f3d445e9015cf5e72cec4a3b3a84f8d54f34207afee609fd152de1c0212b1"
[[package]] [[package]]
name = "siphasher" name = "siphasher"
version = "0.3.3" version = "0.3.3"
@ -1496,6 +1502,7 @@ dependencies = [
"once_cell", "once_cell",
"regex", "regex",
"serde", "serde",
"simdutf8",
"thiserror", "thiserror",
"unicode-segmentation", "unicode-segmentation",
] ]

View file

@ -20,6 +20,7 @@ thiserror = "1.0"
regex = "1.3" regex = "1.3"
once_cell = "1.2.0" once_cell = "1.2.0"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
simdutf8 = "0.1.1"
itertools = "0.10" itertools = "0.10"
log = "0.4" log = "0.4"
unicode-segmentation = "1.7.1" unicode-segmentation = "1.7.1"

View file

@ -158,7 +158,7 @@ impl<'s> Iterator for Utf8Chunks<'s> {
if self.source.is_empty() { if self.source.is_empty() {
return None; return None;
} }
match std::str::from_utf8(self.source) { match simdutf8::compat::from_utf8(self.source) {
Ok(valid) => { Ok(valid) => {
self.source = b""; self.source = b"";
return Some(valid); return Some(valid);