mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-24 18:10:56 -05:00
perf(parser): Auto-detect unicode
For smaller, ASCII-only content, this seems to take ~30% less time to parse.
This commit is contained in:
parent
21231bfc4d
commit
ded90f2387
3 changed files with 6 additions and 2 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1479,6 +1479,7 @@ name = "typos"
|
|||
version = "0.6.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bstr",
|
||||
"itertools 0.10.0",
|
||||
"log",
|
||||
"nom",
|
||||
|
|
|
@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
|
|||
itertools = "0.10"
|
||||
log = "0.4"
|
||||
unicode-segmentation = "1.7.1"
|
||||
bstr = "0.2"
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
use bstr::ByteSlice;
|
||||
|
||||
/// Define rules for tokenizing a buffer.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct TokenizerBuilder {
|
||||
|
@ -67,7 +69,7 @@ impl Tokenizer {
|
|||
}
|
||||
|
||||
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
||||
let iter = if self.unicode {
|
||||
let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
|
||||
itertools::Either::Left(unicode_parser::iter_literals(content))
|
||||
} else {
|
||||
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
|
||||
|
@ -79,7 +81,7 @@ impl Tokenizer {
|
|||
}
|
||||
|
||||
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
||||
let iter = if self.unicode {
|
||||
let iter = if self.unicode && !ByteSlice::is_ascii(content) {
|
||||
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
|
||||
itertools::Either::Left(iter)
|
||||
} else {
|
||||
|
|
Loading…
Reference in a new issue