mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-28 20:11:05 -05:00
perf(parser): Auto-detect unicode
For smaller, ASCII-only content, this seems to take ~30% less time to parse.
This commit is contained in:
parent
21231bfc4d
commit
ded90f2387
3 changed files with 6 additions and 2 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1479,6 +1479,7 @@ name = "typos"
|
||||||
version = "0.6.0"
|
version = "0.6.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"bstr",
|
||||||
"itertools 0.10.0",
|
"itertools 0.10.0",
|
||||||
"log",
|
"log",
|
||||||
"nom",
|
"nom",
|
||||||
|
|
|
@ -25,3 +25,4 @@ simdutf8 = "0.1.1"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
unicode-segmentation = "1.7.1"
|
unicode-segmentation = "1.7.1"
|
||||||
|
bstr = "0.2"
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
use bstr::ByteSlice;
|
||||||
|
|
||||||
/// Define rules for tokenizing a buffer.
|
/// Define rules for tokenizing a buffer.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct TokenizerBuilder {
|
pub struct TokenizerBuilder {
|
||||||
|
@ -67,7 +69,7 @@ impl Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
let iter = if self.unicode {
|
let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) {
|
||||||
itertools::Either::Left(unicode_parser::iter_literals(content))
|
itertools::Either::Left(unicode_parser::iter_literals(content))
|
||||||
} else {
|
} else {
|
||||||
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
|
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
|
||||||
|
@ -79,7 +81,7 @@ impl Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
let iter = if self.unicode {
|
let iter = if self.unicode && !ByteSlice::is_ascii(content) {
|
||||||
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
|
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
|
||||||
itertools::Either::Left(iter)
|
itertools::Either::Left(iter)
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in a new issue