mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-25 02:20:58 -05:00
perf: Speed up identifier splitting
Before
```
test process_code ... bench: 25,627 ns/iter (+/- 2,062)
test process_corpus ... bench: 20,192,253 ns/iter (+/- 603,029)
test process_empty ... bench: 7,418 ns/iter (+/- 707)
test process_no_tokens ... bench: 8,788 ns/iter (+/- 1,065)
test process_sherlock ... bench: 30,420 ns/iter (+/- 2,699)
test process_single_token ... bench: 9,426 ns/iter (+/- 811)
test symbol_split_lowercase_long ... bench: 2,763 ns/iter (+/- 246)
test symbol_split_lowercase_short ... bench: 110 ns/iter (+/- 12)
test symbol_split_mixed_long ... bench: 7,373 ns/iter (+/- 1,111)
test symbol_split_mixed_short ... bench: 357 ns/iter (+/- 86)
```

After
```
test process_code ... bench: 20,973 ns/iter (+/- 1,717)
test process_corpus ... bench: 15,826,059 ns/iter (+/- 1,016,628)
test process_empty ... bench: 7,364 ns/iter (+/- 616)
test process_no_tokens ... bench: 8,858 ns/iter (+/- 632)
test process_sherlock ... bench: 24,707 ns/iter (+/- 3,482)
test process_single_token ... bench: 9,339 ns/iter (+/- 706)
test symbol_split_lowercase_long ... bench: 2,727 ns/iter (+/- 151)
test symbol_split_lowercase_short ... bench: 46 ns/iter (+/- 2)
test symbol_split_mixed_long ... bench: 5,753 ns/iter (+/- 441)
test symbol_split_mixed_short ... bench: 76 ns/iter (+/- 3)
```

Fixes #33
This commit is contained in:
parent
2ae1a0bca6
commit
979b42ed6f
1 changed file with 89 additions and 61 deletions
|
@ -238,30 +238,50 @@ impl WordMode {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
|
struct SplitIdent<'s> {
|
||||||
let mut result = vec![];
|
ident: &'s str,
|
||||||
|
offset: usize,
|
||||||
|
|
||||||
let mut char_indices = ident.char_indices().peekable();
|
char_indices: std::iter::Peekable<std::str::CharIndices<'s>>,
|
||||||
let mut start = 0;
|
start: usize,
|
||||||
let mut start_mode = WordMode::Boundary;
|
start_mode: WordMode,
|
||||||
let mut last_mode = WordMode::Boundary;
|
last_mode: WordMode,
|
||||||
while let Some((i, c)) = char_indices.next() {
|
}
|
||||||
|
|
||||||
|
impl<'s> SplitIdent<'s> {
|
||||||
|
fn new(ident: &'s str, offset: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
ident,
|
||||||
|
offset,
|
||||||
|
char_indices: ident.char_indices().peekable(),
|
||||||
|
start: 0,
|
||||||
|
start_mode: WordMode::Boundary,
|
||||||
|
last_mode: WordMode::Boundary,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'s> Iterator for SplitIdent<'s> {
|
||||||
|
type Item = Word<'s>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Word<'s>> {
|
||||||
|
while let Some((i, c)) = self.char_indices.next() {
|
||||||
let cur_mode = WordMode::classify(c);
|
let cur_mode = WordMode::classify(c);
|
||||||
if cur_mode == WordMode::Boundary {
|
if cur_mode == WordMode::Boundary {
|
||||||
assert!(start_mode == WordMode::Boundary);
|
assert!(self.start_mode == WordMode::Boundary);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if start_mode == WordMode::Boundary {
|
if self.start_mode == WordMode::Boundary {
|
||||||
start_mode = cur_mode;
|
self.start_mode = cur_mode;
|
||||||
start = i;
|
self.start = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(&(next_i, next)) = char_indices.peek() {
|
if let Some(&(next_i, next)) = self.char_indices.peek() {
|
||||||
// The mode including the current character, assuming the current character does
|
// The mode including the current character, assuming the current character does
|
||||||
// not result in a word boundary.
|
// not result in a word boundary.
|
||||||
let next_mode = WordMode::classify(next);
|
let next_mode = WordMode::classify(next);
|
||||||
|
|
||||||
match (last_mode, cur_mode, next_mode) {
|
match (self.last_mode, cur_mode, next_mode) {
|
||||||
// cur_mode is last of current word
|
// cur_mode is last of current word
|
||||||
(_, _, WordMode::Boundary)
|
(_, _, WordMode::Boundary)
|
||||||
| (_, WordMode::Lowercase, WordMode::Number)
|
| (_, WordMode::Lowercase, WordMode::Number)
|
||||||
|
@ -269,41 +289,49 @@ fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
|
||||||
| (_, WordMode::Number, WordMode::Lowercase)
|
| (_, WordMode::Number, WordMode::Lowercase)
|
||||||
| (_, WordMode::Number, WordMode::Uppercase)
|
| (_, WordMode::Number, WordMode::Uppercase)
|
||||||
| (_, WordMode::Lowercase, WordMode::Uppercase) => {
|
| (_, WordMode::Lowercase, WordMode::Uppercase) => {
|
||||||
let case = start_mode.case(cur_mode);
|
let case = self.start_mode.case(cur_mode);
|
||||||
result.push(Word::new_unchecked(
|
let result = Word::new_unchecked(
|
||||||
&ident[start..next_i],
|
&self.ident[self.start..next_i],
|
||||||
case,
|
case,
|
||||||
start + offset,
|
self.start + self.offset,
|
||||||
));
|
);
|
||||||
start = next_i;
|
self.start = next_i;
|
||||||
start_mode = WordMode::Boundary;
|
self.start_mode = WordMode::Boundary;
|
||||||
last_mode = WordMode::Boundary;
|
self.last_mode = WordMode::Boundary;
|
||||||
|
return Some(result);
|
||||||
}
|
}
|
||||||
// cur_mode is start of next word
|
// cur_mode is start of next word
|
||||||
(WordMode::Uppercase, WordMode::Uppercase, WordMode::Lowercase) => {
|
(WordMode::Uppercase, WordMode::Uppercase, WordMode::Lowercase) => {
|
||||||
result.push(Word::new_unchecked(
|
let result = Word::new_unchecked(
|
||||||
&ident[start..i],
|
&self.ident[self.start..i],
|
||||||
Case::Scream,
|
Case::Scream,
|
||||||
start + offset,
|
self.start + self.offset,
|
||||||
));
|
);
|
||||||
start = i;
|
self.start = i;
|
||||||
start_mode = cur_mode;
|
self.start_mode = cur_mode;
|
||||||
last_mode = WordMode::Boundary;
|
self.last_mode = WordMode::Boundary;
|
||||||
|
return Some(result);
|
||||||
}
|
}
|
||||||
// No word boundary
|
// No word boundary
|
||||||
(_, _, _) => {
|
(_, _, _) => {
|
||||||
last_mode = cur_mode;
|
self.last_mode = cur_mode;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Collect trailing characters as a word
|
// Collect trailing characters as a word
|
||||||
let case = start_mode.case(cur_mode);
|
let case = self.start_mode.case(cur_mode);
|
||||||
result.push(Word::new_unchecked(&ident[start..], case, start + offset));
|
let result =
|
||||||
break;
|
Word::new_unchecked(&self.ident[self.start..], case, self.start + self.offset);
|
||||||
|
return Some(result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
result.into_iter()
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
|
||||||
|
SplitIdent::new(ident, offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
Loading…
Reference in a new issue