perf: Speed up identifier splitting

Before
```
test process_code         ... bench:      25,627 ns/iter (+/- 2,062)
test process_corpus       ... bench:  20,192,253 ns/iter (+/- 603,029)
test process_empty        ... bench:       7,418 ns/iter (+/- 707)
test process_no_tokens    ... bench:       8,788 ns/iter (+/- 1,065)
test process_sherlock     ... bench:      30,420 ns/iter (+/- 2,699)
test process_single_token ... bench:       9,426 ns/iter (+/- 811)
test symbol_split_lowercase_long  ... bench:       2,763 ns/iter (+/- 246)
test symbol_split_lowercase_short ... bench:         110 ns/iter (+/- 12)
test symbol_split_mixed_long      ... bench:       7,373 ns/iter (+/- 1,111)
test symbol_split_mixed_short     ... bench:         357 ns/iter (+/- 86)
```

After
```
test process_code         ... bench:      20,973 ns/iter (+/- 1,717)
test process_corpus       ... bench:  15,826,059 ns/iter (+/- 1,016,628)
test process_empty        ... bench:       7,364 ns/iter (+/- 616)
test process_no_tokens    ... bench:       8,858 ns/iter (+/- 632)
test process_sherlock     ... bench:      24,707 ns/iter (+/- 3,482)
test process_single_token ... bench:       9,339 ns/iter (+/- 706)
test symbol_split_lowercase_long  ... bench:       2,727 ns/iter (+/- 151)
test symbol_split_lowercase_short ... bench:          46 ns/iter (+/- 2)
test symbol_split_mixed_long      ... bench:       5,753 ns/iter (+/- 441)
test symbol_split_mixed_short     ... bench:          76 ns/iter (+/- 3)
```

Fixes #33
This commit is contained in:
Ed Page 2019-10-25 14:36:08 -06:00
parent 2ae1a0bca6
commit 979b42ed6f

View file

@@ -238,30 +238,50 @@ impl WordMode {
} }
} }
fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> { struct SplitIdent<'s> {
let mut result = vec![]; ident: &'s str,
offset: usize,
let mut char_indices = ident.char_indices().peekable(); char_indices: std::iter::Peekable<std::str::CharIndices<'s>>,
let mut start = 0; start: usize,
let mut start_mode = WordMode::Boundary; start_mode: WordMode,
let mut last_mode = WordMode::Boundary; last_mode: WordMode,
while let Some((i, c)) = char_indices.next() { }
impl<'s> SplitIdent<'s> {
fn new(ident: &'s str, offset: usize) -> Self {
Self {
ident,
offset,
char_indices: ident.char_indices().peekable(),
start: 0,
start_mode: WordMode::Boundary,
last_mode: WordMode::Boundary,
}
}
}
impl<'s> Iterator for SplitIdent<'s> {
type Item = Word<'s>;
fn next(&mut self) -> Option<Word<'s>> {
while let Some((i, c)) = self.char_indices.next() {
let cur_mode = WordMode::classify(c); let cur_mode = WordMode::classify(c);
if cur_mode == WordMode::Boundary { if cur_mode == WordMode::Boundary {
assert!(start_mode == WordMode::Boundary); assert!(self.start_mode == WordMode::Boundary);
continue; continue;
} }
if start_mode == WordMode::Boundary { if self.start_mode == WordMode::Boundary {
start_mode = cur_mode; self.start_mode = cur_mode;
start = i; self.start = i;
} }
if let Some(&(next_i, next)) = char_indices.peek() { if let Some(&(next_i, next)) = self.char_indices.peek() {
// The mode including the current character, assuming the current character does // The mode including the current character, assuming the current character does
// not result in a word boundary. // not result in a word boundary.
let next_mode = WordMode::classify(next); let next_mode = WordMode::classify(next);
match (last_mode, cur_mode, next_mode) { match (self.last_mode, cur_mode, next_mode) {
// cur_mode is last of current word // cur_mode is last of current word
(_, _, WordMode::Boundary) (_, _, WordMode::Boundary)
| (_, WordMode::Lowercase, WordMode::Number) | (_, WordMode::Lowercase, WordMode::Number)
@@ -269,41 +289,49 @@ fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
| (_, WordMode::Number, WordMode::Lowercase) | (_, WordMode::Number, WordMode::Lowercase)
| (_, WordMode::Number, WordMode::Uppercase) | (_, WordMode::Number, WordMode::Uppercase)
| (_, WordMode::Lowercase, WordMode::Uppercase) => { | (_, WordMode::Lowercase, WordMode::Uppercase) => {
let case = start_mode.case(cur_mode); let case = self.start_mode.case(cur_mode);
result.push(Word::new_unchecked( let result = Word::new_unchecked(
&ident[start..next_i], &self.ident[self.start..next_i],
case, case,
start + offset, self.start + self.offset,
)); );
start = next_i; self.start = next_i;
start_mode = WordMode::Boundary; self.start_mode = WordMode::Boundary;
last_mode = WordMode::Boundary; self.last_mode = WordMode::Boundary;
return Some(result);
} }
// cur_mode is start of next word // cur_mode is start of next word
(WordMode::Uppercase, WordMode::Uppercase, WordMode::Lowercase) => { (WordMode::Uppercase, WordMode::Uppercase, WordMode::Lowercase) => {
result.push(Word::new_unchecked( let result = Word::new_unchecked(
&ident[start..i], &self.ident[self.start..i],
Case::Scream, Case::Scream,
start + offset, self.start + self.offset,
)); );
start = i; self.start = i;
start_mode = cur_mode; self.start_mode = cur_mode;
last_mode = WordMode::Boundary; self.last_mode = WordMode::Boundary;
return Some(result);
} }
// No word boundary // No word boundary
(_, _, _) => { (_, _, _) => {
last_mode = cur_mode; self.last_mode = cur_mode;
} }
} }
} else { } else {
// Collect trailing characters as a word // Collect trailing characters as a word
let case = start_mode.case(cur_mode); let case = self.start_mode.case(cur_mode);
result.push(Word::new_unchecked(&ident[start..], case, start + offset)); let result =
break; Word::new_unchecked(&self.ident[self.start..], case, self.start + self.offset);
return Some(result);
} }
} }
result.into_iter() None
}
}
fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
SplitIdent::new(ident, offset)
} }
#[cfg(test)] #[cfg(test)]