perf(dictgen): Move unicode check up in trie

This commit is contained in:
Ed Page 2024-12-30 11:27:34 -06:00
parent 083871447d
commit 016ae0c7c6

View file

@@ -32,48 +32,50 @@ pub struct DictTrie<V: 'static> {
} }
impl<V> DictTrie<V> { impl<V> DictTrie<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) { if word.is_ascii() {
let bytes = word.as_bytes(); if self.range.contains(&word.len()) {
self.find_ascii(word.as_bytes())
} else {
None
}
} else {
self.unicode.find(word)
}
}
let mut child = &self.root; fn find_ascii(&self, word: &[u8]) -> Option<&'static V> {
for i in 0..bytes.len() { let mut child = &self.root;
match child.children { for i in 0..word.len() {
DictTrieChild::Nested(n) => { match child.children {
let byte = bytes[i]; DictTrieChild::Nested(n) => {
let index = if byte.is_ascii_lowercase() { let byte = word[i];
byte - b'a' let index = if byte.is_ascii_lowercase() {
} else if byte.is_ascii_uppercase() { byte - b'a'
byte - b'A' } else if byte.is_ascii_uppercase() {
} else { byte - b'A'
return self.unicode.find(word); } else {
}; return None;
debug_assert!(index < 26); };
if let Some(next) = n[index as usize].as_ref() { debug_assert!(index < 26);
child = next; if let Some(next) = n[index as usize].as_ref() {
} else { child = next;
return None; } else {
} return None;
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
} }
} }
DictTrieChild::Flat(t) => {
let remaining = &word[i..word.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
return t.find(&remaining);
}
} }
child.value.as_ref()
} else {
None
} }
child.value.as_ref()
} }
} }