perf(dictgen): Remove ascii/unicode branch from trie

This commit is contained in:
Ed Page 2024-12-30 16:13:35 -06:00
parent e7ff9cfc01
commit 534e3c5f71
7 changed files with 160323 additions and 159956 deletions

View file

@ -40,7 +40,11 @@ impl<'g> DictGen<'g> {
} }
pub fn ordered_map(self) -> crate::OrderedMapGen<'g> { pub fn ordered_map(self) -> crate::OrderedMapGen<'g> {
crate::OrderedMapGen { gen: self } crate::OrderedMapGen {
gen: self,
unicode: true,
unicase: true,
}
} }
pub fn trie(self) -> crate::TrieGen<'g> { pub fn trie(self) -> crate::TrieGen<'g> {

View file

@ -113,3 +113,97 @@ impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a
self self
} }
} }
/// `unicase::Ascii` look-alike that avoids const-fn so large tables don't OOM
///
/// Thin `Copy` wrapper around a borrowed `&str`. Use `convert` to obtain the
/// equivalent `unicase::Ascii<&str>` when a comparison or hash is needed.
#[derive(Copy, Clone)]
pub struct InsensitiveAscii<'s>(pub &'s str);
impl<'s> InsensitiveAscii<'s> {
pub fn convert(self) -> unicase::Ascii<&'s str> {
unicase::Ascii::new(self.0)
}
pub fn into_inner(self) -> &'s str {
self.0
}
pub fn is_empty(self) -> bool {
self.0.is_empty()
}
pub fn len(self) -> usize {
self.0.len()
}
}
impl<'s> From<unicase::Ascii<&'s str>> for InsensitiveAscii<'s> {
fn from(other: unicase::Ascii<&'s str>) -> Self {
Self(other.into_inner())
}
}
impl<'s2> PartialEq<InsensitiveAscii<'s2>> for InsensitiveAscii<'_> {
#[inline]
fn eq(&self, other: &InsensitiveAscii<'s2>) -> bool {
self.convert() == other.convert()
}
}
// Marker impl: the `PartialEq` impl for this type delegates to `unicase::Ascii`.
impl Eq for InsensitiveAscii<'_> {}
impl PartialOrd for InsensitiveAscii<'_> {
    /// Always `Some`: the result is taken directly from the total order in `Ord`.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
impl Ord for InsensitiveAscii<'_> {
    /// Orders the two values the way `unicase::Ascii` orders their wrapped strings.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let (lhs, rhs) = (self.convert(), other.convert());
        Ord::cmp(&lhs, &rhs)
    }
}
impl core::hash::Hash for InsensitiveAscii<'_> {
    /// Feeds the hasher exactly what `unicase::Ascii` would for the wrapped string.
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        let ascii = self.convert();
        core::hash::Hash::hash(&ascii, hasher);
    }
}
impl core::fmt::Debug for InsensitiveAscii<'_> {
    /// Formats as the wrapped `str` would under `Debug`; the wrapper itself is not shown.
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        <str as core::fmt::Debug>::fmt(self.0, fmt)
    }
}
impl core::fmt::Display for InsensitiveAscii<'_> {
    /// Formats as the wrapped `str` would under `Display`.
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        <str as core::fmt::Display>::fmt(self.0, fmt)
    }
}
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveAscii<'_> {
    /// Reuses this type's own `Hash` impl for the `phf` hash.
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        <Self as core::hash::Hash>::hash(self, state);
    }
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveAscii<'_> {
    /// Writes `dictgen::InsensitiveAscii(`, then the wrapped string as formatted
    /// by its own `fmt_const`, then `)`.
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str("dictgen::InsensitiveAscii(")?;
        phf_shared::FmtConst::fmt_const(self.0, f)?;
        f.write_str(")")
    }
}
// Identity borrow: an `InsensitiveAscii<'a>` can be borrowed as an
// `InsensitiveAscii<'b>` for any lifetime `'b` that `'a` outlives.
#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveAscii<'b>> for InsensitiveAscii<'a> {
// Returns `self` unchanged; only the lifetime parameter in the type differs.
fn borrow(&self) -> &InsensitiveAscii<'b> {
self
}
}

View file

@ -1,10 +1,22 @@
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub struct OrderedMapGen<'g> { pub struct OrderedMapGen<'g> {
pub(crate) gen: crate::DictGen<'g>, pub(crate) gen: crate::DictGen<'g>,
pub(crate) unicase: bool,
pub(crate) unicode: bool,
} }
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
impl OrderedMapGen<'_> { impl OrderedMapGen<'_> {
pub fn unicase(mut self, yes: bool) -> Self {
self.unicase = yes;
self
}
pub fn unicode(mut self, yes: bool) -> Self {
self.unicode = yes;
self
}
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>( pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
&self, &self,
file: &mut W, file: &mut W,
@ -14,7 +26,7 @@ impl OrderedMapGen<'_> {
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0)); data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
let name = self.gen.name; let name = self.gen.name;
let key_type = "dictgen::InsensitiveStr<'static>"; let key_type = self.key_type();
let value_type = self.gen.value_type; let value_type = self.gen.value_type;
let mut smallest = usize::MAX; let mut smallest = usize::MAX;
@ -29,11 +41,7 @@ impl OrderedMapGen<'_> {
smallest = std::cmp::min(smallest, key.len()); smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len()); largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() { let key = self.key_new(key);
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?; writeln!(file, " {key},")?;
} }
@ -51,6 +59,28 @@ impl OrderedMapGen<'_> {
Ok(()) Ok(())
} }
/// Returns the Rust type name written out for generated keys.
///
/// With `unicase` off the key is a plain `&'static str`, whatever `unicode`
/// is set to. With `unicase` on, `unicode` picks between `InsensitiveStr`
/// and `InsensitiveAscii`.
fn key_type(&self) -> &'static str {
    if !self.unicase {
        "&'static str"
    } else if self.unicode {
        "dictgen::InsensitiveStr<'static>"
    } else {
        "dictgen::InsensitiveAscii<'static>"
    }
}
/// Renders the source-code expression that constructs `key` in the generated table.
///
/// The output follows the same `unicase` / `unicode` combinations as `key_type`.
/// The key text is always emitted with `{:?}` so it is quoted and escaped.
fn key_new(&self, key: &str) -> String {
    // Case-sensitive tables store the bare string literal.
    if !self.unicase {
        return format!("{key:?}");
    }
    // ASCII-only tables use the tuple-struct wrapper.
    // NOTE(review): nothing here checks that `key` is ASCII — presumably the caller guarantees it.
    if !self.unicode {
        return format!("dictgen::InsensitiveAscii({key:?})");
    }
    // Mixed tables choose the enum variant per key.
    let variant = if key.is_ascii() { "Ascii" } else { "Unicode" };
    format!("dictgen::InsensitiveStr::{variant}({key:?})")
}
} }
pub struct OrderedMap<K: 'static, V: 'static> { pub struct OrderedMap<K: 'static, V: 'static> {
@ -73,6 +103,20 @@ impl<V> OrderedMap<crate::InsensitiveStr<'_>, V> {
} }
} }
impl<V> OrderedMap<crate::InsensitiveAscii<'_>, V> {
    /// Looks up `word` and returns the value stored at the matching key's index.
    ///
    /// Returns `None` without searching when `word.len()` is not contained in
    /// `self.range`. Otherwise `self.keys` is binary-searched, comparing each
    /// key through its `unicase::Ascii` form.
    // NOTE(review): binary search presumably relies on `keys` having been emitted
    // in an order consistent with `unicase::Ascii` — confirm against the generator.
    #[inline]
    pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&'static V> {
        if !self.range.contains(&word.len()) {
            return None;
        }
        match self.keys.binary_search_by_key(word, |key| key.convert()) {
            Ok(index) => Some(&self.values[index]),
            Err(_) => None,
        }
    }
}
impl<V> OrderedMap<&str, V> { impl<V> OrderedMap<&str, V> {
#[inline] #[inline]
pub fn find(&self, word: &'_ &str) -> Option<&'static V> { pub fn find(&self, word: &'_ &str) -> Option<&'static V> {

View file

@ -75,7 +75,7 @@ impl<V> Trie<V> {
// Unsafe: Everything before has been proven to be ASCII, so this should be // Unsafe: Everything before has been proven to be ASCII, so this should be
// safe. // safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) }; let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining); let remaining = unicase::Ascii::new(remaining);
return t.find(&remaining); return t.find(&remaining);
} }
} }
@ -91,7 +91,7 @@ pub struct TrieNode<V: 'static> {
pub enum TrieChild<V: 'static> { pub enum TrieChild<V: 'static> {
Nested(&'static [Option<&'static TrieNode<V>>; 26]), Nested(&'static [Option<&'static TrieNode<V>>; 26]),
Flat(&'static crate::OrderedMap<crate::InsensitiveStr<'static>, V>), Flat(&'static crate::OrderedMap<crate::InsensitiveAscii<'static>, V>),
} }
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
@ -179,6 +179,7 @@ mod codegen {
.name(&children_name) .name(&children_name)
.value_type(value_type) .value_type(value_type)
.ordered_map() .ordered_map()
.unicode(false)
.write(file, table_input)?; .write(file, table_input)?;
} }
} }

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff