Merge pull request #1194 from epage/perf

fix(dictgen)!: Generate phf for map feature
This commit is contained in:
Ed Page 2024-12-30 14:04:15 -06:00 committed by GitHub
commit b87cd87116
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194
9 changed files with 180646 additions and 138185 deletions

12
Cargo.lock generated
View file

@ -450,6 +450,7 @@ name = "dictgen"
version = "0.2.11" version = "0.2.11"
dependencies = [ dependencies = [
"phf", "phf",
"phf_codegen",
"phf_shared", "phf_shared",
"unicase", "unicase",
] ]
@ -884,6 +885,16 @@ dependencies = [
"phf_shared", "phf_shared",
] ]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]] [[package]]
name = "phf_generator" name = "phf_generator"
version = "0.11.2" version = "0.11.2"
@ -1500,6 +1511,7 @@ dependencies = [
"edit-distance", "edit-distance",
"indexmap", "indexmap",
"itertools 0.13.0", "itertools 0.13.0",
"phf",
"snapbox", "snapbox",
"unicase", "unicase",
"varcon", "varcon",

View file

@ -17,13 +17,14 @@ rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[features] [features]
default = ["std"] default = ["std"]
std = [] std = []
codegen = ["std"] codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"] map = ["dep:phf", "dep:phf_shared"]
[dependencies] [dependencies]
unicase = "2.7" unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true } phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true } phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true }
[lints] [lints]
workspace = true workspace = true

View file

@ -0,0 +1,103 @@
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
///
/// The ASCII/Unicode split is chosen at construction time (see the
/// `From<unicase::UniCase<&str>>` impl) so that lookups can rebuild the
/// matching `UniCase` via [`InsensitiveStr::convert`] without re-scanning.
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
    /// String compared with Unicode case folding (used when not pure ASCII).
    Unicode(&'s str),
    /// Pure-ASCII string, compared with ASCII-only case folding.
    Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
    /// Rebuild the matching [`unicase::UniCase`] wrapper for this string.
    pub fn convert(self) -> unicase::UniCase<&'s str> {
        match self {
            InsensitiveStr::Ascii(inner) => unicase::UniCase::ascii(inner),
            InsensitiveStr::Unicode(inner) => unicase::UniCase::unicode(inner),
        }
    }

    /// Unwrap to the underlying string slice, discarding the case-folding tag.
    pub fn into_inner(self) -> &'s str {
        match self {
            InsensitiveStr::Ascii(inner) | InsensitiveStr::Unicode(inner) => inner,
        }
    }

    /// `true` when the wrapped string has zero length.
    pub fn is_empty(self) -> bool {
        self.into_inner().is_empty()
    }

    /// Byte length of the wrapped string.
    pub fn len(self) -> usize {
        self.into_inner().len()
    }
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
    fn from(other: unicase::UniCase<&'s str>) -> Self {
        // Pick the variant constructor first, then apply it to the inner slice.
        let wrap = if other.is_ascii() {
            InsensitiveStr::Ascii
        } else {
            InsensitiveStr::Unicode
        };
        wrap(other.into_inner())
    }
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
    #[inline]
    fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
        // Case-insensitive equality, delegated to `UniCase`'s comparison.
        self.convert().eq(&other.convert())
    }
}

// Equality via `UniCase` is reflexive, so `Eq` holds.
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        // Hash the case-folded `UniCase` form so that `Hash` agrees with `Eq`.
        let folded = self.convert();
        core::hash::Hash::hash(&folded, hasher);
    }
}
impl core::fmt::Debug for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Both variants debug-print as the bare inner string; forwarding via
        // `Debug::fmt` keeps any formatter flags (width, alignment) intact.
        let inner = self.into_inner();
        core::fmt::Debug::fmt(inner, fmt)
    }
}
impl core::fmt::Display for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Display as the bare inner string; forwarding via `Display::fmt`
        // keeps any formatter flags (width, alignment) intact.
        let inner = self.into_inner();
        core::fmt::Display::fmt(inner, fmt)
    }
}
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveStr<'_> {
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        // Reuse the ordinary `Hash` impl so phf lookups hash the same bytes.
        <Self as core::hash::Hash>::hash(self, state);
    }
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveStr<'_> {
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Emit a Rust expression that reconstructs `self` in generated code,
        // e.g. `dictgen::InsensitiveStr::Ascii("word")`.
        let constructor = match self {
            InsensitiveStr::Ascii(_) => "dictgen::InsensitiveStr::Ascii(",
            InsensitiveStr::Unicode(_) => "dictgen::InsensitiveStr::Unicode(",
        };
        f.write_str(constructor)?;
        self.into_inner().fmt_const(f)?;
        f.write_str(")")
    }
}
#[cfg(feature = "map")]
// Lets phf look up an `InsensitiveStr<'b>` key in a map whose stored keys are
// `InsensitiveStr<'a>` with the longer lifetime `'a: 'b`.
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a> {
    fn borrow(&self) -> &InsensitiveStr<'b> {
        // `'s` is covariant, so `&InsensitiveStr<'a>` coerces to `&InsensitiveStr<'b>`.
        self
    }
}

View file

@ -4,6 +4,7 @@
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
mod gen; mod gen;
mod insensitive;
#[cfg(feature = "map")] #[cfg(feature = "map")]
mod map; mod map;
mod table; mod table;
@ -11,6 +12,7 @@ mod trie;
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub use gen::*; pub use gen::*;
pub use insensitive::*;
#[cfg(feature = "map")] #[cfg(feature = "map")]
pub use map::*; pub use map::*;
pub use table::*; pub use table::*;

View file

@ -18,33 +18,35 @@ impl DictMapGen<'_> {
let mut smallest = usize::MAX; let mut smallest = usize::MAX;
let mut largest = usize::MIN; let mut largest = usize::MIN;
let mut builder = phf_codegen::Map::new();
writeln!( let data = data
file, .iter()
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{" .map(|(key, value)| {
)?; (
writeln!(file, " keys: &[")?; if key.is_ascii() {
for (key, _value) in data.iter() { crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
smallest = std::cmp::min(smallest, key.len()); smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len()); largest = std::cmp::max(largest, key.len());
builder.entry(key, value.as_str());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?;
} }
let builder = builder.build();
if largest == 0 { if largest == 0 {
smallest = 0; smallest = 0;
} }
writeln!(file, " ],")?;
writeln!(file, " values: &[")?; writeln!(
for (_key, value) in data.iter() { file,
writeln!(file, " {value},")?; "pub static {name}: dictgen::DictMap<{value_type}> = dictgen::DictMap {{"
} )?;
writeln!(file, " ],")?; writeln!(file, " map: {builder},")?;
writeln!(file, " range: {smallest}..={largest},")?; writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?; writeln!(file, "}};")?;
@ -58,6 +60,7 @@ pub struct DictMap<V: 'static> {
} }
impl<V> DictMap<V> { impl<V> DictMap<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) { if self.range.contains(&word.len()) {
self.map.get(&(*word).into()) self.map.get(&(*word).into())
@ -65,35 +68,4 @@ impl<V> DictMap<V> {
None None
} }
} }
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&str>, &V)> + '_ {
self.map.entries().map(|(k, v)| (k.convert(), v))
}
}
impl phf_shared::PhfHash for crate::InsensitiveStr<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
impl phf_shared::FmtConst for crate::InsensitiveStr<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
crate::InsensitiveStr::Unicode(_) => {
f.write_str("dictgen::InsensitiveStr::Unicode(")?;
}
}
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
impl<'b, 'a: 'b> phf_shared::PhfBorrow<crate::InsensitiveStr<'b>> for crate::InsensitiveStr<'a> {
fn borrow(&self) -> &crate::InsensitiveStr<'b> {
self
}
} }

View file

@ -53,12 +53,13 @@ impl DictTableGen<'_> {
} }
pub struct DictTable<V: 'static> { pub struct DictTable<V: 'static> {
pub keys: &'static [InsensitiveStr<'static>], pub keys: &'static [crate::InsensitiveStr<'static>],
pub values: &'static [V], pub values: &'static [V],
pub range: core::ops::RangeInclusive<usize>, pub range: core::ops::RangeInclusive<usize>,
} }
impl<V> DictTable<V> { impl<V> DictTable<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) { if self.range.contains(&word.len()) {
self.keys self.keys
@ -69,70 +70,4 @@ impl<V> DictTable<V> {
None None
} }
} }
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> + '_ {
(0..self.keys.len()).map(move |i| (self.keys[i].convert(), &self.values[i]))
}
}
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
fn from(other: unicase::UniCase<&'s str>) -> Self {
if other.is_ascii() {
InsensitiveStr::Ascii(other.into_inner())
} else {
InsensitiveStr::Unicode(other.into_inner())
}
}
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
#[inline]
fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
impl core::fmt::Debug for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
} }

View file

@ -32,48 +32,50 @@ pub struct DictTrie<V: 'static> {
} }
impl<V> DictTrie<V> { impl<V> DictTrie<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) { if word.is_ascii() {
let bytes = word.as_bytes(); if self.range.contains(&word.len()) {
self.find_ascii(word.as_bytes())
} else {
None
}
} else {
self.unicode.find(word)
}
}
let mut child = &self.root; fn find_ascii(&self, word: &[u8]) -> Option<&'static V> {
for i in 0..bytes.len() { let mut child = &self.root;
match child.children { for i in 0..word.len() {
DictTrieChild::Nested(n) => { match child.children {
let byte = bytes[i]; DictTrieChild::Nested(n) => {
let index = if byte.is_ascii_lowercase() { let byte = word[i];
byte - b'a' let index = if byte.is_ascii_lowercase() {
} else if byte.is_ascii_uppercase() { byte - b'a'
byte - b'A' } else if byte.is_ascii_uppercase() {
} else { byte - b'A'
return self.unicode.find(word); } else {
}; return None;
debug_assert!(index < 26); };
if let Some(next) = n[index as usize].as_ref() { debug_assert!(index < 26);
child = next; if let Some(next) = n[index as usize].as_ref() {
} else { child = next;
return None; } else {
} return None;
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
} }
} }
DictTrieChild::Flat(t) => {
let remaining = &word[i..word.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
return t.find(&remaining);
}
} }
child.value.as_ref()
} else {
None
} }
child.value.as_ref()
} }
} }

View file

@ -29,6 +29,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5" snapbox = "0.6.5"
indexmap = "2.2.6" indexmap = "2.2.6"
divan = "0.1.16" divan = "0.1.16"
phf = "0.11.2"
[lints] [lints]
workspace = true workspace = true

File diff suppressed because it is too large Load diff