Merge pull request #1194 from epage/perf

fix(dictgen)!: Generate phf for map feature
Ed Page 2024-12-30 14:04:15 -06:00 committed by GitHub
commit b87cd87116
9 changed files with 180646 additions and 138185 deletions

Cargo.lock (generated)

@@ -450,6 +450,7 @@ name = "dictgen"
version = "0.2.11"
dependencies = [
"phf",
"phf_codegen",
"phf_shared",
"unicase",
]
@@ -884,6 +885,16 @@ dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
@@ -1500,6 +1511,7 @@ dependencies = [
"edit-distance",
"indexmap",
"itertools 0.13.0",
"phf",
"snapbox",
"unicase",
"varcon",


@@ -17,13 +17,14 @@ rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[features]
default = ["std"]
std = []
codegen = ["std"]
codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"]
[dependencies]
unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true }
[lints]
workspace = true


@@ -0,0 +1,103 @@
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
pub fn is_empty(self) -> bool {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.is_empty(),
}
}
pub fn len(self) -> usize {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.len(),
}
}
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
fn from(other: unicase::UniCase<&'s str>) -> Self {
if other.is_ascii() {
InsensitiveStr::Ascii(other.into_inner())
} else {
InsensitiveStr::Unicode(other.into_inner())
}
}
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
#[inline]
fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
impl core::fmt::Debug for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
}
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveStr<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveStr<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
InsensitiveStr::Unicode(_) => {
f.write_str("dictgen::InsensitiveStr::Unicode(")?;
}
}
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a> {
fn borrow(&self) -> &InsensitiveStr<'b> {
self
}
}
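A minimal usage sketch of the new InsensitiveStr type, assuming only the dictgen and unicase crates are in scope; the sample strings are made up for illustration:

fn main() {
    let a = dictgen::InsensitiveStr::Ascii("Apple");
    let b = dictgen::InsensitiveStr::Ascii("aPPLE");
    // Equality and hashing defer to unicase::UniCase, so case is ignored.
    assert_eq!(a, b);
    assert_eq!(a.len(), 5);
    assert!(!a.is_empty());

    // Non-ASCII input lands in the Unicode variant via the From impl.
    let c = dictgen::InsensitiveStr::from(unicase::UniCase::new("Straße"));
    assert!(matches!(c, dictgen::InsensitiveStr::Unicode(_)));
    assert_eq!(c.into_inner(), "Straße");
}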


@@ -4,6 +4,7 @@
#[cfg(feature = "codegen")]
mod gen;
mod insensitive;
#[cfg(feature = "map")]
mod map;
mod table;
@@ -11,6 +12,7 @@ mod trie;
#[cfg(feature = "codegen")]
pub use gen::*;
pub use insensitive::*;
#[cfg(feature = "map")]
pub use map::*;
pub use table::*;
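For downstream code, the feature-gated DictMap re-export offers the same find-style lookup as DictTable; a minimal sketch, assuming dictgen is built with its map feature (the lookup helper name is illustrative):

fn lookup(map: &dictgen::DictMap<&'static str>, word: &str) -> Option<&'static str> {
    // find rejects words whose length falls outside the generated range
    // before consulting the phf map.
    map.find(&unicase::UniCase::new(word)).copied()
}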


@@ -18,33 +18,35 @@ impl DictMapGen<'_> {
let mut smallest = usize::MAX;
let mut largest = usize::MIN;
writeln!(
file,
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?;
builder.entry(key, value.as_str());
}
let builder = builder.build();
if largest == 0 {
smallest = 0;
}
writeln!(file, " ],")?;
writeln!(file, " values: &[")?;
for (_key, value) in data.iter() {
writeln!(file, " {value},")?;
}
writeln!(file, " ],")?;
writeln!(
file,
"pub static {name}: dictgen::DictMap<{value_type}> = dictgen::DictMap {{"
)?;
writeln!(file, " map: {builder},")?;
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
@@ -58,6 +60,7 @@ pub struct DictMap<V: 'static> {
}
impl<V> DictMap<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(&(*word).into())
@@ -65,35 +68,4 @@ impl<V> DictMap<V> {
None
}
}
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&str>, &V)> + '_ {
self.map.entries().map(|(k, v)| (k.convert(), v))
}
}
impl phf_shared::PhfHash for crate::InsensitiveStr<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
impl phf_shared::FmtConst for crate::InsensitiveStr<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
crate::InsensitiveStr::Unicode(_) => {
f.write_str("dictgen::InsensitiveStr::Unicode(")?;
}
}
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
impl<'b, 'a: 'b> phf_shared::PhfBorrow<crate::InsensitiveStr<'b>> for crate::InsensitiveStr<'a> {
fn borrow(&self) -> &crate::InsensitiveStr<'b> {
self
}
}
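To make the new generated shape concrete, here is a standalone sketch that mirrors the generator above; it assumes dictgen is built with both the codegen and map features, and the WORDS name plus the two entries are made up:

use std::fmt::Write as _;

fn generate(buf: &mut String) -> std::fmt::Result {
    // Keys are the case-insensitive wrappers; values are emitted verbatim as code.
    let mut builder = phf_codegen::Map::new();
    builder.entry(dictgen::InsensitiveStr::Ascii("teh"), "\"the\"");
    builder.entry(dictgen::InsensitiveStr::Ascii("adn"), "\"and\"");
    writeln!(
        buf,
        "pub static WORDS: dictgen::DictMap<&'static str> = dictgen::DictMap {{"
    )?;
    writeln!(buf, "    map: {},", builder.build())?;
    writeln!(buf, "    range: 3..=3,")?;
    writeln!(buf, "}};")
}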


@@ -53,12 +53,13 @@ impl DictTableGen<'_> {
}
pub struct DictTable<V: 'static> {
pub keys: &'static [InsensitiveStr<'static>],
pub keys: &'static [crate::InsensitiveStr<'static>],
pub values: &'static [V],
pub range: core::ops::RangeInclusive<usize>,
}
impl<V> DictTable<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
self.keys
@@ -69,70 +70,4 @@ impl<V> DictTable<V> {
None
}
}
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> + '_ {
(0..self.keys.len()).map(move |i| (self.keys[i].convert(), &self.values[i]))
}
}
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
fn from(other: unicase::UniCase<&'s str>) -> Self {
if other.is_ascii() {
InsensitiveStr::Ascii(other.into_inner())
} else {
InsensitiveStr::Unicode(other.into_inner())
}
}
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
#[inline]
fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
impl core::fmt::Debug for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
}
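For comparison, the table form can still be written out by hand since its fields are public; a small sketch where the SMALL static and its entries are invented, and the keys are kept in the sorted, case-insensitive order the table lookup expects:

static SMALL: dictgen::DictTable<&'static str> = dictgen::DictTable {
    keys: &[
        dictgen::InsensitiveStr::Ascii("adn"),
        dictgen::InsensitiveStr::Ascii("teh"),
    ],
    values: &["and", "the"],
    range: 3..=3,
};

fn main() {
    // Lookups are case-insensitive and filtered by the length range first.
    let word = unicase::UniCase::new("Teh");
    assert_eq!(SMALL.find(&word), Some(&"the"));
}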


@@ -32,48 +32,50 @@ pub struct DictTrie<V: 'static> {
}
impl<V> DictTrie<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
let bytes = word.as_bytes();
if word.is_ascii() {
if self.range.contains(&word.len()) {
self.find_ascii(word.as_bytes())
} else {
None
}
} else {
self.unicode.find(word)
}
}
let mut child = &self.root;
for i in 0..bytes.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = bytes[i];
let index = if byte.is_ascii_lowercase() {
byte - b'a'
} else if byte.is_ascii_uppercase() {
byte - b'A'
} else {
return self.unicode.find(word);
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
fn find_ascii(&self, word: &[u8]) -> Option<&'static V> {
let mut child = &self.root;
for i in 0..word.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = word[i];
let index = if byte.is_ascii_lowercase() {
byte - b'a'
} else if byte.is_ascii_uppercase() {
byte - b'A'
} else {
return None;
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &word[i..word.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
return t.find(&remaining);
}
}
child.value.as_ref()
} else {
None
}
child.value.as_ref()
}
}
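The find_ascii helper indexes nested trie nodes by folding ASCII letters into 26 buckets; a standalone sketch of just that mapping (the letter_index name is made up):

fn letter_index(byte: u8) -> Option<usize> {
    // ASCII letters fold case-insensitively to 0..=25; any other byte ends
    // the ASCII lookup with no match.
    if byte.is_ascii_lowercase() {
        Some((byte - b'a') as usize)
    } else if byte.is_ascii_uppercase() {
        Some((byte - b'A') as usize)
    } else {
        None
    }
}

fn main() {
    assert_eq!(letter_index(b'a'), Some(0));
    assert_eq!(letter_index(b'Z'), Some(25));
    assert_eq!(letter_index(b'-'), None);
}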


@@ -29,6 +29,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5"
indexmap = "2.2.6"
divan = "0.1.16"
phf = "0.11.2"
[lints]
workspace = true

File diff suppressed because it is too large.