Merge pull request #1198 from epage/generic

perf(dict)!: Switch to PHF Map
This commit is contained in:
Ed Page 2024-12-31 06:56:23 -06:00 committed by GitHub
commit 44cf2f8cf6
Warning! Although a key with this ID exists in the database, it does not verify this commit. This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194
20 changed files with 909056 additions and 704967 deletions

1
Cargo.lock generated
View file

@ -1509,6 +1509,7 @@ dependencies = [
"dictgen",
"divan",
"edit-distance",
"heck",
"indexmap",
"itertools 0.13.0",
"phf",

File diff suppressed because it is too large Load diff

View file

@ -36,11 +36,19 @@ impl<'g> DictGen<'g> {
#[cfg(feature = "map")]
pub fn map(self) -> crate::MapGen<'g> {
crate::MapGen { gen: self }
crate::MapGen {
gen: self,
unicode: true,
unicase: true,
}
}
pub fn ordered_map(self) -> crate::OrderedMapGen<'g> {
crate::OrderedMapGen { gen: self }
crate::OrderedMapGen {
gen: self,
unicode: true,
unicase: true,
}
}
pub fn trie(self) -> crate::TrieGen<'g> {
@ -49,6 +57,10 @@ impl<'g> DictGen<'g> {
limit: 64,
}
}
pub fn r#match(self) -> crate::MatchGen<'g> {
crate::MatchGen { gen: self }
}
}
impl Default for DictGen<'static> {

View file

@ -51,6 +51,18 @@ impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
impl Eq for InsensitiveStr<'_> {}
// `PartialOrd` delegates to `Ord` so the two orderings can never disagree.
impl PartialOrd for InsensitiveStr<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
// Total order via `convert()`, i.e. the same case-folded view used by
// `PartialEq`/`Hash`, keeping the key usable in ordered containers.
impl Ord for InsensitiveStr<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.convert().cmp(&other.convert())
}
}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
@ -101,3 +113,97 @@ impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a
self
}
}
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
///
/// ASCII-only counterpart to `InsensitiveStr`: stores the raw `&str` and
/// builds the case-insensitive `unicase::Ascii` view on demand in each
/// comparison instead of at construction time.
#[derive(Copy, Clone)]
pub struct InsensitiveAscii<'s>(pub &'s str);
impl<'s> InsensitiveAscii<'s> {
/// View as `unicase::Ascii`; every comparison/hash below goes through this.
pub fn convert(self) -> unicase::Ascii<&'s str> {
unicase::Ascii::new(self.0)
}
/// Unwrap the raw, case-preserved string.
pub fn into_inner(self) -> &'s str {
self.0
}
pub fn is_empty(self) -> bool {
self.0.is_empty()
}
/// Byte length of the underlying string.
pub fn len(self) -> usize {
self.0.len()
}
}
impl<'s> From<unicase::Ascii<&'s str>> for InsensitiveAscii<'s> {
fn from(other: unicase::Ascii<&'s str>) -> Self {
Self(other.into_inner())
}
}
// Eq/Ord/Hash all route through `convert()` so they stay mutually
// consistent — required for use as a (phf) map key.
impl<'s2> PartialEq<InsensitiveAscii<'s2>> for InsensitiveAscii<'_> {
#[inline]
fn eq(&self, other: &InsensitiveAscii<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveAscii<'_> {}
impl PartialOrd for InsensitiveAscii<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for InsensitiveAscii<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.convert().cmp(&other.convert())
}
}
impl core::hash::Hash for InsensitiveAscii<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
// Debug/Display show the raw, case-preserved string; folding is only a
// comparison detail.
impl core::fmt::Debug for InsensitiveAscii<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveAscii<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
}
// PHF integration: hashing at lookup time (`PhfHash`) and emission of a
// reconstructing Rust expression at codegen time (`FmtConst`).
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveAscii<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveAscii<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.write_str("dictgen::InsensitiveAscii(")?;
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
// Identity borrow with a shortened lifetime, as `phf` lookup requires.
#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveAscii<'b>> for InsensitiveAscii<'a> {
fn borrow(&self) -> &InsensitiveAscii<'b> {
self
}
}

View file

@ -7,6 +7,8 @@ mod gen;
mod insensitive;
#[cfg(feature = "map")]
mod map;
#[cfg(feature = "codegen")]
mod r#match;
mod ordered_map;
mod trie;
@ -16,4 +18,6 @@ pub use insensitive::*;
#[cfg(feature = "map")]
pub use map::*;
pub use ordered_map::*;
#[cfg(feature = "codegen")]
pub use r#match::*;
pub use trie::*;

View file

@ -1,65 +1,120 @@
#[cfg(feature = "codegen")]
pub struct MapGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
pub(crate) unicase: bool,
pub(crate) unicode: bool,
}
#[cfg(feature = "codegen")]
impl MapGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
pub fn unicase(mut self, yes: bool) -> Self {
self.unicase = yes;
self
}
pub fn unicode(mut self, yes: bool) -> Self {
self.unicode = yes;
self
}
pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name;
let key_type = self.key_type();
let value_type = self.gen.value_type;
let mut smallest = usize::MAX;
let mut largest = usize::MIN;
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
for (key, _) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
builder.entry(key, value.as_str());
}
let builder = builder.build();
if largest == 0 {
smallest = 0;
}
writeln!(
file,
"pub static {name}: dictgen::Map<{value_type}> = dictgen::Map {{"
"pub static {name}: dictgen::Map<{key_type}, {value_type}> = dictgen::Map {{"
)?;
writeln!(file, " map: {builder},")?;
match (self.unicase, self.unicode) {
(true, true) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
let key = key.as_ref();
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key, value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
(true, false) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string()))
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key, value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
(false, _) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| (key, value.to_string()))
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key.as_ref(), value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
}
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
Ok(())
}
fn key_type(&self) -> &'static str {
match (self.unicase, self.unicode) {
(true, true) => "dictgen::InsensitiveStr<'static>",
(true, false) => "dictgen::InsensitiveAscii<'static>",
(false, _) => "&'static str",
}
}
}
pub struct Map<V: 'static> {
pub map: phf::Map<crate::InsensitiveStr<'static>, V>,
pub struct Map<K: 'static, V: 'static> {
pub map: phf::Map<K, V>,
pub range: std::ops::RangeInclusive<usize>,
}
impl<V> Map<V> {
impl<V> Map<crate::InsensitiveStr<'_>, V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
@ -69,3 +124,25 @@ impl<V> Map<V> {
}
}
}
// ASCII case-insensitive lookup (tables built with `.unicase(true).unicode(false)`).
impl<V> Map<crate::InsensitiveAscii<'_>, V> {
/// Find `word` with ASCII case folding; the `range` length check is a
/// cheap pre-filter that skips hashing words that cannot be in the table.
#[inline]
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(&(*word).into())
} else {
None
}
}
}
// Case-sensitive lookup with plain `&str` keys (`.unicase(false)` tables).
impl<V> Map<&str, V> {
/// Exact lookup, with the same length pre-filter as the other variants.
#[inline]
pub fn find(&self, word: &'_ &str) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(word)
} else {
None
}
}
}

View file

@ -0,0 +1,37 @@
/// Code generator that emits a plain `match`-based lookup table — an
/// alternative to the PHF/ordered-map generators for benchmarking.
#[cfg(feature = "codegen")]
pub struct MatchGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
}
#[cfg(feature = "codegen")]
impl MatchGen<'_> {
/// Write `pub struct {name}` plus a `find` method matching on the word.
///
/// Keys are matched case-sensitively; entries are only sorted
/// case-insensitively so the generated file's ordering is stable.
pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name;
let value_type = self.gen.value_type;
writeln!(file, "pub struct {name};")?;
writeln!(file, "impl {name} {{")?;
writeln!(
file,
" pub fn find(&self, word: &&str) -> Option<&'static {value_type}> {{"
)?;
writeln!(file, " match *word {{")?;
for (key, value) in data.iter() {
let key = key.as_ref();
// NOTE(review): the hard-coded `.as_slice()` assumes the value
// expression is array-like (e.g. `&[..]` with a slice value_type);
// confirm this generator is never used with scalar value types.
writeln!(file, " {key:?} => Some(&{value}.as_slice()),")?;
}
writeln!(file, " _ => None,")?;
writeln!(file, " }}")?;
writeln!(file, " }}")?;
writeln!(file, "}}")?;
Ok(())
}
}

View file

@ -1,19 +1,32 @@
#[cfg(feature = "codegen")]
pub struct OrderedMapGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
pub(crate) unicase: bool,
pub(crate) unicode: bool,
}
#[cfg(feature = "codegen")]
impl OrderedMapGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
pub fn unicase(mut self, yes: bool) -> Self {
self.unicase = yes;
self
}
pub fn unicode(mut self, yes: bool) -> Self {
self.unicode = yes;
self
}
pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name;
let key_type = self.key_type();
let value_type = self.gen.value_type;
let mut smallest = usize::MAX;
@ -21,18 +34,15 @@ impl OrderedMapGen<'_> {
writeln!(
file,
"pub static {name}: dictgen::OrderedMap<{value_type}> = dictgen::OrderedMap {{"
"pub static {name}: dictgen::OrderedMap<{key_type}, {value_type}> = dictgen::OrderedMap {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
let key = self.key_new(key);
writeln!(file, " {key},")?;
}
@ -50,15 +60,37 @@ impl OrderedMapGen<'_> {
Ok(())
}
/// Rust type of the generated key column, selected by the
/// `unicase`/`unicode` builder flags.
fn key_type(&self) -> &'static str {
match (self.unicase, self.unicode) {
(true, true) => "dictgen::InsensitiveStr<'static>",
(true, false) => "dictgen::InsensitiveAscii<'static>",
(false, _) => "&'static str",
}
}
/// Format one key as the constructor expression matching `key_type()`.
fn key_new(&self, key: &str) -> String {
match (self.unicase, self.unicode) {
(true, true) => {
// Prefer the cheaper ASCII variant whenever the key allows it.
if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
}
}
(true, false) => format!("dictgen::InsensitiveAscii({key:?})"),
// Case-sensitive tables use a bare string literal.
(false, _) => format!("{key:?}"),
}
}
}
pub struct OrderedMap<V: 'static> {
pub keys: &'static [crate::InsensitiveStr<'static>],
pub struct OrderedMap<K: 'static, V: 'static> {
pub keys: &'static [K],
pub values: &'static [V],
pub range: core::ops::RangeInclusive<usize>,
}
impl<V> OrderedMap<V> {
impl<V> OrderedMap<crate::InsensitiveStr<'_>, V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
@ -71,3 +103,28 @@ impl<V> OrderedMap<V> {
}
}
}
// ASCII case-insensitive lookup over the sorted key slice.
impl<V> OrderedMap<crate::InsensitiveAscii<'_>, V> {
/// Binary-search `word` among the case-folded keys; the `range` length
/// check cheaply rejects words that cannot be present.
#[inline]
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
self.keys
.binary_search_by_key(word, |key| key.convert())
.map(|i| &self.values[i])
.ok()
} else {
None
}
}
}
// Case-sensitive lookup with plain `&str` keys.
impl<V> OrderedMap<&str, V> {
#[inline]
pub fn find(&self, word: &'_ &str) -> Option<&'static V> {
if self.range.contains(&word.len()) {
self.keys.binary_search(word).map(|i| &self.values[i]).ok()
} else {
None
}
}
}

View file

@ -27,7 +27,7 @@ impl TrieGen<'_> {
pub struct Trie<V: 'static> {
pub root: &'static TrieNode<V>,
pub unicode: &'static crate::OrderedMap<V>,
pub unicode: &'static crate::OrderedMap<crate::InsensitiveStr<'static>, V>,
pub range: core::ops::RangeInclusive<usize>,
}
@ -75,7 +75,7 @@ impl<V> Trie<V> {
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
let remaining = unicase::Ascii::new(remaining);
return t.find(&remaining);
}
}
@ -91,7 +91,7 @@ pub struct TrieNode<V: 'static> {
pub enum TrieChild<V: 'static> {
Nested(&'static [Option<&'static TrieNode<V>>; 26]),
Flat(&'static crate::OrderedMap<V>),
Flat(&'static crate::OrderedMap<crate::InsensitiveAscii<'static>, V>),
}
#[cfg(feature = "codegen")]
@ -179,6 +179,7 @@ mod codegen {
.name(&children_name)
.value_type(value_type)
.ordered_map()
.unicode(false)
.write(file, table_input)?;
}
}

File diff suppressed because it is too large Load diff

View file

@ -16,7 +16,8 @@ all-features = true
rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[dependencies]
dictgen = { version = "^0.2", path = "../dictgen" }
phf = "0.11.2"
dictgen = { version = "^0.2", path = "../dictgen", features = ["map"] }
[dev-dependencies]
csv = "1.3"
@ -29,7 +30,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5"
indexmap = "2.2.6"
divan = "0.1.16"
phf = "0.11.2"
heck = "0.5.0"
[lints]
workspace = true

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,7 @@
#![allow(clippy::wildcard_imports)]
#![allow(dead_code)]
mod cased_map_codegen;
mod map_codegen;
mod ordered_map_codegen;
mod trie_codegen;
@ -9,6 +11,11 @@ mod miss {
const MISS: &str = "finalizes";
// Benchmark a miss through the case-sensitive match/map path.
#[divan::bench(args = [MISS])]
fn cased_map(word: &str) -> Option<&'static &[&str]> {
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
}
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
map_codegen::WORD.find(&word)
@ -30,6 +37,11 @@ mod hit {
const HIT: &str = "finallizes";
// Benchmark a hit through the case-sensitive match/map path.
#[divan::bench(args = [HIT])]
fn cased_map(word: &str) -> Option<&'static &[&str]> {
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
}
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
map_codegen::WORD.find(&word)

View file

@ -2,7 +2,7 @@
#![allow(clippy::unreadable_literal)]
#![allow(unreachable_pub)]
pub static WORD: dictgen::Map<&[&str]> = dictgen::Map {
pub static WORD: dictgen::Map<dictgen::InsensitiveStr<'static>, &[&str]> = dictgen::Map {
map: ::phf::Map {
key: 12913932095322966823,
disps: &[

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -20,6 +20,15 @@ fn codegen() {
snapbox::file!["../benches/benches/map_codegen.rs"].raw()
);
let mut cased_map_content = vec![];
generate_cased_map(&mut cased_map_content, "WORD", DICT);
let cased_map_content = String::from_utf8(cased_map_content).unwrap();
let cased_map_content = codegenrs::rustfmt(&cased_map_content, None).unwrap();
snapbox::assert_data_eq!(
&cased_map_content,
snapbox::file!["../benches/benches/cased_map_codegen.rs"].raw()
);
let mut ordered_map_content = vec![];
generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
@ -29,10 +38,7 @@ fn codegen() {
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
);
snapbox::assert_data_eq!(
&trie_content,
snapbox::file!["../src/word_codegen.rs"].raw()
);
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
}
fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
@ -72,6 +78,111 @@ fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
.unwrap();
}
/// Generate the benchmark fixture of case-*sensitive* tables: one PHF map
/// per ASCII case variant of each key, plus a case-insensitive ordered map
/// for the non-ASCII remainder.
fn generate_cased_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!(
file,
"// This file is @generated by {}",
file!().replace('\\', "/")
)
.unwrap();
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
writeln!(file).unwrap();
// Parse the CSV dictionary: first field is the typo, remaining fields
// are the corrections.
let records: Vec<_> = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(dict)
.records()
.map(|r| r.unwrap())
.collect();
// ASCII keys as-is (assumed already lower-case in the dictionary —
// TODO confirm), emitted as a case-sensitive map.
dictgen::DictGen::new()
.name(&format!("{name}_ASCII_LOWER"))
.value_type("&[&str]")
.map()
.unicase(false)
.write(
file,
records
.iter()
.filter(|r| r.iter().next().unwrap().is_ascii())
.map(|record| {
let mut record_fields = record.iter();
let key = record_fields.next().unwrap();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
// Same entries with keys upper-cased via heck's SHOUTY_SNAKE_CASE.
dictgen::DictGen::new()
.name(&format!("{name}_ASCII_UPPER"))
.value_type("&[&str]")
.map()
.unicase(false)
.write(
file,
records
.iter()
.filter(|r| r.iter().next().unwrap().is_ascii())
.map(|record| {
use heck::ToShoutySnakeCase;
let mut record_fields = record.iter();
let key = record_fields.next().unwrap().to_shouty_snake_case();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
// Same entries with keys Title-cased.
dictgen::DictGen::new()
.name(&format!("{name}_ASCII_TITLE"))
.value_type("&[&str]")
.map()
.unicase(false)
.write(
file,
records
.iter()
.filter(|r| r.iter().next().unwrap().is_ascii())
.map(|record| {
use heck::ToTitleCase;
let mut record_fields = record.iter();
let key = record_fields.next().unwrap().to_title_case();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
// Non-ASCII entries fall back to a case-insensitive ordered map
// (builder defaults: unicase + unicode).
dictgen::DictGen::new()
.name(&format!("{name}_UNICODE"))
.value_type("&[&str]")
.ordered_map()
.write(
file,
records
.iter()
.filter(|r| !r.iter().next().unwrap().is_ascii())
.map(|record| {
let mut record_fields = record.iter();
let key = record_fields.next().unwrap();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
}
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!(
file,

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff