From 4c2f2c434a24803f767fc88c8a131ea571137b73 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Thu, 1 Jul 2021 10:39:27 -0500 Subject: [PATCH] feat(dict): Shared PHF support --- Cargo.lock | 170 ++++++++++++++++++++++-------------- crates/dictgen/Cargo.toml | 8 +- crates/dictgen/src/lib.rs | 4 + crates/dictgen/src/map.rs | 91 +++++++++++++++++++ crates/dictgen/src/table.rs | 62 +++++++++++-- 5 files changed, 258 insertions(+), 77 deletions(-) create mode 100644 crates/dictgen/src/map.rs diff --git a/Cargo.lock b/Cargo.lock index bc2a5e8..c46f19a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03345e98af8f3d786b6d9f656ccfa6ac316d954e92bc4841f0bba20789d5fb5a" +checksum = "e7a2e47a1fbe209ee101dd6d61285226744c6c8d3c21c8dc878ba6cb9f467f3a" dependencies = [ "gimli", ] @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" +checksum = "15af2628f6890fe2609a3b91bef4c83450512802e59489f9c1cb1fa5df064a61" [[package]] name = "arrayvec" @@ -60,9 +60,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "assert_cmd" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f57fec1ac7e4de72dcc69811795f1a7172ed06012f80a5d1ee651b62484f588" +checksum = "a88b6bd5df287567ffdf4ddf4d33060048e1068308e5f62d81c6f9824a045a48" dependencies = [ "bstr", "doc-comment", @@ -105,9 +105,9 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "backtrace" -version = "0.3.59" +version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4717cfcbfaa661a0fd48f8453951837ae7e8f81e481fbb136e3202d72805a744" +checksum = "b7815ea54e4d821e791162e078acbebfd6d8c8939cd559c9335dceb1c8ca7282" dependencies = [ "addr2line", "cc", @@ -154,12 +154,6 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - [[package]] name = "cast" version = "0.2.6" @@ -224,7 +218,7 @@ version = "0.4.0" dependencies = [ "codegenrs", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", ] @@ -263,7 +257,7 @@ dependencies = [ "clap", "criterion-plot", "csv", - "itertools 0.10.0", + "itertools 0.10.1", "lazy_static", "num-traits", "oorandom", @@ -417,6 +411,9 @@ dependencies = [ name = "dictgen" version = "0.1.0" dependencies = [ + "phf 0.9.0", + "phf_codegen", + "phf_shared 0.9.0", "unicase", ] @@ -549,9 +546,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f" +checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "log", "termcolor", @@ -597,9 +594,9 @@ checksum = "0e4075386626662786ddb0ec9081e7c7eeb1ba31951f447ca780ef9f5d568189" [[package]] name = "globset" -version = "0.4.6" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c152169ef1e421390738366d2f796655fec62621dabbd0fd476f905934061e4a" +checksum = "10463d9ff00a2a068db14231982f5132edebad0d7660cd956a1c30292dbcbfbd" dependencies = [ "aho-corasick", "bstr", @@ -627,18 +624,18 @@ checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" [[package]] name = "heck" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" dependencies = [ "unicode-segmentation", ] [[package]] name = "hermit-abi" -version = "0.1.18" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] @@ -675,9 +672,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "ignore" -version = "0.4.17" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b287fb45c60bb826a0dc68ff08742b9d88a2fea13d6e0c286b3172065aaf878c" +checksum = "713f1b139373f96a2e0ce3ac931cd01ee973c3c5dd7c40c0c2efe96ad2b6751d" dependencies = [ "crossbeam-utils", "globset", @@ -702,9 +699,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" dependencies = [ "either", ] @@ -754,9 +751,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" +checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" [[package]] name = "log" @@ -804,7 +801,7 @@ version = "0.4.0" dependencies = [ "codegenrs", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "regex", "structopt", "unicase", @@ -859,15 +856,18 @@ dependencies = [ [[package]] name = "object" -version = "0.24.0" +version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a5b3dd1c072ee7963717671d1ca129f1048fda25edea6b752bfc71ac8854170" +checksum = "a38f2be3697a57b4060074ff41b44c16870d916ad7877c17696e063257482bc7" +dependencies = [ + "memchr", +] [[package]] name = "once_cell" -version = "1.7.2" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" [[package]] name = "oorandom" @@ -899,7 +899,36 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" dependencies = [ - "phf_shared", + "phf_shared 0.8.0", +] + +[[package]] +name = "phf" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ac8b67553a7ca9457ce0e526948cad581819238f4a9d1ea74545851fa24f37" +dependencies = [ + "phf_shared 0.9.0", +] + +[[package]] +name = "phf_codegen" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963adb11cf22ee65dfd401cf75577c1aa0eca58c0b97f9337d2da61d3e640503" +dependencies = [ + "phf_generator", + "phf_shared 0.9.0", +] + +[[package]] +name = "phf_generator" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc1437ada0f3a97d538f0bb608137bf53c53969028cab74c89893e1e9a12f0e" +dependencies = [ + "phf_shared 0.9.0", + "rand", ] [[package]] @@ -912,6 +941,16 @@ dependencies = [ "unicase", ] +[[package]] +name = "phf_shared" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68318426de33640f02be62b4ae8eb1261be2efbc337b60c54d845bf4484e0d9" +dependencies = [ + "siphasher", + "unicase", +] + [[package]] name = "plotters" version = "0.3.1" @@ -927,15 +966,15 @@ dependencies = [ [[package]] name = "plotters-backend" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590" +checksum = "fd8be10f7485c8a323ea100b20d6052c27cf5968f08f8e3a56ee9f0cf38ebd3d" [[package]] name = "plotters-svg" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" dependencies = [ "plotters-backend", ] @@ -1037,9 +1076,9 @@ checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" [[package]] name = "rand" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" dependencies = [ "libc", "rand_chacha", @@ -1049,9 +1088,9 @@ dependencies = [ [[package]] name = "rand_chacha" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", @@ -1059,18 +1098,18 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" dependencies = [ "getrandom", ] [[package]] name = "rand_hc" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" dependencies = [ "rand_core", ] @@ -1102,9 +1141,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc" +checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" dependencies = [ "bitflags", ] @@ -1122,12 +1161,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" -dependencies = [ - "byteorder", -] +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" @@ -1146,9 +1182,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410f7acf3cb3a44527c5d9546bad4bf4e6c460915d5f9f2fc524498bfe8f70ce" +checksum = "dead70b0b5e03e9c814bcb6b01e03e68f7c57a80aa48c72ec92152ab3e818d49" [[package]] name = "rustc_version" @@ -1295,9 +1331,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.72" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82" +checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" dependencies = [ "proc-macro2", "quote", @@ -1408,7 +1444,7 @@ version = "0.7.0" dependencies = [ "anyhow", "bstr", - "itertools 0.10.0", + "itertools 0.10.1", "log", "nom", "once_cell", @@ -1437,15 +1473,15 @@ dependencies = [ "derive_setters", "difflib", "encoding", - "env_logger 0.8.3", + "env_logger 0.8.4", "human-panic", "ignore", - "itertools 0.10.0", + "itertools 0.10.1", "kstring", "log", "maplit", "once_cell", - "phf", + "phf 0.8.0", "predicates", "proc-exit", "serde", @@ -1478,7 +1514,7 @@ dependencies = [ "codegenrs", "csv", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", ] @@ -1490,7 +1526,7 @@ dependencies = [ "codegenrs", "csv", "edit-distance", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", "varcon", @@ -1515,7 +1551,7 @@ dependencies = [ "codegenrs", "dictgen", "env_logger 0.7.1", - "itertools 0.10.0", + "itertools 0.10.1", "log", "structopt", "typos", @@ -1698,7 +1734,7 @@ version = "0.4.0" dependencies = [ "codegenrs", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", ] diff --git a/crates/dictgen/Cargo.toml b/crates/dictgen/Cargo.toml index 8d2e15c..be39397 100644 --- a/crates/dictgen/Cargo.toml +++ b/crates/dictgen/Cargo.toml @@ -9,9 +9,13 @@ license = "MIT" edition = "2018" [features] -default = ["std", "codegen"] +default = ["std"] std = [] -codegen = ["std"] +codegen = ["std", "phf_codegen"] +map = ["phf", "phf_shared"] [dependencies] unicase = "2.5" +phf = { version = "0.9", features = ["unicase"], optional = true } +phf_codegen = { version = "0.9", optional = true } +phf_shared = { version = "0.9", optional = true } diff --git a/crates/dictgen/src/lib.rs b/crates/dictgen/src/lib.rs index 5b9819b..c870b7f 100644 --- a/crates/dictgen/src/lib.rs +++ b/crates/dictgen/src/lib.rs @@ -1,5 +1,9 @@ +#[cfg(feature = "map")] +mod map; mod table; mod trie; +#[cfg(feature = "map")] +pub use map::*; pub use table::*; pub use trie::*; diff --git a/crates/dictgen/src/map.rs b/crates/dictgen/src/map.rs new file mode 100644 index 0000000..b7c84d2 --- /dev/null +++ b/crates/dictgen/src/map.rs @@ -0,0 +1,91 @@ +#[cfg(feature = "codegen")] +pub fn generate_map<'d, W: std::io::Write, V: std::fmt::Display>( + file: &mut W, + name: &str, + value_type: &str, + data: impl Iterator, +) -> Result<(), std::io::Error> { + let mut data: Vec<_> = data.collect(); + data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0)); + + let mut smallest = usize::MAX; + let mut largest = usize::MIN; + + writeln!( + file, + "pub static {}: dictgen::DictTable<{}> = dictgen::DictTable {{", + name, value_type + )?; + writeln!(file, " keys: &[")?; + for (key, _value) in data.iter() { + smallest = std::cmp::min(smallest, key.len()); + largest = std::cmp::max(largest, key.len()); + + let key = if key.is_ascii() { + format!("dictgen::InsensitiveStr::Ascii({:?})", key) + } else { + format!("dictgen::InsensitiveStr::Unicode({:?})", key) + }; + + writeln!(file, " {},", key)?; + } + if largest == 0 { + smallest = 0; + } + writeln!(file, " ],")?; + writeln!(file, " values: &[")?; + for (_key, value) in data.iter() { + writeln!(file, " {},", value)?; + } + writeln!(file, " ],")?; + writeln!(file, " range: {}..={},", smallest, largest)?; + writeln!(file, "}};")?; + + Ok(()) +} + +pub struct DictMap { + pub map: phf::Map, V>, + pub range: std::ops::RangeInclusive, +} + +impl DictMap { + pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> { + if self.range.contains(&word.len()) { + self.map.get(&(*word).into()) + } else { + None + } + } + + pub fn iter(&self) -> impl Iterator, &V)> + '_ { + self.map.entries().map(|(k, v)| (k.convert(), v)) + } +} + +impl<'s> phf_shared::PhfHash for crate::InsensitiveStr<'s> { + #[inline] + fn phf_hash(&self, state: &mut H) { + core::hash::Hash::hash(self, state) + } +} + +impl<'s> phf_shared::FmtConst for crate::InsensitiveStr<'s> { + fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?, + crate::InsensitiveStr::Unicode(_) => { + f.write_str("dictgen::InsensitiveStr::Unicode(")? + } + } + + self.into_inner().fmt_const(f)?; + f.write_str(")") + } +} + +impl<'b, 'a: 'b> phf_shared::PhfBorrow> for crate::InsensitiveStr<'a> { + fn borrow(&self) -> &crate::InsensitiveStr<'b> { + self + } +} diff --git a/crates/dictgen/src/table.rs b/crates/dictgen/src/table.rs index dda4e93..2452eed 100644 --- a/crates/dictgen/src/table.rs +++ b/crates/dictgen/src/table.rs @@ -45,7 +45,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( } pub struct DictTable { - pub keys: &'static [InsensitiveStr], + pub keys: &'static [InsensitiveStr<'static>], pub values: &'static [V], pub range: core::ops::RangeInclusive, } @@ -67,18 +67,64 @@ impl DictTable { } } -// Avoid unicase's use of const-fn so large tables don't OOM -#[derive(Copy, Clone, Debug)] -pub enum InsensitiveStr { - Unicode(&'static str), - Ascii(&'static str), +/// UniCase look-alike that avoids const-fn so large tables don't OOM +#[derive(Copy, Clone)] +pub enum InsensitiveStr<'s> { + Unicode(&'s str), + Ascii(&'s str), } -impl InsensitiveStr { - fn convert(self) -> unicase::UniCase<&'static str> { +impl<'s> InsensitiveStr<'s> { + pub fn convert(self) -> unicase::UniCase<&'s str> { match self { InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s), InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s), } } + + pub fn into_inner(self) -> &'s str { + match self { + InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s, + } + } +} + +impl<'s> From> for InsensitiveStr<'s> { + fn from(other: unicase::UniCase<&'s str>) -> Self { + if other.is_ascii() { + InsensitiveStr::Ascii(other.into_inner()) + } else { + InsensitiveStr::Unicode(other.into_inner()) + } + } +} + +impl<'s1, 's2> PartialEq> for InsensitiveStr<'s1> { + #[inline] + fn eq(&self, other: &InsensitiveStr<'s2>) -> bool { + self.convert() == other.convert() + } +} + +impl<'s> Eq for InsensitiveStr<'s> {} + +impl<'s> core::hash::Hash for InsensitiveStr<'s> { + #[inline] + fn hash(&self, hasher: &mut H) { + self.convert().hash(hasher) + } +} + +impl<'s> core::fmt::Debug for InsensitiveStr<'s> { + #[inline] + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { + core::fmt::Debug::fmt(self.into_inner(), fmt) + } +} + +impl<'s> core::fmt::Display for InsensitiveStr<'s> { + #[inline] + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { + core::fmt::Display::fmt(self.into_inner(), fmt) + } }