Merge pull request #303 from epage/phf

feat(dict): Shared PHF support
This commit is contained in:
Ed Page 2021-07-01 11:55:03 -05:00 committed by GitHub
commit fc05aa9633
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 489 additions and 283 deletions

170
Cargo.lock generated
View file

@ -4,9 +4,9 @@ version = 3
[[package]]
name = "addr2line"
version = "0.15.1"
version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03345e98af8f3d786b6d9f656ccfa6ac316d954e92bc4841f0bba20789d5fb5a"
checksum = "e7a2e47a1fbe209ee101dd6d61285226744c6c8d3c21c8dc878ba6cb9f467f3a"
dependencies = [
"gimli",
]
@ -48,9 +48,9 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.40"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b"
checksum = "15af2628f6890fe2609a3b91bef4c83450512802e59489f9c1cb1fa5df064a61"
[[package]]
name = "arrayvec"
@ -60,9 +60,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "assert_cmd"
version = "1.0.4"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f57fec1ac7e4de72dcc69811795f1a7172ed06012f80a5d1ee651b62484f588"
checksum = "a88b6bd5df287567ffdf4ddf4d33060048e1068308e5f62d81c6f9824a045a48"
dependencies = [
"bstr",
"doc-comment",
@ -105,9 +105,9 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "backtrace"
version = "0.3.59"
version = "0.3.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4717cfcbfaa661a0fd48f8453951837ae7e8f81e481fbb136e3202d72805a744"
checksum = "b7815ea54e4d821e791162e078acbebfd6d8c8939cd559c9335dceb1c8ca7282"
dependencies = [
"addr2line",
"cc",
@ -154,12 +154,6 @@ version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631"
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cast"
version = "0.2.6"
@ -224,7 +218,7 @@ version = "0.4.0"
dependencies = [
"codegenrs",
"dictgen",
"itertools 0.10.0",
"itertools 0.10.1",
"structopt",
"unicase",
]
@ -263,7 +257,7 @@ dependencies = [
"clap",
"criterion-plot",
"csv",
"itertools 0.10.0",
"itertools 0.10.1",
"lazy_static",
"num-traits",
"oorandom",
@ -417,6 +411,9 @@ dependencies = [
name = "dictgen"
version = "0.1.0"
dependencies = [
"phf 0.9.0",
"phf_codegen",
"phf_shared 0.9.0",
"unicase",
]
@ -549,9 +546,9 @@ dependencies = [
[[package]]
name = "env_logger"
version = "0.8.3"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f"
checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3"
dependencies = [
"log",
"termcolor",
@ -597,9 +594,9 @@ checksum = "0e4075386626662786ddb0ec9081e7c7eeb1ba31951f447ca780ef9f5d568189"
[[package]]
name = "globset"
version = "0.4.6"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c152169ef1e421390738366d2f796655fec62621dabbd0fd476f905934061e4a"
checksum = "10463d9ff00a2a068db14231982f5132edebad0d7660cd956a1c30292dbcbfbd"
dependencies = [
"aho-corasick",
"bstr",
@ -627,18 +624,18 @@ checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
[[package]]
name = "heck"
version = "0.3.2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac"
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "hermit-abi"
version = "0.1.18"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
@ -675,9 +672,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "ignore"
version = "0.4.17"
version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b287fb45c60bb826a0dc68ff08742b9d88a2fea13d6e0c286b3172065aaf878c"
checksum = "713f1b139373f96a2e0ce3ac931cd01ee973c3c5dd7c40c0c2efe96ad2b6751d"
dependencies = [
"crossbeam-utils",
"globset",
@ -702,9 +699,9 @@ dependencies = [
[[package]]
name = "itertools"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf"
dependencies = [
"either",
]
@ -754,9 +751,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.95"
version = "0.2.97"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36"
checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6"
[[package]]
name = "log"
@ -804,7 +801,7 @@ version = "0.4.0"
dependencies = [
"codegenrs",
"dictgen",
"itertools 0.10.0",
"itertools 0.10.1",
"regex",
"structopt",
"unicase",
@ -859,15 +856,18 @@ dependencies = [
[[package]]
name = "object"
version = "0.24.0"
version = "0.25.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a5b3dd1c072ee7963717671d1ca129f1048fda25edea6b752bfc71ac8854170"
checksum = "a38f2be3697a57b4060074ff41b44c16870d916ad7877c17696e063257482bc7"
dependencies = [
"memchr",
]
[[package]]
name = "once_cell"
version = "1.7.2"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3"
checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56"
[[package]]
name = "oorandom"
@ -899,7 +899,36 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
"phf_shared 0.8.0",
]
[[package]]
name = "phf"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ac8b67553a7ca9457ce0e526948cad581819238f4a9d1ea74545851fa24f37"
dependencies = [
"phf_shared 0.9.0",
]
[[package]]
name = "phf_codegen"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "963adb11cf22ee65dfd401cf75577c1aa0eca58c0b97f9337d2da61d3e640503"
dependencies = [
"phf_generator",
"phf_shared 0.9.0",
]
[[package]]
name = "phf_generator"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fc1437ada0f3a97d538f0bb608137bf53c53969028cab74c89893e1e9a12f0e"
dependencies = [
"phf_shared 0.9.0",
"rand",
]
[[package]]
@ -912,6 +941,16 @@ dependencies = [
"unicase",
]
[[package]]
name = "phf_shared"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a68318426de33640f02be62b4ae8eb1261be2efbc337b60c54d845bf4484e0d9"
dependencies = [
"siphasher",
"unicase",
]
[[package]]
name = "plotters"
version = "0.3.1"
@ -927,15 +966,15 @@ dependencies = [
[[package]]
name = "plotters-backend"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590"
checksum = "fd8be10f7485c8a323ea100b20d6052c27cf5968f08f8e3a56ee9f0cf38ebd3d"
[[package]]
name = "plotters-svg"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211"
checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9"
dependencies = [
"plotters-backend",
]
@ -1037,9 +1076,9 @@ checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
[[package]]
name = "rand"
version = "0.8.3"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
dependencies = [
"libc",
"rand_chacha",
@ -1049,9 +1088,9 @@ dependencies = [
[[package]]
name = "rand_chacha"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
@ -1059,18 +1098,18 @@ dependencies = [
[[package]]
name = "rand_core"
version = "0.6.2"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
dependencies = [
"getrandom",
]
[[package]]
name = "rand_hc"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73"
checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
dependencies = [
"rand_core",
]
@ -1102,9 +1141,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.8"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee"
dependencies = [
"bitflags",
]
@ -1122,12 +1161,9 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.1.9"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
]
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "regex-syntax"
@ -1146,9 +1182,9 @@ dependencies = [
[[package]]
name = "rustc-demangle"
version = "0.1.19"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "410f7acf3cb3a44527c5d9546bad4bf4e6c460915d5f9f2fc524498bfe8f70ce"
checksum = "dead70b0b5e03e9c814bcb6b01e03e68f7c57a80aa48c72ec92152ab3e818d49"
[[package]]
name = "rustc_version"
@ -1295,9 +1331,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.72"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7"
dependencies = [
"proc-macro2",
"quote",
@ -1408,7 +1444,7 @@ version = "0.7.0"
dependencies = [
"anyhow",
"bstr",
"itertools 0.10.0",
"itertools 0.10.1",
"log",
"nom",
"once_cell",
@ -1437,15 +1473,15 @@ dependencies = [
"derive_setters",
"difflib",
"encoding",
"env_logger 0.8.3",
"env_logger 0.8.4",
"human-panic",
"ignore",
"itertools 0.10.0",
"itertools 0.10.1",
"kstring",
"log",
"maplit",
"once_cell",
"phf",
"phf 0.8.0",
"predicates",
"proc-exit",
"serde",
@ -1478,7 +1514,7 @@ dependencies = [
"codegenrs",
"csv",
"dictgen",
"itertools 0.10.0",
"itertools 0.10.1",
"structopt",
"unicase",
]
@ -1490,7 +1526,7 @@ dependencies = [
"codegenrs",
"csv",
"edit-distance",
"itertools 0.10.0",
"itertools 0.10.1",
"structopt",
"unicase",
"varcon",
@ -1515,7 +1551,7 @@ dependencies = [
"codegenrs",
"dictgen",
"env_logger 0.7.1",
"itertools 0.10.0",
"itertools 0.10.1",
"log",
"structopt",
"typos",
@ -1698,7 +1734,7 @@ version = "0.4.0"
dependencies = [
"codegenrs",
"dictgen",
"itertools 0.10.0",
"itertools 0.10.1",
"structopt",
"unicase",
]

View file

@ -22,4 +22,4 @@ unicase = "2.5"
itertools = "0.10"
codegenrs = "1.0"
structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" }
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -1,7 +1,21 @@
[package]
name = "dictgen"
version = "0.1.0"
description = "Compile-time case-insensitive map"
repository = "https://github.com/crate-ci/typos"
categories = ["development-tools", "text-processing"]
keywords = ["development", "spelling", "no_std"]
license = "MIT"
edition = "2018"
[features]
default = ["std"]
std = []
codegen = ["std", "phf_codegen"]
map = ["phf", "phf_shared"]
[dependencies]
unicase = "2.5"
phf = { version = "0.9", features = ["unicase"], optional = true }
phf_codegen = { version = "0.9", optional = true }
phf_shared = { version = "0.9", optional = true }

View file

@ -1,5 +1,9 @@
#[cfg(feature = "map")]
mod map;
mod table;
mod trie;
#[cfg(feature = "map")]
pub use map::*;
pub use table::*;
pub use trie::*;

91
crates/dictgen/src/map.rs Normal file
View file

@ -0,0 +1,91 @@
/// Writes Rust source for a `pub static` lookup table called `name` into `file`.
///
/// Entries are sorted case-insensitively by key. Every key is written as a
/// `dictgen::InsensitiveStr::Ascii(..)` or `dictgen::InsensitiveStr::Unicode(..)` literal
/// (chosen by `str::is_ascii`), every value through its `Display` impl, and `range` spans the
/// shortest to the longest key length (`0..=0` when there are no entries or every key is empty).
///
/// NOTE(review): despite the function name, the emitted item is a `dictgen::DictTable`, not a
/// `dictgen::DictMap` -- confirm whether that is intended.
///
/// # Errors
///
/// Returns the first I/O error reported while writing to `file`.
#[cfg(feature = "codegen")]
pub fn generate_map<'d, W: std::io::Write, V: std::fmt::Display>(
    file: &mut W,
    name: &str,
    value_type: &str,
    data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
    let mut entries: Vec<(&'d str, V)> = data.collect();
    entries.sort_unstable_by_key(|entry| unicase::UniCase::new(entry.0));

    // Key-length bounds are gathered up front rather than while the keys are being written.
    let largest = entries
        .iter()
        .map(|(key, _)| key.len())
        .max()
        .unwrap_or(usize::MIN);
    let smallest = if largest == 0 {
        0
    } else {
        entries
            .iter()
            .map(|(key, _)| key.len())
            .min()
            .unwrap_or(usize::MAX)
    };

    writeln!(
        file,
        "pub static {}: dictgen::DictTable<{}> = dictgen::DictTable {{",
        name, value_type
    )?;

    writeln!(file, " keys: &[")?;
    for (key, _) in &entries {
        let literal = match key.is_ascii() {
            true => format!("dictgen::InsensitiveStr::Ascii({:?})", key),
            false => format!("dictgen::InsensitiveStr::Unicode({:?})", key),
        };
        writeln!(file, " {},", literal)?;
    }
    writeln!(file, " ],")?;

    writeln!(file, " values: &[")?;
    for (_, value) in &entries {
        writeln!(file, " {},", value)?;
    }
    writeln!(file, " ],")?;

    writeln!(file, " range: {}..={},", smallest, largest)?;
    writeln!(file, "}};")?;

    Ok(())
}
/// Case-insensitive dictionary backed by a `phf::Map`.
pub struct DictMap<V: 'static> {
/// Perfect-hash map from case-insensitive keys to values.
pub map: phf::Map<crate::InsensitiveStr<'static>, V>,
/// Word lengths worth looking up; `find` rejects anything outside this range.
pub range: std::ops::RangeInclusive<usize>,
}
impl<V> DictMap<V> {
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(&(*word).into())
} else {
None
}
}
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&str>, &V)> + '_ {
self.map.entries().map(|(k, v)| (k.convert(), v))
}
}
/// Hashes for `phf` by delegating to the type's own `core::hash::Hash` impl.
impl<'s> phf_shared::PhfHash for crate::InsensitiveStr<'s> {
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        <Self as core::hash::Hash>::hash(self, state)
    }
}
/// Writes the Rust expression for this key: the `dictgen::InsensitiveStr` variant path wrapped
/// around the inner string's own `fmt_const` output.
impl<'s> phf_shared::FmtConst for crate::InsensitiveStr<'s> {
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let opening = match self {
            crate::InsensitiveStr::Ascii(_) => "dictgen::InsensitiveStr::Ascii(",
            crate::InsensitiveStr::Unicode(_) => "dictgen::InsensitiveStr::Unicode(",
        };
        f.write_str(opening)?;
        phf_shared::FmtConst::fmt_const(self.into_inner(), f)?;
        f.write_str(")")
    }
}
/// Identity borrow: an `InsensitiveStr<'a>` key can be viewed as an `InsensitiveStr<'b>` for any
/// lifetime `'b` that `'a` outlives.
impl<'b, 'a: 'b> phf_shared::PhfBorrow<crate::InsensitiveStr<'b>> for crate::InsensitiveStr<'a> {
fn borrow(&self) -> &crate::InsensitiveStr<'b> {
self
}
}

View file

@ -1,3 +1,4 @@
#[cfg(feature = "codegen")]
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W,
name: &str,
@ -44,9 +45,9 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
}
pub struct DictTable<V: 'static> {
pub keys: &'static [InsensitiveStr],
pub keys: &'static [InsensitiveStr<'static>],
pub values: &'static [V],
pub range: std::ops::RangeInclusive<usize>,
pub range: core::ops::RangeInclusive<usize>,
}
impl<V> DictTable<V> {
@ -66,18 +67,64 @@ impl<V> DictTable<V> {
}
}
// Avoid unicase's use of const-fn so large tables don't OOM
#[derive(Copy, Clone, Debug)]
pub enum InsensitiveStr {
Unicode(&'static str),
Ascii(&'static str),
/// UniCase look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl InsensitiveStr {
fn convert(self) -> unicase::UniCase<&'static str> {
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
}
/// Converts a `UniCase` into an `InsensitiveStr`, picking the variant from `other.is_ascii()`.
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
    fn from(other: unicase::UniCase<&'s str>) -> Self {
        let ascii = other.is_ascii();
        let text = other.into_inner();
        match ascii {
            true => InsensitiveStr::Ascii(text),
            false => InsensitiveStr::Unicode(text),
        }
    }
}
/// Equality is decided by comparing the `unicase::UniCase` forms of both sides.
impl<'s1, 's2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'s1> {
    #[inline]
    fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
        let lhs = self.convert();
        let rhs = other.convert();
        lhs == rhs
    }
}
// Marker impl only; the comparison itself lives in the `PartialEq` impl.
impl<'s> Eq for InsensitiveStr<'s> {}
/// Hashes the `unicase::UniCase` form of the string, the same form `PartialEq` compares.
impl<'s> core::hash::Hash for InsensitiveStr<'s> {
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        let folded = self.convert();
        core::hash::Hash::hash(&folded, hasher)
    }
}
/// Formats as the wrapped string's `Debug` output; the variant itself is not shown.
impl<'s> core::fmt::Debug for InsensitiveStr<'s> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        let text: &str = self.into_inner();
        <str as core::fmt::Debug>::fmt(text, fmt)
    }
}
/// Displays the wrapped string as-is, honouring the formatter's options.
impl<'s> core::fmt::Display for InsensitiveStr<'s> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        let text: &str = self.into_inner();
        <str as core::fmt::Display>::fmt(text, fmt)
    }
}

View file

@ -1,6 +1,7 @@
/// # Panics
///
/// - On duplicate entry
#[cfg(feature = "codegen")]
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W,
prefix: &str,
@ -8,6 +9,80 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
data: impl Iterator<Item = (&'d str, V)>,
limit: usize,
) -> Result<(), std::io::Error> {
codegen::generate_trie(file, prefix, value_type, data, limit)
}
/// Case-insensitive dictionary stored as a trie over ASCII letters.
pub struct DictTrie<V: 'static> {
/// Node at which every lookup starts.
pub root: &'static DictTrieNode<V>,
/// Table consulted by `find` when a byte that is not an ASCII letter is met in a nested node.
pub unicode: &'static crate::DictTable<V>,
/// Word lengths worth looking up; `find` rejects anything outside this range.
pub range: core::ops::RangeInclusive<usize>,
}
impl<V> DictTrie<V> {
    /// Looks up `word`, ignoring ASCII case while walking the trie.
    ///
    /// Returns `None` straight away when the word's length is outside `range`. Otherwise the
    /// word is consumed one byte at a time from `root`:
    ///
    /// - in a `Nested` node, `a`-`z` / `A`-`Z` select one of the 26 children (a missing child
    ///   means the word is unknown); any other byte hands the *whole* word to the `unicode`
    ///   table;
    /// - in a `Flat` node, the not-yet-consumed tail of the word is looked up in that node's
    ///   table.
    ///
    /// If every byte is consumed by nested nodes, the value stored on the final node (if any)
    /// is returned.
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        if !self.range.contains(&word.len()) {
            return None;
        }

        let bytes = word.as_bytes();
        let mut child = self.root;
        for (i, &byte) in bytes.iter().enumerate() {
            match child.children {
                DictTrieChild::Nested(n) => {
                    // Inclusive ranges: `z` / `Z` belong to slot 25 of the 26-slot child array.
                    let index = match byte {
                        b'a'..=b'z' => byte - b'a',
                        b'A'..=b'Z' => byte - b'A',
                        _ => return self.unicode.find(word),
                    };
                    debug_assert!(index < 26);
                    child = n[usize::from(index)]?;
                }
                DictTrieChild::Flat(t) => {
                    let remaining = &bytes[i..];
                    // SAFETY: `bytes` comes from a `str`, and a `Flat` node is only reached
                    // after every earlier byte matched an ASCII letter in a `Nested` node, so
                    // `i` is a character boundary and `remaining` is valid UTF-8.
                    let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
                    // Reuse the prior ascii check, rather than doing it again
                    let remaining = if word.is_ascii() {
                        unicase::UniCase::ascii(remaining)
                    } else {
                        unicase::UniCase::unicode(remaining)
                    };
                    return t.find(&remaining);
                }
            }
        }
        child.value.as_ref()
    }
}
/// One node of a `DictTrie`.
pub struct DictTrieNode<V: 'static> {
/// Where the lookup continues for the bytes that follow this node.
pub children: DictTrieChild<V>,
/// Value returned by `DictTrie::find` when the word ends exactly at this node, if any.
pub value: Option<V>,
}
/// How the continuation below a `DictTrieNode` is stored.
pub enum DictTrieChild<V: 'static> {
/// One optional child per ASCII letter; `DictTrie::find` indexes it case-insensitively.
Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
/// The remaining suffixes kept in a table; `DictTrie::find` looks up the rest of the word there.
Flat(&'static crate::DictTable<V>),
}
#[cfg(feature = "codegen")]
mod codegen {
pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W,
prefix: &str,
value_type: &str,
data: impl Iterator<Item = (&'d str, V)>,
limit: usize,
) -> Result<(), std::io::Error> {
let mut root = DynRoot::new(data);
root.burst(limit);
@ -92,9 +167,9 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
}
Ok(())
}
}
fn gen_node_name(prefix: &str, start: &str) -> String {
fn gen_node_name(prefix: &str, start: &str) -> String {
if start.is_empty() {
format!("{}_NODE", prefix)
} else {
@ -102,9 +177,9 @@ fn gen_node_name(prefix: &str, start: &str) -> String {
start.make_ascii_uppercase();
format!("{}_{}_NODE", prefix, start)
}
}
}
fn gen_children_name(prefix: &str, start: &str) -> String {
fn gen_children_name(prefix: &str, start: &str) -> String {
if start.is_empty() {
format!("{}_CHILDREN", prefix)
} else {
@ -112,84 +187,22 @@ fn gen_children_name(prefix: &str, start: &str) -> String {
start.make_ascii_uppercase();
format!("{}_{}_CHILDREN", prefix, start)
}
}
}
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
match leaf {
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
}
}
}
pub struct DictTrie<V: 'static> {
pub root: &'static DictTrieNode<V>,
pub unicode: &'static crate::DictTable<V>,
pub range: std::ops::RangeInclusive<usize>,
}
impl<V> DictTrie<V> {
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
let bytes = word.as_bytes();
let mut child = &self.root;
for i in 0..bytes.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = bytes[i];
let index = if (b'a'..b'z').contains(&byte) {
byte - b'a'
} else if (b'A'..b'Z').contains(&byte) {
byte - b'A'
} else {
return self.unicode.find(word);
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { std::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
}
}
}
child.value.as_ref()
} else {
None
}
}
}
pub struct DictTrieNode<V: 'static> {
pub children: DictTrieChild<V>,
pub value: Option<V>,
}
pub enum DictTrieChild<V: 'static> {
Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
Flat(&'static crate::DictTable<V>),
}
struct DynRoot<'s, V> {
struct DynRoot<'s, V> {
root: DynNode<'s, V>,
unicode: Vec<(&'s str, V)>,
range: std::ops::RangeInclusive<usize>,
}
}
impl<'s, V> DynRoot<'s, V> {
impl<'s, V> DynRoot<'s, V> {
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
let mut overflow = Vec::new();
let mut unicode = Vec::default();
@ -228,25 +241,25 @@ impl<'s, V> DynRoot<'s, V> {
fn burst(&mut self, limit: usize) {
self.root.burst(limit);
}
}
}
struct DynNode<'s, V> {
struct DynNode<'s, V> {
children: DynChild<'s, V>,
value: Option<V>,
}
}
impl<'s, V> DynNode<'s, V> {
impl<'s, V> DynNode<'s, V> {
fn burst(&mut self, limit: usize) {
self.children.burst(limit)
}
}
}
enum DynChild<'s, V> {
enum DynChild<'s, V> {
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
Flat(Vec<(&'s [u8], V)>),
}
}
impl<'s, V> DynChild<'s, V> {
impl<'s, V> DynChild<'s, V> {
fn burst(&mut self, limit: usize) {
match self {
DynChild::Nested(children) => {
@ -287,4 +300,5 @@ impl<'s, V> DynChild<'s, V> {
}
}
}
}
}

View file

@ -23,4 +23,4 @@ itertools = "0.10"
codegenrs = "1.0"
structopt = "0.3"
regex = "1"
dictgen = { version = "0.1", path = "../../dictgen" }
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -23,4 +23,4 @@ itertools = "0.10"
unicase = "2.5"
codegenrs = "1.0"
structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" }
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -29,4 +29,4 @@ log = "0.4"
env_logger = "0.7"
clap-verbosity-flag = "0.3"
itertools = "0.10"
dictgen = { version = "0.1", path = "../../dictgen" }
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -22,4 +22,4 @@ unicase = "2.5"
itertools = "0.10"
codegenrs = "1.0"
structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" }
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }