diff --git a/Cargo.lock b/Cargo.lock index bc2a5e8..c46f19a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03345e98af8f3d786b6d9f656ccfa6ac316d954e92bc4841f0bba20789d5fb5a" +checksum = "e7a2e47a1fbe209ee101dd6d61285226744c6c8d3c21c8dc878ba6cb9f467f3a" dependencies = [ "gimli", ] @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" +checksum = "15af2628f6890fe2609a3b91bef4c83450512802e59489f9c1cb1fa5df064a61" [[package]] name = "arrayvec" @@ -60,9 +60,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "assert_cmd" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f57fec1ac7e4de72dcc69811795f1a7172ed06012f80a5d1ee651b62484f588" +checksum = "a88b6bd5df287567ffdf4ddf4d33060048e1068308e5f62d81c6f9824a045a48" dependencies = [ "bstr", "doc-comment", @@ -105,9 +105,9 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "backtrace" -version = "0.3.59" +version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4717cfcbfaa661a0fd48f8453951837ae7e8f81e481fbb136e3202d72805a744" +checksum = "b7815ea54e4d821e791162e078acbebfd6d8c8939cd559c9335dceb1c8ca7282" dependencies = [ "addr2line", "cc", @@ -154,12 +154,6 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - [[package]] name = "cast" version = "0.2.6" @@ -224,7 +218,7 @@ version = "0.4.0" dependencies = [ "codegenrs", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", ] @@ -263,7 +257,7 @@ dependencies = [ "clap", "criterion-plot", "csv", - "itertools 0.10.0", + "itertools 0.10.1", "lazy_static", "num-traits", "oorandom", @@ -417,6 +411,9 @@ dependencies = [ name = "dictgen" version = "0.1.0" dependencies = [ + "phf 0.9.0", + "phf_codegen", + "phf_shared 0.9.0", "unicase", ] @@ -549,9 +546,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f" +checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "log", "termcolor", @@ -597,9 +594,9 @@ checksum = "0e4075386626662786ddb0ec9081e7c7eeb1ba31951f447ca780ef9f5d568189" [[package]] name = "globset" -version = "0.4.6" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c152169ef1e421390738366d2f796655fec62621dabbd0fd476f905934061e4a" +checksum = "10463d9ff00a2a068db14231982f5132edebad0d7660cd956a1c30292dbcbfbd" dependencies = [ "aho-corasick", "bstr", @@ -627,18 +624,18 @@ checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" [[package]] name = "heck" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" dependencies = [ "unicode-segmentation", ] [[package]] name = "hermit-abi" -version = "0.1.18" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] @@ -675,9 +672,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "ignore" -version = "0.4.17" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b287fb45c60bb826a0dc68ff08742b9d88a2fea13d6e0c286b3172065aaf878c" +checksum = "713f1b139373f96a2e0ce3ac931cd01ee973c3c5dd7c40c0c2efe96ad2b6751d" dependencies = [ "crossbeam-utils", "globset", @@ -702,9 +699,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" dependencies = [ "either", ] @@ -754,9 +751,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" +checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" [[package]] name = "log" @@ -804,7 +801,7 @@ version = "0.4.0" dependencies = [ "codegenrs", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "regex", "structopt", "unicase", @@ -859,15 +856,18 @@ dependencies = [ [[package]] name = "object" -version = "0.24.0" +version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a5b3dd1c072ee7963717671d1ca129f1048fda25edea6b752bfc71ac8854170" +checksum = "a38f2be3697a57b4060074ff41b44c16870d916ad7877c17696e063257482bc7" +dependencies = [ + "memchr", +] [[package]] name = "once_cell" -version = "1.7.2" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" [[package]] name = "oorandom" @@ -899,7 +899,36 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" dependencies = [ - "phf_shared", + "phf_shared 0.8.0", +] + +[[package]] +name = "phf" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ac8b67553a7ca9457ce0e526948cad581819238f4a9d1ea74545851fa24f37" +dependencies = [ + "phf_shared 0.9.0", +] + +[[package]] +name = "phf_codegen" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963adb11cf22ee65dfd401cf75577c1aa0eca58c0b97f9337d2da61d3e640503" +dependencies = [ + "phf_generator", + "phf_shared 0.9.0", +] + +[[package]] +name = "phf_generator" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc1437ada0f3a97d538f0bb608137bf53c53969028cab74c89893e1e9a12f0e" +dependencies = [ + "phf_shared 0.9.0", + "rand", ] [[package]] @@ -912,6 +941,16 @@ dependencies = [ "unicase", ] +[[package]] +name = "phf_shared" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68318426de33640f02be62b4ae8eb1261be2efbc337b60c54d845bf4484e0d9" +dependencies = [ + "siphasher", + "unicase", +] + [[package]] name = "plotters" version = "0.3.1" @@ -927,15 +966,15 @@ dependencies = [ [[package]] name = "plotters-backend" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590" +checksum = "fd8be10f7485c8a323ea100b20d6052c27cf5968f08f8e3a56ee9f0cf38ebd3d" [[package]] name = "plotters-svg" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" dependencies = [ "plotters-backend", ] @@ -1037,9 +1076,9 @@ checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" [[package]] name = "rand" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" dependencies = [ "libc", "rand_chacha", @@ -1049,9 +1088,9 @@ dependencies = [ [[package]] name = "rand_chacha" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", @@ -1059,18 +1098,18 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" dependencies = [ "getrandom", ] [[package]] name = "rand_hc" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" dependencies = [ "rand_core", ] @@ -1102,9 +1141,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc" +checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" dependencies = [ "bitflags", ] @@ -1122,12 +1161,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" -dependencies = [ - "byteorder", -] +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" @@ -1146,9 +1182,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410f7acf3cb3a44527c5d9546bad4bf4e6c460915d5f9f2fc524498bfe8f70ce" +checksum = "dead70b0b5e03e9c814bcb6b01e03e68f7c57a80aa48c72ec92152ab3e818d49" [[package]] name = "rustc_version" @@ -1295,9 +1331,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.72" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82" +checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" dependencies = [ "proc-macro2", "quote", @@ -1408,7 +1444,7 @@ version = "0.7.0" dependencies = [ "anyhow", "bstr", - "itertools 0.10.0", + "itertools 0.10.1", "log", "nom", "once_cell", @@ -1437,15 +1473,15 @@ dependencies = [ "derive_setters", "difflib", "encoding", - "env_logger 0.8.3", + "env_logger 0.8.4", "human-panic", "ignore", - "itertools 0.10.0", + "itertools 0.10.1", "kstring", "log", "maplit", "once_cell", - "phf", + "phf 0.8.0", "predicates", "proc-exit", "serde", @@ -1478,7 +1514,7 @@ dependencies = [ "codegenrs", "csv", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", ] @@ -1490,7 +1526,7 @@ dependencies = [ "codegenrs", "csv", "edit-distance", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", "varcon", @@ -1515,7 +1551,7 @@ dependencies = [ "codegenrs", "dictgen", "env_logger 0.7.1", - "itertools 0.10.0", + "itertools 0.10.1", "log", "structopt", "typos", @@ -1698,7 +1734,7 @@ version = "0.4.0" dependencies = [ "codegenrs", "dictgen", - "itertools 0.10.0", + "itertools 0.10.1", "structopt", "unicase", ] diff --git a/crates/codespell-dict/codegen/Cargo.toml b/crates/codespell-dict/codegen/Cargo.toml index a0b3199..d67b7d6 100644 --- a/crates/codespell-dict/codegen/Cargo.toml +++ b/crates/codespell-dict/codegen/Cargo.toml @@ -22,4 +22,4 @@ unicase = "2.5" itertools = "0.10" codegenrs = "1.0" structopt = "0.3" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/dictgen/Cargo.toml b/crates/dictgen/Cargo.toml index e8426e6..be39397 100644 --- a/crates/dictgen/Cargo.toml +++ b/crates/dictgen/Cargo.toml @@ -1,7 +1,21 @@ [package] name = "dictgen" version = "0.1.0" +description = "Compile-time case-insensitive map" +repository = "https://github.com/crate-ci/typos" +categories = ["development-tools", "text-processing"] +keywords = ["development", "spelling", "no_std"] +license = "MIT" edition = "2018" +[features] +default = ["std"] +std = [] +codegen = ["std", "phf_codegen"] +map = ["phf", "phf_shared"] + [dependencies] unicase = "2.5" +phf = { version = "0.9", features = ["unicase"], optional = true } +phf_codegen = { version = "0.9", optional = true } +phf_shared = { version = "0.9", optional = true } diff --git a/crates/dictgen/src/lib.rs b/crates/dictgen/src/lib.rs index 5b9819b..c870b7f 100644 --- a/crates/dictgen/src/lib.rs +++ b/crates/dictgen/src/lib.rs @@ -1,5 +1,9 @@ +#[cfg(feature = "map")] +mod map; mod table; mod trie; +#[cfg(feature = "map")] +pub use map::*; pub use table::*; pub use trie::*; diff --git a/crates/dictgen/src/map.rs b/crates/dictgen/src/map.rs new file mode 100644 index 0000000..b7c84d2 --- /dev/null +++ b/crates/dictgen/src/map.rs @@ -0,0 +1,91 @@ +#[cfg(feature = "codegen")] +pub fn generate_map<'d, W: std::io::Write, V: std::fmt::Display>( + file: &mut W, + name: &str, + value_type: &str, + data: impl Iterator, +) -> Result<(), std::io::Error> { + let mut data: Vec<_> = data.collect(); + data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0)); + + let mut smallest = usize::MAX; + let mut largest = usize::MIN; + + writeln!( + file, + "pub static {}: dictgen::DictTable<{}> = dictgen::DictTable {{", + name, value_type + )?; + writeln!(file, " keys: &[")?; + for (key, _value) in data.iter() { + smallest = std::cmp::min(smallest, key.len()); + largest = std::cmp::max(largest, key.len()); + + let key = if key.is_ascii() { + format!("dictgen::InsensitiveStr::Ascii({:?})", key) + } else { + format!("dictgen::InsensitiveStr::Unicode({:?})", key) + }; + + writeln!(file, " {},", key)?; + } + if largest == 0 { + smallest = 0; + } + writeln!(file, " ],")?; + writeln!(file, " values: &[")?; + for (_key, value) in data.iter() { + writeln!(file, " {},", value)?; + } + writeln!(file, " ],")?; + writeln!(file, " range: {}..={},", smallest, largest)?; + writeln!(file, "}};")?; + + Ok(()) +} + +pub struct DictMap { + pub map: phf::Map, V>, + pub range: std::ops::RangeInclusive, +} + +impl DictMap { + pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> { + if self.range.contains(&word.len()) { + self.map.get(&(*word).into()) + } else { + None + } + } + + pub fn iter(&self) -> impl Iterator, &V)> + '_ { + self.map.entries().map(|(k, v)| (k.convert(), v)) + } +} + +impl<'s> phf_shared::PhfHash for crate::InsensitiveStr<'s> { + #[inline] + fn phf_hash(&self, state: &mut H) { + core::hash::Hash::hash(self, state) + } +} + +impl<'s> phf_shared::FmtConst for crate::InsensitiveStr<'s> { + fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?, + crate::InsensitiveStr::Unicode(_) => { + f.write_str("dictgen::InsensitiveStr::Unicode(")? + } + } + + self.into_inner().fmt_const(f)?; + f.write_str(")") + } +} + +impl<'b, 'a: 'b> phf_shared::PhfBorrow> for crate::InsensitiveStr<'a> { + fn borrow(&self) -> &crate::InsensitiveStr<'b> { + self + } +} diff --git a/crates/dictgen/src/table.rs b/crates/dictgen/src/table.rs index 30d5923..2452eed 100644 --- a/crates/dictgen/src/table.rs +++ b/crates/dictgen/src/table.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "codegen")] pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( file: &mut W, name: &str, @@ -44,9 +45,9 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( } pub struct DictTable { - pub keys: &'static [InsensitiveStr], + pub keys: &'static [InsensitiveStr<'static>], pub values: &'static [V], - pub range: std::ops::RangeInclusive, + pub range: core::ops::RangeInclusive, } impl DictTable { @@ -66,18 +67,64 @@ impl DictTable { } } -// Avoid unicase's use of const-fn so large tables don't OOM -#[derive(Copy, Clone, Debug)] -pub enum InsensitiveStr { - Unicode(&'static str), - Ascii(&'static str), +/// UniCase look-alike that avoids const-fn so large tables don't OOM +#[derive(Copy, Clone)] +pub enum InsensitiveStr<'s> { + Unicode(&'s str), + Ascii(&'s str), } -impl InsensitiveStr { - fn convert(self) -> unicase::UniCase<&'static str> { +impl<'s> InsensitiveStr<'s> { + pub fn convert(self) -> unicase::UniCase<&'s str> { match self { InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s), InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s), } } + + pub fn into_inner(self) -> &'s str { + match self { + InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s, + } + } +} + +impl<'s> From> for InsensitiveStr<'s> { + fn from(other: unicase::UniCase<&'s str>) -> Self { + if other.is_ascii() { + InsensitiveStr::Ascii(other.into_inner()) + } else { + InsensitiveStr::Unicode(other.into_inner()) + } + } +} + +impl<'s1, 's2> PartialEq> for InsensitiveStr<'s1> { + #[inline] + fn eq(&self, other: &InsensitiveStr<'s2>) -> bool { + self.convert() == other.convert() + } +} + +impl<'s> Eq for InsensitiveStr<'s> {} + +impl<'s> core::hash::Hash for InsensitiveStr<'s> { + #[inline] + fn hash(&self, hasher: &mut H) { + self.convert().hash(hasher) + } +} + +impl<'s> core::fmt::Debug for InsensitiveStr<'s> { + #[inline] + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { + core::fmt::Debug::fmt(self.into_inner(), fmt) + } +} + +impl<'s> core::fmt::Display for InsensitiveStr<'s> { + #[inline] + fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result { + core::fmt::Display::fmt(self.into_inner(), fmt) + } } diff --git a/crates/dictgen/src/trie.rs b/crates/dictgen/src/trie.rs index c6b849c..9218ea0 100644 --- a/crates/dictgen/src/trie.rs +++ b/crates/dictgen/src/trie.rs @@ -1,6 +1,7 @@ /// # Panics /// /// - On duplicate entry +#[cfg(feature = "codegen")] pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( file: &mut W, prefix: &str, @@ -8,123 +9,13 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( data: impl Iterator, limit: usize, ) -> Result<(), std::io::Error> { - let mut root = DynRoot::new(data); - root.burst(limit); - - let unicode_table_name = format!("{}_UNICODE_TABLE", prefix); - - writeln!( - file, - "pub static {}_TRIE: dictgen::DictTrie<{}> = dictgen::DictTrie {{", - prefix, value_type - )?; - writeln!(file, " root: &{},", gen_node_name(prefix, ""))?; - writeln!(file, " unicode: &{},", &unicode_table_name)?; - writeln!( - file, - " range: {}..={},", - root.range.start(), - root.range.end() - )?; - writeln!(file, "}};")?; - writeln!(file)?; - - crate::generate_table( - file, - &unicode_table_name, - value_type, - root.unicode.into_iter(), - )?; - writeln!(file)?; - - let mut nodes = vec![("".to_owned(), &root.root)]; - while let Some((start, node)) = nodes.pop() { - let node_name = gen_node_name(prefix, &start); - let children_name = gen_children_name(prefix, &start); - writeln!( - file, - "static {}: dictgen::DictTrieNode<{}> = dictgen::DictTrieNode {{", - node_name, value_type - )?; - writeln!( - file, - " children: {}(&{}),", - gen_type_name(&node.children), - children_name - )?; - if let Some(value) = node.value.as_ref() { - writeln!(file, " value: Some({}),", value)?; - } else { - writeln!(file, " value: None,")?; - } - writeln!(file, "}};")?; - writeln!(file)?; - - match &node.children { - DynChild::Nested(n) => { - writeln!( - file, - "static {}: [Option<&dictgen::DictTrieNode<{}>>; 26] = [", - children_name, value_type, - )?; - for b in b'a'..=b'z' { - if let Some(child) = n.get(&b) { - let c = b as char; - let next_start = format!("{}{}", start, c); - writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?; - nodes.push((next_start, child)); - } else { - writeln!(file, " None,")?; - } - } - writeln!(file, "];")?; - } - DynChild::Flat(v) => { - let table_input = v.iter().map(|(k, v)| { - let k = std::str::from_utf8(k).expect("this was originally a `str`"); - (k, v) - }); - crate::generate_table(file, &children_name, value_type, table_input)?; - } - } - writeln!(file)?; - writeln!(file)?; - } - - Ok(()) -} - -fn gen_node_name(prefix: &str, start: &str) -> String { - if start.is_empty() { - format!("{}_NODE", prefix) - } else { - let mut start = start.to_owned(); - start.make_ascii_uppercase(); - format!("{}_{}_NODE", prefix, start) - } -} - -fn gen_children_name(prefix: &str, start: &str) -> String { - if start.is_empty() { - format!("{}_CHILDREN", prefix) - } else { - let mut start = start.to_owned(); - start.make_ascii_uppercase(); - format!("{}_{}_CHILDREN", prefix, start) - } -} - -fn gen_type_name(leaf: &DynChild) -> &'static str { - match leaf { - DynChild::Nested(_) => "dictgen::DictTrieChild::Nested", - DynChild::Flat(_) => "dictgen::DictTrieChild::Flat", - } + codegen::generate_trie(file, prefix, value_type, data, limit) } pub struct DictTrie { pub root: &'static DictTrieNode, pub unicode: &'static crate::DictTable, - pub range: std::ops::RangeInclusive, + pub range: core::ops::RangeInclusive, } impl DictTrie { @@ -155,7 +46,7 @@ impl DictTrie { let remaining = &bytes[i..bytes.len()]; // Unsafe: Everything before has been proven to be ASCII, so this should be // safe. - let remaining = unsafe { std::str::from_utf8_unchecked(remaining) }; + let remaining = unsafe { core::str::from_utf8_unchecked(remaining) }; // Reuse the prior ascii check, rather than doing it again let remaining = if word.is_ascii() { unicase::UniCase::ascii(remaining) @@ -183,107 +74,230 @@ pub enum DictTrieChild { Flat(&'static crate::DictTable), } -struct DynRoot<'s, V> { - root: DynNode<'s, V>, - unicode: Vec<(&'s str, V)>, - range: std::ops::RangeInclusive, -} +#[cfg(feature = "codegen")] +mod codegen { + pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( + file: &mut W, + prefix: &str, + value_type: &str, + data: impl Iterator, + limit: usize, + ) -> Result<(), std::io::Error> { + let mut root = DynRoot::new(data); + root.burst(limit); -impl<'s, V> DynRoot<'s, V> { - fn new(data: impl Iterator) -> Self { - let mut overflow = Vec::new(); - let mut unicode = Vec::default(); - let mut smallest = usize::MAX; - let mut largest = usize::MIN; - let mut existing = std::collections::HashSet::new(); - let mut empty = None; - for (key, value) in data { - if existing.contains(key) { - panic!("Duplicate present: {}", key); - } - existing.insert(key); + let unicode_table_name = format!("{}_UNICODE_TABLE", prefix); - if key.is_empty() { - empty = Some(value); + writeln!( + file, + "pub static {}_TRIE: dictgen::DictTrie<{}> = dictgen::DictTrie {{", + prefix, value_type + )?; + writeln!(file, " root: &{},", gen_node_name(prefix, ""))?; + writeln!(file, " unicode: &{},", &unicode_table_name)?; + writeln!( + file, + " range: {}..={},", + root.range.start(), + root.range.end() + )?; + writeln!(file, "}};")?; + writeln!(file)?; + + crate::generate_table( + file, + &unicode_table_name, + value_type, + root.unicode.into_iter(), + )?; + writeln!(file)?; + + let mut nodes = vec![("".to_owned(), &root.root)]; + while let Some((start, node)) = nodes.pop() { + let node_name = gen_node_name(prefix, &start); + let children_name = gen_children_name(prefix, &start); + writeln!( + file, + "static {}: dictgen::DictTrieNode<{}> = dictgen::DictTrieNode {{", + node_name, value_type + )?; + writeln!( + file, + " children: {}(&{}),", + gen_type_name(&node.children), + children_name + )?; + if let Some(value) = node.value.as_ref() { + writeln!(file, " value: Some({}),", value)?; } else { - smallest = std::cmp::min(smallest, key.len()); - largest = std::cmp::max(largest, key.len()); - if key.bytes().all(|b| b.is_ascii_alphabetic()) { - overflow.push((key.as_bytes(), value)); - } else { - unicode.push((key, value)); - } + writeln!(file, " value: None,")?; } - } - Self { - root: DynNode { - children: DynChild::Flat(overflow), - value: empty, - }, - unicode, - range: smallest..=largest, - } - } + writeln!(file, "}};")?; + writeln!(file)?; - fn burst(&mut self, limit: usize) { - self.root.burst(limit); - } -} - -struct DynNode<'s, V> { - children: DynChild<'s, V>, - value: Option, -} - -impl<'s, V> DynNode<'s, V> { - fn burst(&mut self, limit: usize) { - self.children.burst(limit) - } -} - -enum DynChild<'s, V> { - Nested(std::collections::BTreeMap>), - Flat(Vec<(&'s [u8], V)>), -} - -impl<'s, V> DynChild<'s, V> { - fn burst(&mut self, limit: usize) { - match self { - DynChild::Nested(children) => { - for child in children.values_mut() { - child.burst(limit); + match &node.children { + DynChild::Nested(n) => { + writeln!( + file, + "static {}: [Option<&dictgen::DictTrieNode<{}>>; 26] = [", + children_name, value_type, + )?; + for b in b'a'..=b'z' { + if let Some(child) = n.get(&b) { + let c = b as char; + let next_start = format!("{}{}", start, c); + writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?; + nodes.push((next_start, child)); + } else { + writeln!(file, " None,")?; + } + } + writeln!(file, "];")?; } - } - DynChild::Flat(v) if v.len() < limit => (), - DynChild::Flat(v) => { - let mut old_v = Vec::new(); - std::mem::swap(&mut old_v, v); - let mut nodes = std::collections::BTreeMap::new(); - for (key, value) in old_v { - assert!(!key.is_empty()); - let start = key[0].to_ascii_lowercase(); - assert!(start.is_ascii_alphabetic()); - let node = nodes.entry(start).or_insert_with(|| DynNode { - children: DynChild::Flat(Vec::new()), - value: None, + DynChild::Flat(v) => { + let table_input = v.iter().map(|(k, v)| { + let k = std::str::from_utf8(k).expect("this was originally a `str`"); + (k, v) }); - let remaining = &key[1..]; - if remaining.is_empty() { - assert!(node.value.is_none()); - node.value = Some(value); + crate::generate_table(file, &children_name, value_type, table_input)?; + } + } + writeln!(file)?; + writeln!(file)?; + } + + Ok(()) + } + + fn gen_node_name(prefix: &str, start: &str) -> String { + if start.is_empty() { + format!("{}_NODE", prefix) + } else { + let mut start = start.to_owned(); + start.make_ascii_uppercase(); + format!("{}_{}_NODE", prefix, start) + } + } + + fn gen_children_name(prefix: &str, start: &str) -> String { + if start.is_empty() { + format!("{}_CHILDREN", prefix) + } else { + let mut start = start.to_owned(); + start.make_ascii_uppercase(); + format!("{}_{}_CHILDREN", prefix, start) + } + } + + fn gen_type_name(leaf: &DynChild) -> &'static str { + match leaf { + DynChild::Nested(_) => "dictgen::DictTrieChild::Nested", + DynChild::Flat(_) => "dictgen::DictTrieChild::Flat", + } + } + + struct DynRoot<'s, V> { + root: DynNode<'s, V>, + unicode: Vec<(&'s str, V)>, + range: std::ops::RangeInclusive, + } + + impl<'s, V> DynRoot<'s, V> { + fn new(data: impl Iterator) -> Self { + let mut overflow = Vec::new(); + let mut unicode = Vec::default(); + let mut smallest = usize::MAX; + let mut largest = usize::MIN; + let mut existing = std::collections::HashSet::new(); + let mut empty = None; + for (key, value) in data { + if existing.contains(key) { + panic!("Duplicate present: {}", key); + } + existing.insert(key); + + if key.is_empty() { + empty = Some(value); + } else { + smallest = std::cmp::min(smallest, key.len()); + largest = std::cmp::max(largest, key.len()); + if key.bytes().all(|b| b.is_ascii_alphabetic()) { + overflow.push((key.as_bytes(), value)); } else { - match &mut node.children { - DynChild::Nested(_) => { - unreachable!("Only overflow at this point") - } - DynChild::Flat(ref mut v) => { - v.push((remaining, value)); + unicode.push((key, value)); + } + } + } + Self { + root: DynNode { + children: DynChild::Flat(overflow), + value: empty, + }, + unicode, + range: smallest..=largest, + } + } + + fn burst(&mut self, limit: usize) { + self.root.burst(limit); + } + } + + struct DynNode<'s, V> { + children: DynChild<'s, V>, + value: Option, + } + + impl<'s, V> DynNode<'s, V> { + fn burst(&mut self, limit: usize) { + self.children.burst(limit) + } + } + + enum DynChild<'s, V> { + Nested(std::collections::BTreeMap>), + Flat(Vec<(&'s [u8], V)>), + } + + impl<'s, V> DynChild<'s, V> { + fn burst(&mut self, limit: usize) { + match self { + DynChild::Nested(children) => { + for child in children.values_mut() { + child.burst(limit); + } + } + DynChild::Flat(v) if v.len() < limit => (), + DynChild::Flat(v) => { + let mut old_v = Vec::new(); + std::mem::swap(&mut old_v, v); + let mut nodes = std::collections::BTreeMap::new(); + for (key, value) in old_v { + assert!(!key.is_empty()); + let start = key[0].to_ascii_lowercase(); + assert!(start.is_ascii_alphabetic()); + let node = nodes.entry(start).or_insert_with(|| DynNode { + children: DynChild::Flat(Vec::new()), + value: None, + }); + let remaining = &key[1..]; + if remaining.is_empty() { + assert!(node.value.is_none()); + node.value = Some(value); + } else { + match &mut node.children { + DynChild::Nested(_) => { + unreachable!("Only overflow at this point") + } + DynChild::Flat(ref mut v) => { + v.push((remaining, value)); + } } } } + *self = DynChild::Nested(nodes); + self.burst(limit); } - *self = DynChild::Nested(nodes); - self.burst(limit); } } } diff --git a/crates/misspell-dict/codegen/Cargo.toml b/crates/misspell-dict/codegen/Cargo.toml index 90f0511..a60d29b 100644 --- a/crates/misspell-dict/codegen/Cargo.toml +++ b/crates/misspell-dict/codegen/Cargo.toml @@ -23,4 +23,4 @@ itertools = "0.10" codegenrs = "1.0" structopt = "0.3" regex = "1" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/typos-dict/codegen/Cargo.toml b/crates/typos-dict/codegen/Cargo.toml index 5e1c2bc..ed4a19e 100644 --- a/crates/typos-dict/codegen/Cargo.toml +++ b/crates/typos-dict/codegen/Cargo.toml @@ -23,4 +23,4 @@ itertools = "0.10" unicase = "2.5" codegenrs = "1.0" structopt = "0.3" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/typos-vars/codegen/Cargo.toml b/crates/typos-vars/codegen/Cargo.toml index 5194326..0f06d4a 100644 --- a/crates/typos-vars/codegen/Cargo.toml +++ b/crates/typos-vars/codegen/Cargo.toml @@ -29,4 +29,4 @@ log = "0.4" env_logger = "0.7" clap-verbosity-flag = "0.3" itertools = "0.10" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/wikipedia-dict/codegen/Cargo.toml b/crates/wikipedia-dict/codegen/Cargo.toml index 93fe957..d7713f0 100644 --- a/crates/wikipedia-dict/codegen/Cargo.toml +++ b/crates/wikipedia-dict/codegen/Cargo.toml @@ -22,4 +22,4 @@ unicase = "2.5" itertools = "0.10" codegenrs = "1.0" structopt = "0.3" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }