Merge pull request #1198 from epage/generic

perf(dict)!: Switch to PHF Map
This commit is contained in:
Ed Page 2024-12-31 06:56:23 -06:00 committed by GitHub
commit 44cf2f8cf6
Warning! Although a key with this ID exists in the database, it does not verify this commit. This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194
20 changed files with 909056 additions and 704967 deletions

1
Cargo.lock generated
View file

@ -1509,6 +1509,7 @@ dependencies = [
"dictgen",
"divan",
"edit-distance",
"heck",
"indexmap",
"itertools 0.13.0",
"phf",

File diff suppressed because it is too large Load diff

View file

@ -36,11 +36,19 @@ impl<'g> DictGen<'g> {
#[cfg(feature = "map")]
pub fn map(self) -> crate::MapGen<'g> {
crate::MapGen { gen: self }
crate::MapGen {
gen: self,
unicode: true,
unicase: true,
}
}
pub fn ordered_map(self) -> crate::OrderedMapGen<'g> {
crate::OrderedMapGen { gen: self }
crate::OrderedMapGen {
gen: self,
unicode: true,
unicase: true,
}
}
pub fn trie(self) -> crate::TrieGen<'g> {
@ -49,6 +57,10 @@ impl<'g> DictGen<'g> {
limit: 64,
}
}
pub fn r#match(self) -> crate::MatchGen<'g> {
crate::MatchGen { gen: self }
}
}
impl Default for DictGen<'static> {

View file

@ -51,6 +51,18 @@ impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
impl Eq for InsensitiveStr<'_> {}
// `PartialOrd` delegates to `Ord` so the two orderings can never disagree.
impl PartialOrd for InsensitiveStr<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
// Total order via `convert()`, i.e. the same case-folded view used by
// `PartialEq`/`Hash`, keeping the key usable in ordered containers.
impl Ord for InsensitiveStr<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.convert().cmp(&other.convert())
}
}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
@ -101,3 +113,97 @@ impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a
self
}
}
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
///
/// ASCII-only counterpart to `InsensitiveStr`: stores the raw `&str` and
/// builds the case-insensitive `unicase::Ascii` view on demand in each
/// comparison instead of at construction time.
#[derive(Copy, Clone)]
pub struct InsensitiveAscii<'s>(pub &'s str);
impl<'s> InsensitiveAscii<'s> {
/// View as `unicase::Ascii`; every comparison/hash below goes through this.
pub fn convert(self) -> unicase::Ascii<&'s str> {
unicase::Ascii::new(self.0)
}
/// Unwrap the raw, case-preserved string.
pub fn into_inner(self) -> &'s str {
self.0
}
pub fn is_empty(self) -> bool {
self.0.is_empty()
}
/// Byte length of the underlying string.
pub fn len(self) -> usize {
self.0.len()
}
}
impl<'s> From<unicase::Ascii<&'s str>> for InsensitiveAscii<'s> {
fn from(other: unicase::Ascii<&'s str>) -> Self {
Self(other.into_inner())
}
}
// Eq/Ord/Hash all route through `convert()` so they stay mutually
// consistent — required for use as a (phf) map key.
impl<'s2> PartialEq<InsensitiveAscii<'s2>> for InsensitiveAscii<'_> {
#[inline]
fn eq(&self, other: &InsensitiveAscii<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveAscii<'_> {}
impl PartialOrd for InsensitiveAscii<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for InsensitiveAscii<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.convert().cmp(&other.convert())
}
}
impl core::hash::Hash for InsensitiveAscii<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
// Debug/Display show the raw, case-preserved string; folding is only a
// comparison detail.
impl core::fmt::Debug for InsensitiveAscii<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveAscii<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
}
// PHF integration: hashing at lookup time (`PhfHash`) and emission of a
// reconstructing Rust expression at codegen time (`FmtConst`).
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveAscii<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveAscii<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.write_str("dictgen::InsensitiveAscii(")?;
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
// Identity borrow with a shortened lifetime, as `phf` lookup requires.
#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveAscii<'b>> for InsensitiveAscii<'a> {
fn borrow(&self) -> &InsensitiveAscii<'b> {
self
}
}

View file

@ -7,6 +7,8 @@ mod gen;
mod insensitive;
#[cfg(feature = "map")]
mod map;
#[cfg(feature = "codegen")]
mod r#match;
mod ordered_map;
mod trie;
@ -16,4 +18,6 @@ pub use insensitive::*;
#[cfg(feature = "map")]
pub use map::*;
pub use ordered_map::*;
#[cfg(feature = "codegen")]
pub use r#match::*;
pub use trie::*;

View file

@ -1,65 +1,120 @@
#[cfg(feature = "codegen")]
pub struct MapGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
pub(crate) unicase: bool,
pub(crate) unicode: bool,
}
#[cfg(feature = "codegen")]
impl MapGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
pub fn unicase(mut self, yes: bool) -> Self {
self.unicase = yes;
self
}
pub fn unicode(mut self, yes: bool) -> Self {
self.unicode = yes;
self
}
pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name;
let key_type = self.key_type();
let value_type = self.gen.value_type;
let mut smallest = usize::MAX;
let mut largest = usize::MIN;
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
for (key, _) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
builder.entry(key, value.as_str());
}
let builder = builder.build();
if largest == 0 {
smallest = 0;
}
writeln!(
file,
"pub static {name}: dictgen::Map<{value_type}> = dictgen::Map {{"
"pub static {name}: dictgen::Map<{key_type}, {value_type}> = dictgen::Map {{"
)?;
writeln!(file, " map: {builder},")?;
match (self.unicase, self.unicode) {
(true, true) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
let key = key.as_ref();
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key, value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
(true, false) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string()))
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key, value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
(false, _) => {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| (key, value.to_string()))
.collect::<Vec<_>>();
for (key, value) in data.iter() {
builder.entry(key.as_ref(), value.as_str());
}
let builder = builder.build();
writeln!(file, " map: {builder},")?;
}
}
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
Ok(())
}
fn key_type(&self) -> &'static str {
match (self.unicase, self.unicode) {
(true, true) => "dictgen::InsensitiveStr<'static>",
(true, false) => "dictgen::InsensitiveAscii<'static>",
(false, _) => "&'static str",
}
}
}
pub struct Map<V: 'static> {
pub map: phf::Map<crate::InsensitiveStr<'static>, V>,
pub struct Map<K: 'static, V: 'static> {
pub map: phf::Map<K, V>,
pub range: std::ops::RangeInclusive<usize>,
}
impl<V> Map<V> {
impl<V> Map<crate::InsensitiveStr<'_>, V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
@ -69,3 +124,25 @@ impl<V> Map<V> {
}
}
}
// ASCII case-insensitive lookup (tables built with `.unicase(true).unicode(false)`).
impl<V> Map<crate::InsensitiveAscii<'_>, V> {
/// Find `word` with ASCII case folding; the `range` length check is a
/// cheap pre-filter that skips hashing words that cannot be in the table.
#[inline]
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(&(*word).into())
} else {
None
}
}
}
// Case-sensitive lookup with plain `&str` keys (`.unicase(false)` tables).
impl<V> Map<&str, V> {
/// Exact lookup, with the same length pre-filter as the other variants.
#[inline]
pub fn find(&self, word: &'_ &str) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(word)
} else {
None
}
}
}

View file

@ -0,0 +1,37 @@
/// Code generator that emits a plain `match`-based lookup table — an
/// alternative to the PHF/ordered-map generators for benchmarking.
#[cfg(feature = "codegen")]
pub struct MatchGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
}
#[cfg(feature = "codegen")]
impl MatchGen<'_> {
/// Write `pub struct {name}` plus a `find` method matching on the word.
///
/// Keys are matched case-sensitively; entries are only sorted
/// case-insensitively so the generated file's ordering is stable.
pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name;
let value_type = self.gen.value_type;
writeln!(file, "pub struct {name};")?;
writeln!(file, "impl {name} {{")?;
writeln!(
file,
" pub fn find(&self, word: &&str) -> Option<&'static {value_type}> {{"
)?;
writeln!(file, " match *word {{")?;
for (key, value) in data.iter() {
let key = key.as_ref();
// NOTE(review): the hard-coded `.as_slice()` assumes the value
// expression is array-like (e.g. `&[..]` with a slice value_type);
// confirm this generator is never used with scalar value types.
writeln!(file, " {key:?} => Some(&{value}.as_slice()),")?;
}
writeln!(file, " _ => None,")?;
writeln!(file, " }}")?;
writeln!(file, " }}")?;
writeln!(file, "}}")?;
Ok(())
}
}

View file

@ -1,19 +1,32 @@
#[cfg(feature = "codegen")]
pub struct OrderedMapGen<'g> {
pub(crate) gen: crate::DictGen<'g>,
pub(crate) unicase: bool,
pub(crate) unicode: bool,
}
#[cfg(feature = "codegen")]
impl OrderedMapGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
pub fn unicase(mut self, yes: bool) -> Self {
self.unicase = yes;
self
}
pub fn unicode(mut self, yes: bool) -> Self {
self.unicode = yes;
self
}
pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name;
let key_type = self.key_type();
let value_type = self.gen.value_type;
let mut smallest = usize::MAX;
@ -21,18 +34,15 @@ impl OrderedMapGen<'_> {
writeln!(
file,
"pub static {name}: dictgen::OrderedMap<{value_type}> = dictgen::OrderedMap {{"
"pub static {name}: dictgen::OrderedMap<{key_type}, {value_type}> = dictgen::OrderedMap {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
let key = self.key_new(key);
writeln!(file, " {key},")?;
}
@ -50,15 +60,37 @@ impl OrderedMapGen<'_> {
Ok(())
}
/// Rust type of the generated key column, selected by the
/// `unicase`/`unicode` builder flags.
fn key_type(&self) -> &'static str {
match (self.unicase, self.unicode) {
(true, true) => "dictgen::InsensitiveStr<'static>",
(true, false) => "dictgen::InsensitiveAscii<'static>",
(false, _) => "&'static str",
}
}
/// Format one key as the constructor expression matching `key_type()`.
fn key_new(&self, key: &str) -> String {
match (self.unicase, self.unicode) {
(true, true) => {
// Prefer the cheaper ASCII variant whenever the key allows it.
if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
}
}
(true, false) => format!("dictgen::InsensitiveAscii({key:?})"),
// Case-sensitive tables use a bare string literal.
(false, _) => format!("{key:?}"),
}
}
}
pub struct OrderedMap<V: 'static> {
pub keys: &'static [crate::InsensitiveStr<'static>],
pub struct OrderedMap<K: 'static, V: 'static> {
pub keys: &'static [K],
pub values: &'static [V],
pub range: core::ops::RangeInclusive<usize>,
}
impl<V> OrderedMap<V> {
impl<V> OrderedMap<crate::InsensitiveStr<'_>, V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
@ -71,3 +103,28 @@ impl<V> OrderedMap<V> {
}
}
}
// ASCII case-insensitive lookup over the sorted key slice.
impl<V> OrderedMap<crate::InsensitiveAscii<'_>, V> {
/// Binary-search `word` among the case-folded keys; the `range` length
/// check cheaply rejects words that cannot be present.
#[inline]
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
self.keys
.binary_search_by_key(word, |key| key.convert())
.map(|i| &self.values[i])
.ok()
} else {
None
}
}
}
// Case-sensitive lookup with plain `&str` keys.
impl<V> OrderedMap<&str, V> {
#[inline]
pub fn find(&self, word: &'_ &str) -> Option<&'static V> {
if self.range.contains(&word.len()) {
self.keys.binary_search(word).map(|i| &self.values[i]).ok()
} else {
None
}
}
}

View file

@ -27,7 +27,7 @@ impl TrieGen<'_> {
pub struct Trie<V: 'static> {
pub root: &'static TrieNode<V>,
pub unicode: &'static crate::OrderedMap<V>,
pub unicode: &'static crate::OrderedMap<crate::InsensitiveStr<'static>, V>,
pub range: core::ops::RangeInclusive<usize>,
}
@ -75,7 +75,7 @@ impl<V> Trie<V> {
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
let remaining = unicase::Ascii::new(remaining);
return t.find(&remaining);
}
}
@ -91,7 +91,7 @@ pub struct TrieNode<V: 'static> {
pub enum TrieChild<V: 'static> {
Nested(&'static [Option<&'static TrieNode<V>>; 26]),
Flat(&'static crate::OrderedMap<V>),
Flat(&'static crate::OrderedMap<crate::InsensitiveAscii<'static>, V>),
}
#[cfg(feature = "codegen")]
@ -179,6 +179,7 @@ mod codegen {
.name(&children_name)
.value_type(value_type)
.ordered_map()
.unicode(false)
.write(file, table_input)?;
}
}

File diff suppressed because it is too large Load diff

View file

@ -16,7 +16,8 @@ all-features = true
rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[dependencies]
dictgen = { version = "^0.2", path = "../dictgen" }
phf = "0.11.2"
dictgen = { version = "^0.2", path = "../dictgen", features = ["map"] }
[dev-dependencies]
csv = "1.3"
@ -29,7 +30,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5"
indexmap = "2.2.6"
divan = "0.1.16"
phf = "0.11.2"
heck = "0.5.0"
[lints]
workspace = true

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,7 @@
#![allow(clippy::wildcard_imports)]
#![allow(dead_code)]
mod cased_map_codegen;
mod map_codegen;
mod ordered_map_codegen;
mod trie_codegen;
@ -9,6 +11,11 @@ mod miss {
const MISS: &str = "finalizes";
// Benchmark a miss through the case-sensitive match/map path.
#[divan::bench(args = [MISS])]
fn cased_map(word: &str) -> Option<&'static &[&str]> {
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
}
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
map_codegen::WORD.find(&word)
@ -30,6 +37,11 @@ mod hit {
const HIT: &str = "finallizes";
// Benchmark a hit through the case-sensitive match/map path.
#[divan::bench(args = [HIT])]
fn cased_map(word: &str) -> Option<&'static &[&str]> {
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
}
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
map_codegen::WORD.find(&word)

View file

@ -2,7 +2,7 @@
#![allow(clippy::unreadable_literal)]
#![allow(unreachable_pub)]
pub static WORD: dictgen::Map<&[&str]> = dictgen::Map {
pub static WORD: dictgen::Map<dictgen::InsensitiveStr<'static>, &[&str]> = dictgen::Map {
map: ::phf::Map {
key: 12913932095322966823,
disps: &[

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -20,6 +20,15 @@ fn codegen() {
snapbox::file!["../benches/benches/map_codegen.rs"].raw()
);
let mut cased_map_content = vec![];
generate_cased_map(&mut cased_map_content, "WORD", DICT);
let cased_map_content = String::from_utf8(cased_map_content).unwrap();
let cased_map_content = codegenrs::rustfmt(&cased_map_content, None).unwrap();
snapbox::assert_data_eq!(
&cased_map_content,
snapbox::file!["../benches/benches/cased_map_codegen.rs"].raw()
);
let mut ordered_map_content = vec![];
generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
@ -29,10 +38,7 @@ fn codegen() {
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
);
snapbox::assert_data_eq!(
&trie_content,
snapbox::file!["../src/word_codegen.rs"].raw()
);
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
}
fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
@ -72,6 +78,111 @@ fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
.unwrap();
}
/// Generate the benchmark fixture of case-*sensitive* tables: one PHF map
/// per ASCII case variant of each key, plus a case-insensitive ordered map
/// for the non-ASCII remainder.
fn generate_cased_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!(
file,
"// This file is @generated by {}",
file!().replace('\\', "/")
)
.unwrap();
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
writeln!(file).unwrap();
// Parse the CSV dictionary: first field is the typo, remaining fields
// are the corrections.
let records: Vec<_> = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(dict)
.records()
.map(|r| r.unwrap())
.collect();
// ASCII keys as-is (assumed already lower-case in the dictionary —
// TODO confirm), emitted as a case-sensitive map.
dictgen::DictGen::new()
.name(&format!("{name}_ASCII_LOWER"))
.value_type("&[&str]")
.map()
.unicase(false)
.write(
file,
records
.iter()
.filter(|r| r.iter().next().unwrap().is_ascii())
.map(|record| {
let mut record_fields = record.iter();
let key = record_fields.next().unwrap();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
// Same entries with keys upper-cased via heck's SHOUTY_SNAKE_CASE.
dictgen::DictGen::new()
.name(&format!("{name}_ASCII_UPPER"))
.value_type("&[&str]")
.map()
.unicase(false)
.write(
file,
records
.iter()
.filter(|r| r.iter().next().unwrap().is_ascii())
.map(|record| {
use heck::ToShoutySnakeCase;
let mut record_fields = record.iter();
let key = record_fields.next().unwrap().to_shouty_snake_case();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
// Same entries with keys Title-cased.
dictgen::DictGen::new()
.name(&format!("{name}_ASCII_TITLE"))
.value_type("&[&str]")
.map()
.unicase(false)
.write(
file,
records
.iter()
.filter(|r| r.iter().next().unwrap().is_ascii())
.map(|record| {
use heck::ToTitleCase;
let mut record_fields = record.iter();
let key = record_fields.next().unwrap().to_title_case();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
// Non-ASCII entries fall back to a case-insensitive ordered map
// (builder defaults: unicase + unicode).
dictgen::DictGen::new()
.name(&format!("{name}_UNICODE"))
.value_type("&[&str]")
.ordered_map()
.write(
file,
records
.iter()
.filter(|r| !r.iter().next().unwrap().is_ascii())
.map(|record| {
let mut record_fields = record.iter();
let key = record_fields.next().unwrap();
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
);
(key, value)
}),
)
.unwrap();
}
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!(
file,

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff