Merge pull request #1194 from epage/perf

fix(dictgen)!: Generate phf for map feature
This commit is contained in:
Ed Page 2024-12-30 14:04:15 -06:00 committed by GitHub
commit b87cd87116
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194
9 changed files with 180646 additions and 138185 deletions

12
Cargo.lock generated
View file

@ -450,6 +450,7 @@ name = "dictgen"
version = "0.2.11" version = "0.2.11"
dependencies = [ dependencies = [
"phf", "phf",
"phf_codegen",
"phf_shared", "phf_shared",
"unicase", "unicase",
] ]
@ -884,6 +885,16 @@ dependencies = [
"phf_shared", "phf_shared",
] ]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]] [[package]]
name = "phf_generator" name = "phf_generator"
version = "0.11.2" version = "0.11.2"
@ -1500,6 +1511,7 @@ dependencies = [
"edit-distance", "edit-distance",
"indexmap", "indexmap",
"itertools 0.13.0", "itertools 0.13.0",
"phf",
"snapbox", "snapbox",
"unicase", "unicase",
"varcon", "varcon",

View file

@ -17,13 +17,14 @@ rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[features] [features]
default = ["std"] default = ["std"]
std = [] std = []
codegen = ["std"] codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"] map = ["dep:phf", "dep:phf_shared"]
[dependencies] [dependencies]
unicase = "2.7" unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true } phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true } phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true }
[lints] [lints]
workspace = true workspace = true

View file

@ -0,0 +1,103 @@
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
///
/// The ASCII/Unicode split is chosen at construction time (see the
/// `From<unicase::UniCase<&str>>` impl) so that lookups can rebuild the
/// matching `UniCase` via [`InsensitiveStr::convert`] without re-scanning.
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
    /// String compared with Unicode case folding (used when not pure ASCII).
    Unicode(&'s str),
    /// Pure-ASCII string, compared with ASCII-only case folding.
    Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
    /// Rebuild the matching [`unicase::UniCase`] wrapper for this string.
    pub fn convert(self) -> unicase::UniCase<&'s str> {
        match self {
            InsensitiveStr::Ascii(inner) => unicase::UniCase::ascii(inner),
            InsensitiveStr::Unicode(inner) => unicase::UniCase::unicode(inner),
        }
    }

    /// Unwrap to the underlying string slice, discarding the case-folding tag.
    pub fn into_inner(self) -> &'s str {
        match self {
            InsensitiveStr::Ascii(inner) | InsensitiveStr::Unicode(inner) => inner,
        }
    }

    /// `true` when the wrapped string has zero length.
    pub fn is_empty(self) -> bool {
        self.into_inner().is_empty()
    }

    /// Byte length of the wrapped string.
    pub fn len(self) -> usize {
        self.into_inner().len()
    }
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
    fn from(other: unicase::UniCase<&'s str>) -> Self {
        // Pick the variant constructor first, then apply it to the inner slice.
        let wrap = if other.is_ascii() {
            InsensitiveStr::Ascii
        } else {
            InsensitiveStr::Unicode
        };
        wrap(other.into_inner())
    }
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
    #[inline]
    fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
        // Case-insensitive equality, delegated to `UniCase`'s comparison.
        self.convert().eq(&other.convert())
    }
}

// Equality via `UniCase` is reflexive, so `Eq` holds.
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        // Hash the case-folded `UniCase` form so that `Hash` agrees with `Eq`.
        let folded = self.convert();
        core::hash::Hash::hash(&folded, hasher);
    }
}
impl core::fmt::Debug for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Both variants debug-print as the bare inner string; forwarding via
        // `Debug::fmt` keeps any formatter flags (width, alignment) intact.
        let inner = self.into_inner();
        core::fmt::Debug::fmt(inner, fmt)
    }
}
impl core::fmt::Display for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Display as the bare inner string; forwarding via `Display::fmt`
        // keeps any formatter flags (width, alignment) intact.
        let inner = self.into_inner();
        core::fmt::Display::fmt(inner, fmt)
    }
}
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveStr<'_> {
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        // Reuse the ordinary `Hash` impl so phf lookups hash the same bytes.
        <Self as core::hash::Hash>::hash(self, state);
    }
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveStr<'_> {
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Emit a Rust expression that reconstructs `self` in generated code,
        // e.g. `dictgen::InsensitiveStr::Ascii("word")`.
        let constructor = match self {
            InsensitiveStr::Ascii(_) => "dictgen::InsensitiveStr::Ascii(",
            InsensitiveStr::Unicode(_) => "dictgen::InsensitiveStr::Unicode(",
        };
        f.write_str(constructor)?;
        self.into_inner().fmt_const(f)?;
        f.write_str(")")
    }
}
#[cfg(feature = "map")]
// Lets phf look up an `InsensitiveStr<'b>` key in a map whose stored keys are
// `InsensitiveStr<'a>` with the longer lifetime `'a: 'b`.
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a> {
    fn borrow(&self) -> &InsensitiveStr<'b> {
        // `'s` is covariant, so `&InsensitiveStr<'a>` coerces to `&InsensitiveStr<'b>`.
        self
    }
}

View file

@ -4,6 +4,7 @@
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
mod gen; mod gen;
mod insensitive;
#[cfg(feature = "map")] #[cfg(feature = "map")]
mod map; mod map;
mod table; mod table;
@ -11,6 +12,7 @@ mod trie;
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub use gen::*; pub use gen::*;
pub use insensitive::*;
#[cfg(feature = "map")] #[cfg(feature = "map")]
pub use map::*; pub use map::*;
pub use table::*; pub use table::*;

View file

@ -18,33 +18,35 @@ impl DictMapGen<'_> {
let mut smallest = usize::MAX; let mut smallest = usize::MAX;
let mut largest = usize::MIN; let mut largest = usize::MIN;
let mut builder = phf_codegen::Map::new();
writeln!( let data = data
file, .iter()
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{" .map(|(key, value)| {
)?; (
writeln!(file, " keys: &[")?; if key.is_ascii() {
for (key, _value) in data.iter() { crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
smallest = std::cmp::min(smallest, key.len()); smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len()); largest = std::cmp::max(largest, key.len());
builder.entry(key, value.as_str());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?;
} }
let builder = builder.build();
if largest == 0 { if largest == 0 {
smallest = 0; smallest = 0;
} }
writeln!(file, " ],")?;
writeln!(file, " values: &[")?; writeln!(
for (_key, value) in data.iter() { file,
writeln!(file, " {value},")?; "pub static {name}: dictgen::DictMap<{value_type}> = dictgen::DictMap {{"
} )?;
writeln!(file, " ],")?; writeln!(file, " map: {builder},")?;
writeln!(file, " range: {smallest}..={largest},")?; writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?; writeln!(file, "}};")?;
@ -58,6 +60,7 @@ pub struct DictMap<V: 'static> {
} }
impl<V> DictMap<V> { impl<V> DictMap<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) { if self.range.contains(&word.len()) {
self.map.get(&(*word).into()) self.map.get(&(*word).into())
@ -65,35 +68,4 @@ impl<V> DictMap<V> {
None None
} }
} }
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&str>, &V)> + '_ {
self.map.entries().map(|(k, v)| (k.convert(), v))
}
}
impl phf_shared::PhfHash for crate::InsensitiveStr<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
impl phf_shared::FmtConst for crate::InsensitiveStr<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
crate::InsensitiveStr::Unicode(_) => {
f.write_str("dictgen::InsensitiveStr::Unicode(")?;
}
}
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
impl<'b, 'a: 'b> phf_shared::PhfBorrow<crate::InsensitiveStr<'b>> for crate::InsensitiveStr<'a> {
fn borrow(&self) -> &crate::InsensitiveStr<'b> {
self
}
} }

View file

@ -53,12 +53,13 @@ impl DictTableGen<'_> {
} }
pub struct DictTable<V: 'static> { pub struct DictTable<V: 'static> {
pub keys: &'static [InsensitiveStr<'static>], pub keys: &'static [crate::InsensitiveStr<'static>],
pub values: &'static [V], pub values: &'static [V],
pub range: core::ops::RangeInclusive<usize>, pub range: core::ops::RangeInclusive<usize>,
} }
impl<V> DictTable<V> { impl<V> DictTable<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) { if self.range.contains(&word.len()) {
self.keys self.keys
@ -69,70 +70,4 @@ impl<V> DictTable<V> {
None None
} }
} }
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> + '_ {
(0..self.keys.len()).map(move |i| (self.keys[i].convert(), &self.values[i]))
}
}
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
fn from(other: unicase::UniCase<&'s str>) -> Self {
if other.is_ascii() {
InsensitiveStr::Ascii(other.into_inner())
} else {
InsensitiveStr::Unicode(other.into_inner())
}
}
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
#[inline]
fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
impl core::fmt::Debug for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
} }

View file

@ -32,48 +32,50 @@ pub struct DictTrie<V: 'static> {
} }
impl<V> DictTrie<V> { impl<V> DictTrie<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) { if word.is_ascii() {
let bytes = word.as_bytes(); if self.range.contains(&word.len()) {
self.find_ascii(word.as_bytes())
} else {
None
}
} else {
self.unicode.find(word)
}
}
let mut child = &self.root; fn find_ascii(&self, word: &[u8]) -> Option<&'static V> {
for i in 0..bytes.len() { let mut child = &self.root;
match child.children { for i in 0..word.len() {
DictTrieChild::Nested(n) => { match child.children {
let byte = bytes[i]; DictTrieChild::Nested(n) => {
let index = if byte.is_ascii_lowercase() { let byte = word[i];
byte - b'a' let index = if byte.is_ascii_lowercase() {
} else if byte.is_ascii_uppercase() { byte - b'a'
byte - b'A' } else if byte.is_ascii_uppercase() {
} else { byte - b'A'
return self.unicode.find(word); } else {
}; return None;
debug_assert!(index < 26); };
if let Some(next) = n[index as usize].as_ref() { debug_assert!(index < 26);
child = next; if let Some(next) = n[index as usize].as_ref() {
} else { child = next;
return None; } else {
} return None;
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
} }
} }
DictTrieChild::Flat(t) => {
let remaining = &word[i..word.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
return t.find(&remaining);
}
} }
child.value.as_ref()
} else {
None
} }
child.value.as_ref()
} }
} }

View file

@ -29,6 +29,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5" snapbox = "0.6.5"
indexmap = "2.2.6" indexmap = "2.2.6"
divan = "0.1.16" divan = "0.1.16"
phf = "0.11.2"
[lints] [lints]
workspace = true workspace = true

File diff suppressed because it is too large Load diff