Merge pull request #1194 from epage/perf

fix(dictgen)!: Generate phf for map feature
Ed Page 2024-12-30 14:04:15 -06:00 committed by GitHub
commit b87cd87116
9 changed files with 180646 additions and 138185 deletions

Cargo.lock (generated)

@@ -450,6 +450,7 @@ name = "dictgen"
version = "0.2.11"
dependencies = [
"phf",
"phf_codegen",
"phf_shared",
"unicase",
]
@@ -884,6 +885,16 @@ dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
@@ -1500,6 +1511,7 @@ dependencies = [
"edit-distance",
"indexmap",
"itertools 0.13.0",
"phf",
"snapbox",
"unicase",
"varcon",


@@ -17,13 +17,14 @@ rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[features]
default = ["std"]
std = []
codegen = ["std"]
codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"]
[dependencies]
unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true }
[lints]
workspace = true


@@ -0,0 +1,103 @@
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
pub fn is_empty(self) -> bool {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.is_empty(),
}
}
pub fn len(self) -> usize {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.len(),
}
}
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
fn from(other: unicase::UniCase<&'s str>) -> Self {
if other.is_ascii() {
InsensitiveStr::Ascii(other.into_inner())
} else {
InsensitiveStr::Unicode(other.into_inner())
}
}
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
#[inline]
fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
impl core::fmt::Debug for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
}
#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveStr<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveStr<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
InsensitiveStr::Unicode(_) => {
f.write_str("dictgen::InsensitiveStr::Unicode(")?;
}
}
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a> {
fn borrow(&self) -> &InsensitiveStr<'b> {
self
}
}
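A minimal usage sketch of the new InsensitiveStr type, assuming only the dictgen and unicase crates are in scope; the sample strings are made up for illustration:

fn main() {
    let a = dictgen::InsensitiveStr::Ascii("Apple");
    let b = dictgen::InsensitiveStr::Ascii("aPPLE");
    // Equality and hashing defer to unicase::UniCase, so case is ignored.
    assert_eq!(a, b);
    assert_eq!(a.len(), 5);
    assert!(!a.is_empty());

    // Non-ASCII input lands in the Unicode variant via the From impl.
    let c = dictgen::InsensitiveStr::from(unicase::UniCase::new("Straße"));
    assert!(matches!(c, dictgen::InsensitiveStr::Unicode(_)));
    assert_eq!(c.into_inner(), "Straße");
}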


@@ -4,6 +4,7 @@
#[cfg(feature = "codegen")]
mod gen;
mod insensitive;
#[cfg(feature = "map")]
mod map;
mod table;
@@ -11,6 +12,7 @@ mod trie;
#[cfg(feature = "codegen")]
pub use gen::*;
pub use insensitive::*;
#[cfg(feature = "map")]
pub use map::*;
pub use table::*;
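For downstream code, the feature-gated DictMap re-export offers the same find-style lookup as DictTable; a minimal sketch, assuming dictgen is built with its map feature (the lookup helper name is illustrative):

fn lookup(map: &dictgen::DictMap<&'static str>, word: &str) -> Option<&'static str> {
    // find rejects words whose length falls outside the generated range
    // before consulting the phf map.
    map.find(&unicase::UniCase::new(word)).copied()
}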


@@ -18,33 +18,35 @@ impl DictMapGen<'_> {
let mut smallest = usize::MAX;
let mut largest = usize::MIN;
writeln!(
file,
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
let mut builder = phf_codegen::Map::new();
let data = data
.iter()
.map(|(key, value)| {
(
if key.is_ascii() {
crate::InsensitiveStr::Ascii(key)
} else {
crate::InsensitiveStr::Unicode(key)
},
value.to_string(),
)
})
.collect::<Vec<_>>();
for (key, value) in data.iter() {
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?;
builder.entry(key, value.as_str());
}
let builder = builder.build();
if largest == 0 {
smallest = 0;
}
writeln!(file, " ],")?;
writeln!(file, " values: &[")?;
for (_key, value) in data.iter() {
writeln!(file, " {value},")?;
}
writeln!(file, " ],")?;
writeln!(
file,
"pub static {name}: dictgen::DictMap<{value_type}> = dictgen::DictMap {{"
)?;
writeln!(file, " map: {builder},")?;
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
@@ -58,6 +60,7 @@ pub struct DictMap<V: 'static> {
}
impl<V> DictMap<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
if self.range.contains(&word.len()) {
self.map.get(&(*word).into())
@@ -65,35 +68,4 @@ impl<V> DictMap<V> {
None
}
}
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&str>, &V)> + '_ {
self.map.entries().map(|(k, v)| (k.convert(), v))
}
}
impl phf_shared::PhfHash for crate::InsensitiveStr<'_> {
#[inline]
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
core::hash::Hash::hash(self, state);
}
}
impl phf_shared::FmtConst for crate::InsensitiveStr<'_> {
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
crate::InsensitiveStr::Unicode(_) => {
f.write_str("dictgen::InsensitiveStr::Unicode(")?;
}
}
self.into_inner().fmt_const(f)?;
f.write_str(")")
}
}
impl<'b, 'a: 'b> phf_shared::PhfBorrow<crate::InsensitiveStr<'b>> for crate::InsensitiveStr<'a> {
fn borrow(&self) -> &crate::InsensitiveStr<'b> {
self
}
}
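To make the new generated shape concrete, here is a standalone sketch that mirrors the generator above; it assumes dictgen is built with both the codegen and map features, and the WORDS name plus the two entries are made up:

use std::fmt::Write as _;

fn generate(buf: &mut String) -> std::fmt::Result {
    // Keys are the case-insensitive wrappers; values are emitted verbatim as code.
    let mut builder = phf_codegen::Map::new();
    builder.entry(dictgen::InsensitiveStr::Ascii("teh"), "\"the\"");
    builder.entry(dictgen::InsensitiveStr::Ascii("adn"), "\"and\"");
    writeln!(
        buf,
        "pub static WORDS: dictgen::DictMap<&'static str> = dictgen::DictMap {{"
    )?;
    writeln!(buf, "    map: {},", builder.build())?;
    writeln!(buf, "    range: 3..=3,")?;
    writeln!(buf, "}};")
}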


@@ -53,12 +53,13 @@ impl DictTableGen<'_> {
}
pub struct DictTable<V: 'static> {
pub keys: &'static [InsensitiveStr<'static>],
pub keys: &'static [crate::InsensitiveStr<'static>],
pub values: &'static [V],
pub range: core::ops::RangeInclusive<usize>,
}
impl<V> DictTable<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
self.keys
@@ -69,70 +70,4 @@ impl<V> DictTable<V> {
None
}
}
pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> + '_ {
(0..self.keys.len()).map(move |i| (self.keys[i].convert(), &self.values[i]))
}
}
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
Unicode(&'s str),
Ascii(&'s str),
}
impl<'s> InsensitiveStr<'s> {
pub fn convert(self) -> unicase::UniCase<&'s str> {
match self {
InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
pub fn into_inner(self) -> &'s str {
match self {
InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
}
}
}
impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
fn from(other: unicase::UniCase<&'s str>) -> Self {
if other.is_ascii() {
InsensitiveStr::Ascii(other.into_inner())
} else {
InsensitiveStr::Unicode(other.into_inner())
}
}
}
impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
#[inline]
fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
self.convert() == other.convert()
}
}
impl Eq for InsensitiveStr<'_> {}
impl core::hash::Hash for InsensitiveStr<'_> {
#[inline]
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
self.convert().hash(hasher);
}
}
impl core::fmt::Debug for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self.into_inner(), fmt)
}
}
impl core::fmt::Display for InsensitiveStr<'_> {
#[inline]
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Display::fmt(self.into_inner(), fmt)
}
}
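For comparison, the table form can still be written out by hand since its fields are public; a small sketch where the SMALL static and its entries are invented, and the keys are kept in the sorted, case-insensitive order the table lookup expects:

static SMALL: dictgen::DictTable<&'static str> = dictgen::DictTable {
    keys: &[
        dictgen::InsensitiveStr::Ascii("adn"),
        dictgen::InsensitiveStr::Ascii("teh"),
    ],
    values: &["and", "the"],
    range: 3..=3,
};

fn main() {
    // Lookups are case-insensitive and filtered by the length range first.
    let word = unicase::UniCase::new("Teh");
    assert_eq!(SMALL.find(&word), Some(&"the"));
}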


@@ -32,48 +32,50 @@ pub struct DictTrie<V: 'static> {
}
impl<V> DictTrie<V> {
#[inline]
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
let bytes = word.as_bytes();
if word.is_ascii() {
if self.range.contains(&word.len()) {
self.find_ascii(word.as_bytes())
} else {
None
}
} else {
self.unicode.find(word)
}
}
let mut child = &self.root;
for i in 0..bytes.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = bytes[i];
let index = if byte.is_ascii_lowercase() {
byte - b'a'
} else if byte.is_ascii_uppercase() {
byte - b'A'
} else {
return self.unicode.find(word);
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
fn find_ascii(&self, word: &[u8]) -> Option<&'static V> {
let mut child = &self.root;
for i in 0..word.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = word[i];
let index = if byte.is_ascii_lowercase() {
byte - b'a'
} else if byte.is_ascii_uppercase() {
byte - b'A'
} else {
return None;
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &word[i..word.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
let remaining = unicase::UniCase::ascii(remaining);
return t.find(&remaining);
}
}
child.value.as_ref()
} else {
None
}
child.value.as_ref()
}
}
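The find_ascii helper indexes nested trie nodes by folding ASCII letters into 26 buckets; a standalone sketch of just that mapping (the letter_index name is made up):

fn letter_index(byte: u8) -> Option<usize> {
    // ASCII letters fold case-insensitively to 0..=25; any other byte ends
    // the ASCII lookup with no match.
    if byte.is_ascii_lowercase() {
        Some((byte - b'a') as usize)
    } else if byte.is_ascii_uppercase() {
        Some((byte - b'A') as usize)
    } else {
        None
    }
}

fn main() {
    assert_eq!(letter_index(b'a'), Some(0));
    assert_eq!(letter_index(b'Z'), Some(25));
    assert_eq!(letter_index(b'-'), None);
}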


@@ -29,6 +29,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5"
indexmap = "2.2.6"
divan = "0.1.16"
phf = "0.11.2"
[lints]
workspace = true

File diff suppressed because it is too large.