refactor(dict): Separate dictgen concerns

2024-11-28 12:01:06 -05:00 · 2021-07-01 08:52:36 -05:00 · 2021-07-01 08:52:36 -05:00 · 3b43272724
commit 3b43272724
parent 97015b3a95
8 changed files with 233 additions and 208 deletions
--- a/crates/codespell-dict/codegen/Cargo.toml
+++ b/crates/codespell-dict/codegen/Cargo.toml
@ -22,4 +22,4 @@ unicase = "2.5"
 itertools = "0.10"
 codegenrs = "1.0"
 structopt = "0.3"
-dictgen = { version = "0.1", path = "../../dictgen" }
+dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
--- a/crates/dictgen/Cargo.toml
+++ b/crates/dictgen/Cargo.toml
@ -1,7 +1,17 @@
 [package]
 name = "dictgen"
 version = "0.1.0"
 description = "Compile-time case-insensitive map"
 repository = "https://github.com/crate-ci/typos"
 categories = ["development-tools", "text-processing"]
 keywords = ["development", "spelling", "no_std"]
 license = "MIT"
 edition = "2018"
 [features]
 default = ["std", "codegen"]
 std = []
 codegen = ["std"]
 [dependencies]
 unicase = "2.5"
--- a/crates/dictgen/src/table.rs
+++ b/crates/dictgen/src/table.rs
@ -1,3 +1,4 @@
 #[cfg(feature = "codegen")]
 pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
    file: &mut W,
    name: &str,
@ -46,7 +47,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
 /// Lookup table backed by two `'static` slices: one of keys and one of values.
 ///
 /// NOTE(review): `keys` and `values` are presumably parallel (same length, same order) —
 /// confirm against `DictTable::find`, which is not visible in this hunk.
 pub struct DictTable<V: 'static> {
    /// Keys of the table, stored as `InsensitiveStr`.
    pub keys: &'static [InsensitiveStr],
    /// Values of the table.
    pub values: &'static [V],
    // NOTE(review): presumably the inclusive range of key lengths present in the table — confirm.
-    pub range: std::ops::RangeInclusive<usize>,
+    pub range: core::ops::RangeInclusive<usize>,
 }
 impl<V> DictTable<V> {
--- a/crates/dictgen/src/trie.rs
+++ b/crates/dictgen/src/trie.rs
@ -1,12 +1,87 @@
 /// Public entry point for trie generation; only compiled with the `codegen` feature.
 ///
 /// Forwards `file`, `prefix`, `value_type`, `data` and `limit` unchanged to the private
 /// `codegen::generate_trie` and returns its result as-is.
 ///
 /// # Panics
 ///
 /// - On duplicate entry
 #[cfg(feature = "codegen")]
 pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
    file: &mut W,
    prefix: &str,
    value_type: &str,
    data: impl Iterator<Item = (&'d str, V)>,
    limit: usize,
 ) -> Result<(), std::io::Error> {
    // All of the work lives in the feature-gated `codegen` submodule.
    codegen::generate_trie(file, prefix, value_type, data, limit)
 }
 /// Static trie over ASCII letters with a side table for every other word.
 ///
 /// See `DictTrie::find` for how the three fields cooperate during a lookup.
 pub struct DictTrie<V: 'static> {
    /// Node the lookup starts from.
    pub root: &'static DictTrieNode<V>,
    /// Table consulted with the whole word as soon as a byte that is not an ASCII letter is
    /// met while walking nested nodes.
    pub unicode: &'static crate::DictTable<V>,
    /// Word lengths (in bytes) worth looking up; `find` returns `None` for any other length.
    pub range: core::ops::RangeInclusive<usize>,
 }
 impl<V> DictTrie<V> {
    /// Looks up `word` and returns the value stored for it, if any.
    ///
    /// The lookup proceeds as follows:
    ///
    /// 1. If the byte length of `word` is outside `self.range`, return `None`.
    /// 2. Walk the trie from `self.root`, one byte at a time. In a `Nested` node the byte must
    ///    be an ASCII letter; upper- and lower-case letters select the same child, so the walk
    ///    is ASCII case-insensitive. A missing child yields `None`.
    /// 3. Any byte that is not an ASCII letter abandons the walk and looks the *whole* word up
    ///    in `self.unicode`.
    /// 4. On reaching a `Flat` node, the not-yet-consumed suffix is looked up in that node's
    ///    table.
    /// 5. If every byte was consumed by nested nodes, the value of the final node is returned.
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        if !self.range.contains(&word.len()) {
            return None;
        }

        let bytes = word.as_bytes();
        let mut child = &self.root;
        for (i, &byte) in bytes.iter().enumerate() {
            match child.children {
                DictTrieChild::Nested(n) => {
                    // The ranges must be inclusive: a half-open `b'a'..b'z'` would treat
                    // `z`/`Z` as non-letters and leave slot 25 of the 26-entry array
                    // unreachable.
                    let index = match byte {
                        b'a'..=b'z' => byte - b'a',
                        b'A'..=b'Z' => byte - b'A',
                        _ => return self.unicode.find(word),
                    };
                    debug_assert!(index < 26);
                    if let Some(next) = n[index as usize].as_ref() {
                        child = next;
                    } else {
                        return None;
                    }
                }
                DictTrieChild::Flat(t) => {
                    let remaining = &bytes[i..];
                    // SAFETY: every byte before `i` was accepted by the `Nested` arm as an
                    // ASCII letter (anything else returns early), so `i` is a char boundary
                    // of the valid UTF-8 `word` and the suffix is itself valid UTF-8.
                    let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
                    // Reuse the prior ascii check, rather than doing it again
                    let remaining = if word.is_ascii() {
                        unicase::UniCase::ascii(remaining)
                    } else {
                        unicase::UniCase::unicode(remaining)
                    };
                    return t.find(&remaining);
                }
            }
        }
        // Every byte was consumed by nested nodes: the answer is whatever this node holds.
        child.value.as_ref()
    }
 }
 /// One node of a `DictTrie`.
 pub struct DictTrieNode<V: 'static> {
    /// How the lookup continues below this node (per-letter children or a flat table).
    pub children: DictTrieChild<V>,
    /// Value returned by `DictTrie::find` when the word ends exactly at this node.
    pub value: Option<V>,
 }
 /// The two ways a `DictTrieNode` can continue a lookup.
 pub enum DictTrieChild<V: 'static> {
    /// One optional child per ASCII letter; `DictTrie::find` indexes it with the letter's
    /// offset from `b'a'` (or `b'A'`), and an empty slot means the word is absent.
    Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
    /// Table in which `DictTrie::find` looks up the remaining, unconsumed suffix of the word.
    Flat(&'static crate::DictTable<V>),
 }
 #[cfg(feature = "codegen")]
 mod codegen {
    pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
        file: &mut W,
        prefix: &str,
        value_type: &str,
        data: impl Iterator<Item = (&'d str, V)>,
        limit: usize,
    ) -> Result<(), std::io::Error> {
        let mut root = DynRoot::new(data);
        root.burst(limit);
@ -121,68 +196,6 @@ fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
        }
    }
 /// Static trie over ASCII letters with a side table for every other word.
 ///
 /// See `DictTrie::find` for how the three fields cooperate during a lookup.
 pub struct DictTrie<V: 'static> {
    /// Node the lookup starts from.
    pub root: &'static DictTrieNode<V>,
    /// Table consulted with the whole word as soon as a byte that is not an ASCII letter is
    /// met while walking nested nodes.
    pub unicode: &'static crate::DictTable<V>,
    /// Word lengths (in bytes) worth looking up; `find` returns `None` for any other length.
    pub range: std::ops::RangeInclusive<usize>,
 }
 impl<V> DictTrie<V> {
    /// Looks up `word` and returns the value stored for it, if any.
    ///
    /// The lookup proceeds as follows:
    ///
    /// 1. If the byte length of `word` is outside `self.range`, return `None`.
    /// 2. Walk the trie from `self.root`, one byte at a time. In a `Nested` node the byte must
    ///    be an ASCII letter; upper- and lower-case letters select the same child, so the walk
    ///    is ASCII case-insensitive. A missing child yields `None`.
    /// 3. Any byte that is not an ASCII letter abandons the walk and looks the *whole* word up
    ///    in `self.unicode`.
    /// 4. On reaching a `Flat` node, the not-yet-consumed suffix is looked up in that node's
    ///    table.
    /// 5. If every byte was consumed by nested nodes, the value of the final node is returned.
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        if !self.range.contains(&word.len()) {
            return None;
        }

        let bytes = word.as_bytes();
        let mut child = &self.root;
        for (i, &byte) in bytes.iter().enumerate() {
            match child.children {
                DictTrieChild::Nested(n) => {
                    // The ranges must be inclusive: a half-open `b'a'..b'z'` would treat
                    // `z`/`Z` as non-letters and leave slot 25 of the 26-entry array
                    // unreachable.
                    let index = match byte {
                        b'a'..=b'z' => byte - b'a',
                        b'A'..=b'Z' => byte - b'A',
                        _ => return self.unicode.find(word),
                    };
                    debug_assert!(index < 26);
                    if let Some(next) = n[index as usize].as_ref() {
                        child = next;
                    } else {
                        return None;
                    }
                }
                DictTrieChild::Flat(t) => {
                    let remaining = &bytes[i..];
                    // SAFETY: every byte before `i` was accepted by the `Nested` arm as an
                    // ASCII letter (anything else returns early), so `i` is a char boundary
                    // of the valid UTF-8 `word` and the suffix is itself valid UTF-8.
                    let remaining = unsafe { std::str::from_utf8_unchecked(remaining) };
                    // Reuse the prior ascii check, rather than doing it again
                    let remaining = if word.is_ascii() {
                        unicase::UniCase::ascii(remaining)
                    } else {
                        unicase::UniCase::unicode(remaining)
                    };
                    return t.find(&remaining);
                }
            }
        }
        // Every byte was consumed by nested nodes: the answer is whatever this node holds.
        child.value.as_ref()
    }
 }
 /// One node of a `DictTrie`.
 pub struct DictTrieNode<V: 'static> {
    /// How the lookup continues below this node (per-letter children or a flat table).
    pub children: DictTrieChild<V>,
    /// Value returned by `DictTrie::find` when the word ends exactly at this node.
    pub value: Option<V>,
 }
 /// The two ways a `DictTrieNode` can continue a lookup.
 pub enum DictTrieChild<V: 'static> {
    /// One optional child per ASCII letter; `DictTrie::find` indexes it with the letter's
    /// offset from `b'a'` (or `b'A'`), and an empty slot means the word is absent.
    Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
    /// Table in which `DictTrie::find` looks up the remaining, unconsumed suffix of the word.
    Flat(&'static crate::DictTable<V>),
 }
    struct DynRoot<'s, V> {
        root: DynNode<'s, V>,
        unicode: Vec<(&'s str, V)>,
@ -288,3 +301,4 @@ impl<'s, V> DynChild<'s, V> {
            }
        }
    }
 }
--- a/crates/misspell-dict/codegen/Cargo.toml
+++ b/crates/misspell-dict/codegen/Cargo.toml
@ -23,4 +23,4 @@ itertools = "0.10"
 codegenrs = "1.0"
 structopt = "0.3"
 regex = "1"
-dictgen = { version = "0.1", path = "../../dictgen" }
+dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
--- a/crates/typos-dict/codegen/Cargo.toml
+++ b/crates/typos-dict/codegen/Cargo.toml
@ -23,4 +23,4 @@ itertools = "0.10"
 unicase = "2.5"
 codegenrs = "1.0"
 structopt = "0.3"
-dictgen = { version = "0.1", path = "../../dictgen" }
+dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
--- a/crates/typos-vars/codegen/Cargo.toml
+++ b/crates/typos-vars/codegen/Cargo.toml
@ -29,4 +29,4 @@ log = "0.4"
 env_logger = "0.7"
 clap-verbosity-flag = "0.3"
 itertools = "0.10"
-dictgen = { version = "0.1", path = "../../dictgen" }
+dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
--- a/crates/wikipedia-dict/codegen/Cargo.toml
+++ b/crates/wikipedia-dict/codegen/Cargo.toml
@ -22,4 +22,4 @@ unicase = "2.5"
 itertools = "0.10"
 codegenrs = "1.0"
 structopt = "0.3"
-dictgen = { version = "0.1", path = "../../dictgen" }
+dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }