refactor(dict): Pull out table-lookup logic

Before, we only guaranteed that some dicts were pre-sorted. Now, all of them are guaranteed to be pre-sorted. This also gives each dict the size check that lets us skip the lookup entirely. But this is really about refactoring in prep for playing with other lookup options, like tries.
2024-11-22 00:51:11 -05:00 · 2021-06-30 10:12:17 -05:00 · 2021-06-30 10:12:17 -05:00 · a1e95bc7c0
commit a1e95bc7c0
parent bfa7888f82
22 changed files with 273300 additions and 174464 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -223,6 +223,7 @@ name = "codespell-codegen"
 version = "0.4.0"
 dependencies = [
 "codegenrs",
 "dictgen",
 "itertools 0.10.0",
 "structopt",
 "unicase",
@ -232,7 +233,7 @@ dependencies = [
 name = "codespell-dict"
 version = "0.4.0"
 dependencies = [
- "log",
+ "dictgen",
 "unicase",
 ]
@ -412,6 +413,13 @@ dependencies = [
 "syn",
 ]
 [[package]]
 name = "dictgen"
 version = "0.1.0"
 dependencies = [
 "unicase",
 ]
 [[package]]
 name = "difference"
 version = "2.0.0"
@ -806,6 +814,7 @@ name = "misspell-codegen"
 version = "0.4.0"
 dependencies = [
 "codegenrs",
 "dictgen",
 "itertools 0.10.0",
 "regex",
 "structopt",
@ -816,6 +825,7 @@ dependencies = [
 name = "misspell-dict"
 version = "0.4.0"
 dependencies = [
 "dictgen",
 "log",
 "unicase",
 ]
@ -1573,6 +1583,7 @@ dependencies = [
 name = "typos-vars"
 version = "0.6.0"
 dependencies = [
 "dictgen",
 "log",
 "unicase",
 "varcon-core",
@ -1585,6 +1596,7 @@ dependencies = [
 "clap",
 "clap-verbosity-flag",
 "codegenrs",
 "dictgen",
 "env_logger 0.7.1",
 "itertools 0.10.0",
 "log",
@ -1774,6 +1786,7 @@ name = "wikipedia-codegen"
 version = "0.4.0"
 dependencies = [
 "codegenrs",
 "dictgen",
 "itertools 0.10.0",
 "structopt",
 "unicase",
@ -1783,7 +1796,7 @@ dependencies = [
 name = "wikipedia-dict"
 version = "0.4.0"
 dependencies = [
- "log",
+ "dictgen",
 "unicase",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,6 +3,7 @@ members = [
    "crates/typos",
    "crates/typos-dict", "crates/typos-dict/codegen", "crates/typos-dict/verify",
    "crates/typos-vars", "crates/typos-vars/codegen",
    "crates/dictgen",
    "crates/codespell-dict", "crates/codespell-dict/codegen",
    "crates/misspell-dict", "crates/misspell-dict/codegen",
    "crates/wikipedia-dict", "crates/wikipedia-dict/codegen",
--- a/crates/codespell-dict/Cargo.toml
+++ b/crates/codespell-dict/Cargo.toml
@ -20,4 +20,4 @@ disable-release = true
 [dependencies]
 unicase = "2.5"
-log = "0.4"
+dictgen = { version = "0.1", path = "../dictgen" }
--- a/crates/codespell-dict/codegen/Cargo.toml
+++ b/crates/codespell-dict/codegen/Cargo.toml
@ -22,3 +22,4 @@ unicase = "2.5"
 itertools = "0.10"
 codegenrs = "1.0"
 structopt = "0.3"
 dictgen = { version = "0.1", path = "../../dictgen" }
--- a/crates/codespell-dict/codegen/src/main.rs
+++ b/crates/codespell-dict/codegen/src/main.rs
@ -30,20 +30,17 @@ fn generate<W: std::io::Write>(file: &mut W) {
        env!("CARGO_PKG_NAME")
    )
    .unwrap();
    writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
    writeln!(file).unwrap();
    let dict = parse_dict(DICT);
-    writeln!(file, "pub static WORD_DICTIONARY: &[(&str, &[&str])] = &[").unwrap();
+    dictgen::generate_table(
-    for (typo, corrections) in dict {
+        file,
-        let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
+        "WORD_DICTIONARY",
-        let value = format!("&[{}]", value);
+        "&[&str]",
-
+        dict.map(|kv| (kv.0, format!("&{:?}", kv.1))),
-        let key = format!("{:?}", typo);
+    )
-        writeln!(file, "  ({}, {}),", key, &value).unwrap();
+    .unwrap();
    }
    writeln!(file, "];").unwrap();
 }
 #[derive(Debug, StructOpt)]
--- a/crates/codespell-dict/src/dict_codegen.rs
+++ b/crates/codespell-dict/src/dict_codegen.rs
--- a/crates/dictgen/Cargo.toml
+++ b/crates/dictgen/Cargo.toml
@ -0,0 +1,7 @@
 [package]
 name = "dictgen"
 version = "0.1.0"
 edition = "2018"
 [dependencies]
 unicase = "2.5"
--- a/crates/dictgen/src/lib.rs
+++ b/crates/dictgen/src/lib.rs
@ -0,0 +1,74 @@
 /// Emit Rust source for a `pub static` `dictgen::DictTable` called `name`.
 ///
 /// The `(key, value)` pairs from `data` are collected and ordered by their
 /// case-insensitive (`unicase::UniCase`) key before being written, so the
 /// generated table can be binary-searched by `DictTable::find`. Each key is
 /// written as `dictgen::InsensitiveStr::Ascii` or
 /// `dictgen::InsensitiveStr::Unicode` depending on `str::is_ascii`, and each
 /// value is written verbatim through its `Display` impl. `value_type` is
 /// spliced in as the table's type parameter.
 ///
 /// The shortest and longest key lengths (`str::len`, i.e. bytes) are written
 /// as the table's inclusive `range`. When `data` is empty, the emitted range
 /// is `usize::MAX..=0`, which contains nothing.
 ///
 /// # Errors
 ///
 /// Returns the first I/O error reported while writing to `file`.
 pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
    file: &mut W,
    name: &str,
    value_type: &str,
    data: impl Iterator<Item = (&'d str, V)>,
 ) -> Result<(), std::io::Error> {
    // Lookups rely on the rows being in case-insensitive order.
    let mut rows: Vec<_> = data.collect();
    rows.sort_unstable_by_key(|row| unicase::UniCase::new(row.0));

    writeln!(
        file,
        "pub static {}: dictgen::DictTable<{}> = dictgen::DictTable {{",
        name, value_type
    )?;
    writeln!(file, "    table: &[")?;

    // Track the key-length bounds while the rows are being written out.
    let mut shortest = usize::MAX;
    let mut longest = usize::MIN;
    for (word, value) in rows {
        let len = word.len();
        shortest = shortest.min(len);
        longest = longest.max(len);

        let variant = if word.is_ascii() { "Ascii" } else { "Unicode" };
        writeln!(
            file,
            "      (dictgen::InsensitiveStr::{}({:?}), {}),",
            variant, word, value
        )?;
    }

    writeln!(file, "   ],")?;
    writeln!(file, "   range: {}..={},", shortest, longest)?;
    writeln!(file, "}};")?;
    Ok(())
 }
 /// A static table mapping case-insensitive string keys to values of type `V`.
 ///
 /// `find` binary-searches `table`, so the rows must be ordered by their
 /// `unicase::UniCase` key; `generate_table` emits them in that order.
 pub struct DictTable<V: 'static> {
    /// The `(key, value)` rows of the table.
    pub table: &'static [(InsensitiveStr, V)],
    /// Inclusive bounds on key length; `find` returns `None` without
    /// searching when the queried word's length falls outside this range.
    pub range: std::ops::RangeInclusive<usize>,
 }
 impl<V> DictTable<V> {
    /// Look up the value stored for `word`, comparing keys case-insensitively.
    ///
    /// Returns `None` immediately when `word.len()` is outside `self.range`;
    /// otherwise binary-searches `self.table` and returns the matching row's
    /// value, or `None` when no key matches.
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        // Cheap length check first so most misses never reach the search.
        if !self.range.contains(&word.len()) {
            return None;
        }
        match self
            .table
            .binary_search_by_key(word, |(key, _)| key.convert())
        {
            Ok(index) => Some(&self.table[index].1),
            Err(_) => None,
        }
    }

    /// Iterate over every row as a `(UniCase key, &value)` pair, in table order.
    pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> {
        self.table.iter().map(|(key, value)| (key.convert(), value))
    }
 }
 // Avoid unicase's use of const-fn so large tables don't OOM
 /// A `&'static str` table key tagged with whether it is ASCII-only.
 ///
 /// The tag selects which `unicase::UniCase` constructor `convert` calls.
 #[derive(Copy, Clone, Debug)]
 pub enum InsensitiveStr {
    /// Key converted with `unicase::UniCase::unicode`.
    Unicode(&'static str),
    /// Key converted with `unicase::UniCase::ascii`.
    Ascii(&'static str),
 }
 impl InsensitiveStr {
    /// Build the `unicase::UniCase` key for this entry, using the constructor
    /// that matches the variant.
    fn convert(self) -> unicase::UniCase<&'static str> {
        match self {
            Self::Ascii(text) => unicase::UniCase::ascii(text),
            Self::Unicode(text) => unicase::UniCase::unicode(text),
        }
    }
 }
--- a/crates/misspell-dict/Cargo.toml
+++ b/crates/misspell-dict/Cargo.toml
@ -21,3 +21,4 @@ disable-release = true
 [dependencies]
 unicase = "2.5"
 log = "0.4"
 dictgen = { version = "0.1", path = "../dictgen" }
--- a/crates/misspell-dict/codegen/Cargo.toml
+++ b/crates/misspell-dict/codegen/Cargo.toml
@ -23,3 +23,4 @@ itertools = "0.10"
 codegenrs = "1.0"
 structopt = "0.3"
 regex = "1"
 dictgen = { version = "0.1", path = "../../dictgen" }
--- a/crates/misspell-dict/codegen/src/main.rs
+++ b/crates/misspell-dict/codegen/src/main.rs
@ -62,7 +62,6 @@ fn generate<W: std::io::Write>(file: &mut W) {
        env!("CARGO_PKG_NAME")
    )
    .unwrap();
    writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
    writeln!(file).unwrap();
    let Words {
@ -70,64 +69,32 @@ fn generate<W: std::io::Write>(file: &mut W) {
        american,
        british,
    } = parse_dict(DICT);
    let mut main: Vec<_> = main.into_iter().collect();
    main.sort_unstable_by(|a, b| {
        unicase::UniCase::new(a.0)
            .partial_cmp(&unicase::UniCase::new(b.0))
            .unwrap()
    });
    let mut american: Vec<_> = american.into_iter().collect();
    american.sort_unstable_by(|a, b| {
        unicase::UniCase::new(a.0)
            .partial_cmp(&unicase::UniCase::new(b.0))
            .unwrap()
    });
    let mut british: Vec<_> = british.into_iter().collect();
    british.sort_unstable_by(|a, b| {
        unicase::UniCase::new(a.0)
            .partial_cmp(&unicase::UniCase::new(b.0))
            .unwrap()
    });
-    writeln!(file, "pub static MAIN_DICTIONARY: &[(&str, &[&str])] = &[").unwrap();
+    dictgen::generate_table(
    for (typo, corrections) in main.into_iter() {
        let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
        let value = format!("&[{}]", value);
        let key = format!("{:?}", typo);
        writeln!(file, "  ({}, {}),", key, &value).unwrap();
    }
    writeln!(file, "];").unwrap();
    writeln!(file).unwrap();
    writeln!(
        file,
-        "pub static AMERICAN_DICTIONARY: &[(&str, &[&str])] = &["
+        "MAIN_DICTIONARY",
        "&[&str]",
        main.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))),
    )
    .unwrap();
    for (typo, corrections) in american.into_iter() {
        let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
        let value = format!("&[{}]", value);
-        let key = format!("{:?}", typo);
+    dictgen::generate_table(
        writeln!(file, "  ({}, {}),", key, &value).unwrap();
    }
    writeln!(file, "];").unwrap();
    writeln!(file).unwrap();
    writeln!(
        file,
-        "pub static BRITISH_DICTIONARY: &[(&str, &[&str])] = &["
+        "AMERICAN_DICTIONARY",
        "&[&str]",
        american
            .into_iter()
            .map(|kv| (kv.0, format!("&{:?}", kv.1))),
    )
    .unwrap();
    for (typo, corrections) in british.into_iter() {
        let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
        let value = format!("&[{}]", value);
-        let key = format!("{:?}", typo);
+    dictgen::generate_table(
-        writeln!(file, "  ({}, {}),", key, &value).unwrap();
+        file,
-    }
+        "BRITISH_DICTIONARY",
-    writeln!(file, "];").unwrap();
+        "&[&str]",
        british.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))),
    )
    .unwrap();
 }
 #[derive(Debug, StructOpt)]
--- a/crates/misspell-dict/src/dict_codegen.rs
+++ b/crates/misspell-dict/src/dict_codegen.rs
--- a/crates/typos-vars/Cargo.toml
+++ b/crates/typos-vars/Cargo.toml
@ -17,4 +17,5 @@ codecov = { repository = "crate-ci/typos" }
 [dependencies]
 unicase = "2.5"
 log = "0.4"
 dictgen = { version = "0.1", path = "../dictgen" }
 varcon-core = { version = "^2.0", path = "../varcon-core", features = ["flags"] }
--- a/crates/typos-vars/codegen/Cargo.toml
+++ b/crates/typos-vars/codegen/Cargo.toml
@ -29,3 +29,4 @@ log = "0.4"
 env_logger = "0.7"
 clap-verbosity-flag = "0.3"
 itertools = "0.10"
 dictgen = { version = "0.1", path = "../../dictgen" }
--- a/crates/typos-vars/codegen/src/main.rs
+++ b/crates/typos-vars/codegen/src/main.rs
@ -73,46 +73,27 @@ fn generate_variations<W: std::io::Write>(file: &mut W) {
    writeln!(file, "}}").unwrap();
    writeln!(file).unwrap();
    let mut smallest = usize::MAX;
    let mut largest = usize::MIN;
    let mut no_invalid = true;
    writeln!(
        file,
        "pub(crate) static VARS_DICTIONARY: &[(crate::EncodedStr, &[(u8, &VariantsMap)])] = &["
    )
    .unwrap();
    let entry_sets = entry_sets(entries.iter());
    let mut referenced_symbols: HashSet<&str> = HashSet::new();
-    for (word, data) in entry_sets.iter() {
+    dictgen::generate_table(
        if is_always_valid(data) {
            // No need to convert from current form to target form
            continue;
        }
        referenced_symbols.extend(data.iter().map(|(s, _)| s));
        let value = generate_link(&data);
        let word = unicase::UniCase::new(word);
        let key = if word.is_ascii() {
            format!("crate::EncodedStr::Ascii({:?})", word)
        } else {
            format!("crate::EncodedStr::Unicode({:?})", word)
        };
        writeln!(file, "  ({}, {}),", key, &value).unwrap();
        smallest = std::cmp::min(smallest, word.len());
        largest = std::cmp::max(largest, word.len());
        no_invalid &= !is_always_invalid(data);
    }
    writeln!(file, "];").unwrap();
    writeln!(file).unwrap();
    writeln!(
        file,
-        "pub const WORD_RANGE: std::ops::RangeInclusive<usize> = {}..={};",
+        "VARS_DICTIONARY",
-        smallest, largest
+        "&[(u8, &VariantsMap)]",
        entry_sets.iter().flat_map(|kv| {
            let (word, data) = kv;
            if is_always_valid(data) {
                // No need to convert from current form to target form
                None
            } else {
                referenced_symbols.extend(data.iter().map(|(s, _)| s));
                let value = generate_link(&data);
                Some((*word, value))
            }
        }),
    )
    .unwrap();
    let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data));
    writeln!(file).unwrap();
    writeln!(file, "pub const NO_INVALID: bool = {:?};", no_invalid,).unwrap();
--- a/crates/typos-vars/src/lib.rs
+++ b/crates/typos-vars/src/lib.rs
@ -4,25 +4,3 @@ pub use crate::vars_codegen::*;
 pub use varcon_core::Category;
 pub use varcon_core::CategorySet;
 /// Binary-search `VARS_DICTIONARY` for `word` (compared case-insensitively)
 /// and return the variants stored for it, or `None` when there is no entry.
 pub fn find(word: &'_ unicase::UniCase<&str>) -> Option<&'static [(u8, &'static VariantsMap)]> {
    match VARS_DICTIONARY.binary_search_by_key(word, |(key, _)| key.convert()) {
        Ok(index) => Some(VARS_DICTIONARY[index].1),
        Err(_) => None,
    }
 }
 /// A `&'static str` dictionary key tagged with its encoding class.
 ///
 /// Only the ASCII variant is currently enabled; the Unicode variant is
 /// commented out.
 #[derive(Copy, Clone, Debug)]
 pub(crate) enum EncodedStr {
    //Unicode(&'static str),
    /// Key converted with `unicase::UniCase::ascii`.
    Ascii(&'static str),
 }
 impl EncodedStr {
    /// Build the `unicase::UniCase` key for this entry.
    fn convert(self) -> unicase::UniCase<&'static str> {
        match self {
            //Self::Unicode(text) => unicase::UniCase::unicode(text),
            Self::Ascii(text) => unicase::UniCase::ascii(text),
        }
    }
 }
--- a/crates/typos-vars/src/vars_codegen.rs
+++ b/crates/typos-vars/src/vars_codegen.rs
--- a/crates/wikipedia-dict/Cargo.toml
+++ b/crates/wikipedia-dict/Cargo.toml
@ -20,4 +20,4 @@ disable-release = true
 [dependencies]
 unicase = "2.5"
-log = "0.4"
+dictgen = { version = "0.1", path = "../dictgen" }
--- a/crates/wikipedia-dict/codegen/Cargo.toml
+++ b/crates/wikipedia-dict/codegen/Cargo.toml
@ -22,3 +22,4 @@ unicase = "2.5"
 itertools = "0.10"
 codegenrs = "1.0"
 structopt = "0.3"
 dictgen = { version = "0.1", path = "../../dictgen" }
--- a/crates/wikipedia-dict/codegen/src/main.rs
+++ b/crates/wikipedia-dict/codegen/src/main.rs
@ -30,20 +30,17 @@ fn generate<W: std::io::Write>(file: &mut W) {
        env!("CARGO_PKG_NAME")
    )
    .unwrap();
    writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
    writeln!(file).unwrap();
    let dict = parse_dict(DICT);
-    writeln!(file, "pub static WORD_DICTIONARY: &[(&str, &[&str])] = &[").unwrap();
+    dictgen::generate_table(
-    for (typo, corrections) in dict {
+        file,
-        let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
+        "WORD_DICTIONARY",
-        let value = format!("&[{}]", value);
+        "&[&str]",
-
+        dict.map(|kv| (kv.0, format!("&{:?}", kv.1))),
-        let key = format!("{:?}", typo);
+    )
-        writeln!(file, "  ({}, {}),", key, &value).unwrap();
+    .unwrap();
    }
    writeln!(file, "];").unwrap();
 }
 #[derive(Debug, StructOpt)]
--- a/crates/wikipedia-dict/src/dict_codegen.rs
+++ b/crates/wikipedia-dict/src/dict_codegen.rs
--- a/src/dict.rs
+++ b/src/dict.rs
@ -34,14 +34,15 @@ impl BuiltIn {
        }
        let word = word_token.token();
-        let mut corrections = if let Some(corrections) = self.correct_with_dict(word) {
+        let word_case = unicase::UniCase::new(word);
        let mut corrections = if let Some(corrections) = self.correct_with_dict(word_case) {
            if corrections.is_empty() {
                Status::Invalid
            } else {
                self.chain_with_vars(corrections)
            }
        } else {
-            self.correct_with_vars(word)?
+            self.correct_with_vars(word_case)?
        };
        corrections
            .corrections_mut()
@ -53,7 +54,7 @@ impl BuiltIn {
 #[cfg(feature = "dict")]
 impl BuiltIn {
    // Not using `Status` to avoid the allocations
-    fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> {
+    fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
        if typos_dict::WORD_RANGE.contains(&word.len()) {
            map_lookup(&typos_dict::WORD_DICTIONARY, word)
        } else {
@ -64,7 +65,7 @@ impl BuiltIn {
 #[cfg(not(feature = "dict"))]
 impl BuiltIn {
-    fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> {
+    fn correct_with_dict(&self, _word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
        None
    }
 }
@ -75,7 +76,7 @@ impl BuiltIn {
        if self.is_vars_enabled() {
            let mut chained: Vec<_> = corrections
                .iter()
-                .flat_map(|c| match self.correct_with_vars(c) {
+                .flat_map(|c| match self.correct_with_vars(unicase::UniCase::new(c)) {
                    Some(Status::Valid) | None => vec![Cow::Borrowed(*c)],
                    Some(Status::Corrections(vars)) => vars,
                    Some(Status::Invalid) => {
@ -94,10 +95,11 @@ impl BuiltIn {
        }
    }
-    fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
+    fn correct_with_vars(&self, word: unicase::UniCase<&str>) -> Option<Status<'static>> {
-        if self.is_vars_enabled() && typos_vars::WORD_RANGE.contains(&word.len()) {
+        if self.is_vars_enabled() {
-            let word_case = unicase::UniCase::new(word);
+            typos_vars::VARS_DICTIONARY
-            typos_vars::find(&word_case).map(|variants| self.select_variant(variants))
+                .find(&word)
                .map(|variants| self.select_variant(variants))
        } else {
            None
        }
@ -158,7 +160,7 @@ impl BuiltIn {
        Status::Corrections(corrections.iter().map(|c| Cow::Borrowed(*c)).collect())
    }
-    fn correct_with_vars(&self, _word: &str) -> Option<Status<'static>> {
+    fn correct_with_vars(&self, _word: unicase::UniCase<&str>) -> Option<Status<'static>> {
        None
    }
 }
@ -173,7 +175,10 @@ impl typos::Dictionary for BuiltIn {
    }
 }
-fn map_lookup<V: Clone>(map: &'static phf::Map<UniCase<&'static str>, V>, key: &str) -> Option<V> {
+fn map_lookup<V: Clone>(
    map: &'static phf::Map<UniCase<&'static str>, V>,
    key: unicase::UniCase<&str>,
 ) -> Option<V> {
    // This transmute should be safe as `get` will not store the reference with
    // the expanded lifetime. This is due to `Borrow` being overly strict and
    // can't have an impl for `&'static str` to `Borrow<&'a str>`.
@ -181,8 +186,8 @@ fn map_lookup<V: Clone>(map: &'static phf::Map<UniCase<&'static str>, V>, key: &
    //
    // See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
    unsafe {
-        let key = ::std::mem::transmute::<_, &'static str>(key);
+        let key = ::std::mem::transmute::<_, unicase::UniCase<&'static str>>(key);
-        map.get(&UniCase::new(key)).cloned()
+        map.get(&key).cloned()
    }
 }