refactor(dict): Pull out table-lookup logic

Before, we only guaranteed that some dicts were pre-sorted.  Now, all are
guaranteed to be pre-sorted.

This also gives each dict the size check, so the lookup is skipped for words whose length is outside the dict's key-length range.

But this is really about refactoring in prep for playing with other
lookup options, like tries.
This commit is contained in:
Ed Page 2021-06-30 10:12:17 -05:00
parent bfa7888f82
commit a1e95bc7c0
22 changed files with 273300 additions and 174464 deletions

17
Cargo.lock generated
View file

@ -223,6 +223,7 @@ name = "codespell-codegen"
version = "0.4.0" version = "0.4.0"
dependencies = [ dependencies = [
"codegenrs", "codegenrs",
"dictgen",
"itertools 0.10.0", "itertools 0.10.0",
"structopt", "structopt",
"unicase", "unicase",
@ -232,7 +233,7 @@ dependencies = [
name = "codespell-dict" name = "codespell-dict"
version = "0.4.0" version = "0.4.0"
dependencies = [ dependencies = [
"log", "dictgen",
"unicase", "unicase",
] ]
@ -412,6 +413,13 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "dictgen"
version = "0.1.0"
dependencies = [
"unicase",
]
[[package]] [[package]]
name = "difference" name = "difference"
version = "2.0.0" version = "2.0.0"
@ -806,6 +814,7 @@ name = "misspell-codegen"
version = "0.4.0" version = "0.4.0"
dependencies = [ dependencies = [
"codegenrs", "codegenrs",
"dictgen",
"itertools 0.10.0", "itertools 0.10.0",
"regex", "regex",
"structopt", "structopt",
@ -816,6 +825,7 @@ dependencies = [
name = "misspell-dict" name = "misspell-dict"
version = "0.4.0" version = "0.4.0"
dependencies = [ dependencies = [
"dictgen",
"log", "log",
"unicase", "unicase",
] ]
@ -1573,6 +1583,7 @@ dependencies = [
name = "typos-vars" name = "typos-vars"
version = "0.6.0" version = "0.6.0"
dependencies = [ dependencies = [
"dictgen",
"log", "log",
"unicase", "unicase",
"varcon-core", "varcon-core",
@ -1585,6 +1596,7 @@ dependencies = [
"clap", "clap",
"clap-verbosity-flag", "clap-verbosity-flag",
"codegenrs", "codegenrs",
"dictgen",
"env_logger 0.7.1", "env_logger 0.7.1",
"itertools 0.10.0", "itertools 0.10.0",
"log", "log",
@ -1774,6 +1786,7 @@ name = "wikipedia-codegen"
version = "0.4.0" version = "0.4.0"
dependencies = [ dependencies = [
"codegenrs", "codegenrs",
"dictgen",
"itertools 0.10.0", "itertools 0.10.0",
"structopt", "structopt",
"unicase", "unicase",
@ -1783,7 +1796,7 @@ dependencies = [
name = "wikipedia-dict" name = "wikipedia-dict"
version = "0.4.0" version = "0.4.0"
dependencies = [ dependencies = [
"log", "dictgen",
"unicase", "unicase",
] ]

View file

@ -3,6 +3,7 @@ members = [
"crates/typos", "crates/typos",
"crates/typos-dict", "crates/typos-dict/codegen", "crates/typos-dict/verify", "crates/typos-dict", "crates/typos-dict/codegen", "crates/typos-dict/verify",
"crates/typos-vars", "crates/typos-vars/codegen", "crates/typos-vars", "crates/typos-vars/codegen",
"crates/dictgen",
"crates/codespell-dict", "crates/codespell-dict/codegen", "crates/codespell-dict", "crates/codespell-dict/codegen",
"crates/misspell-dict", "crates/misspell-dict/codegen", "crates/misspell-dict", "crates/misspell-dict/codegen",
"crates/wikipedia-dict", "crates/wikipedia-dict/codegen", "crates/wikipedia-dict", "crates/wikipedia-dict/codegen",

View file

@ -20,4 +20,4 @@ disable-release = true
[dependencies] [dependencies]
unicase = "2.5" unicase = "2.5"
log = "0.4" dictgen = { version = "0.1", path = "../dictgen" }

View file

@ -22,3 +22,4 @@ unicase = "2.5"
itertools = "0.10" itertools = "0.10"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" }

View file

@ -30,20 +30,17 @@ fn generate<W: std::io::Write>(file: &mut W) {
env!("CARGO_PKG_NAME") env!("CARGO_PKG_NAME")
) )
.unwrap(); .unwrap();
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
writeln!(file).unwrap(); writeln!(file).unwrap();
let dict = parse_dict(DICT); let dict = parse_dict(DICT);
writeln!(file, "pub static WORD_DICTIONARY: &[(&str, &[&str])] = &[").unwrap(); dictgen::generate_table(
for (typo, corrections) in dict { file,
let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", "); "WORD_DICTIONARY",
let value = format!("&[{}]", value); "&[&str]",
dict.map(|kv| (kv.0, format!("&{:?}", kv.1))),
let key = format!("{:?}", typo); )
writeln!(file, " ({}, {}),", key, &value).unwrap(); .unwrap();
}
writeln!(file, "];").unwrap();
} }
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,7 @@
[package]
name = "dictgen"
version = "0.1.0"
edition = "2018"
[dependencies]
unicase = "2.5"

74
crates/dictgen/src/lib.rs Normal file
View file

@ -0,0 +1,74 @@
/// Writes the Rust source for a `pub static` [`DictTable`] to `file`.
///
/// The emitted item is named `name` and declared as
/// `dictgen::DictTable<value_type>`. Each `(key, value)` pair from `data`
/// becomes one table row: the key is wrapped in
/// `dictgen::InsensitiveStr::Ascii` when it is pure ASCII and in
/// `dictgen::InsensitiveStr::Unicode` otherwise, and the value is written
/// verbatim through its `Display` impl, so it must already be a Rust
/// expression of type `value_type`.
///
/// Rows are sorted by `unicase::UniCase` order before being written, which is
/// the order [`DictTable::find`] binary-searches with. The smallest and
/// largest key lengths (in bytes) are emitted as the table's `range`.
///
/// # Errors
///
/// Returns any I/O error raised while writing to `file`.
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
    file: &mut W,
    name: &str,
    value_type: &str,
    data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
    let mut data: Vec<_> = data.collect();
    data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));

    // An empty table falls back to `0..=0` rather than `usize::MAX..=0`: the
    // latter bakes the generator host's `usize::MAX` into the output, which
    // overflows `usize` on narrower targets and is a reversed literal range.
    // With `0..=0` a lookup in an empty table still finds nothing.
    let smallest = data.iter().map(|(key, _)| key.len()).min().unwrap_or(0);
    let largest = data.iter().map(|(key, _)| key.len()).max().unwrap_or(0);

    writeln!(
        file,
        "pub static {}: dictgen::DictTable<{}> = dictgen::DictTable {{",
        name, value_type
    )?;
    writeln!(file, " table: &[")?;
    for (key, value) in data {
        // Tag the key up front so the table never has to classify it later.
        let variant = if key.is_ascii() { "Ascii" } else { "Unicode" };
        writeln!(
            file,
            " (dictgen::InsensitiveStr::{}({:?}), {}),",
            variant, key, value
        )?;
    }
    writeln!(file, " ],")?;
    writeln!(file, " range: {}..={},", smallest, largest)?;
    writeln!(file, "}};")?;

    Ok(())
}
/// A static, case-insensitive lookup table of the shape written by
/// `generate_table`.
pub struct DictTable<V: 'static> {
    /// `(key, value)` rows. `find` binary-searches this slice, so the rows
    /// must be sorted by the `unicase::UniCase` order of their keys.
    pub table: &'static [(InsensitiveStr, V)],
    /// Inclusive range of key lengths; `find` skips the search entirely for
    /// words whose `len()` falls outside it.
    pub range: std::ops::RangeInclusive<usize>,
}

impl<V> DictTable<V> {
    /// Looks up `word` case-insensitively, returning its value if present.
    ///
    /// Words whose length is outside `range` are rejected without searching.
    // NOTE(review): `word.len()` is a byte length; if Unicode case folding can
    // change a word's byte length, this pre-filter could reject a word that
    // would otherwise compare equal to a key — confirm whether that matters.
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        if !self.range.contains(&word.len()) {
            return None;
        }

        let rows = self.table;
        let index = rows
            .binary_search_by_key(word, |entry| entry.0.convert())
            .ok()?;
        Some(&rows[index].1)
    }

    /// Iterates over every row in table order, yielding each key as a
    /// `unicase::UniCase` alongside a reference to its value.
    pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> {
        let rows = self.table;
        rows.iter().map(|(key, value)| (key.convert(), value))
    }
}
/// A `&'static str` table key tagged with how it should be wrapped in
/// `unicase::UniCase` when it is compared.
///
/// `generate_table` emits `Ascii` for keys that are pure ASCII and `Unicode`
/// for everything else.
// Stored as a plain enum rather than a `UniCase` built via unicase's
// const-fn, so that large tables don't OOM.
#[derive(Copy, Clone, Debug)]
pub enum InsensitiveStr {
    Unicode(&'static str),
    Ascii(&'static str),
}

impl InsensitiveStr {
    /// Builds the `unicase::UniCase` key matching this variant's tag.
    fn convert(self) -> unicase::UniCase<&'static str> {
        match self {
            Self::Ascii(text) => unicase::UniCase::ascii(text),
            Self::Unicode(text) => unicase::UniCase::unicode(text),
        }
    }
}

View file

@ -21,3 +21,4 @@ disable-release = true
[dependencies] [dependencies]
unicase = "2.5" unicase = "2.5"
log = "0.4" log = "0.4"
dictgen = { version = "0.1", path = "../dictgen" }

View file

@ -23,3 +23,4 @@ itertools = "0.10"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
regex = "1" regex = "1"
dictgen = { version = "0.1", path = "../../dictgen" }

View file

@ -62,7 +62,6 @@ fn generate<W: std::io::Write>(file: &mut W) {
env!("CARGO_PKG_NAME") env!("CARGO_PKG_NAME")
) )
.unwrap(); .unwrap();
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
writeln!(file).unwrap(); writeln!(file).unwrap();
let Words { let Words {
@ -70,64 +69,32 @@ fn generate<W: std::io::Write>(file: &mut W) {
american, american,
british, british,
} = parse_dict(DICT); } = parse_dict(DICT);
let mut main: Vec<_> = main.into_iter().collect();
main.sort_unstable_by(|a, b| {
unicase::UniCase::new(a.0)
.partial_cmp(&unicase::UniCase::new(b.0))
.unwrap()
});
let mut american: Vec<_> = american.into_iter().collect();
american.sort_unstable_by(|a, b| {
unicase::UniCase::new(a.0)
.partial_cmp(&unicase::UniCase::new(b.0))
.unwrap()
});
let mut british: Vec<_> = british.into_iter().collect();
british.sort_unstable_by(|a, b| {
unicase::UniCase::new(a.0)
.partial_cmp(&unicase::UniCase::new(b.0))
.unwrap()
});
writeln!(file, "pub static MAIN_DICTIONARY: &[(&str, &[&str])] = &[").unwrap(); dictgen::generate_table(
for (typo, corrections) in main.into_iter() {
let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
let value = format!("&[{}]", value);
let key = format!("{:?}", typo);
writeln!(file, " ({}, {}),", key, &value).unwrap();
}
writeln!(file, "];").unwrap();
writeln!(file).unwrap();
writeln!(
file, file,
"pub static AMERICAN_DICTIONARY: &[(&str, &[&str])] = &[" "MAIN_DICTIONARY",
"&[&str]",
main.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))),
) )
.unwrap(); .unwrap();
for (typo, corrections) in american.into_iter() {
let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
let value = format!("&[{}]", value);
let key = format!("{:?}", typo); dictgen::generate_table(
writeln!(file, " ({}, {}),", key, &value).unwrap();
}
writeln!(file, "];").unwrap();
writeln!(file).unwrap();
writeln!(
file, file,
"pub static BRITISH_DICTIONARY: &[(&str, &[&str])] = &[" "AMERICAN_DICTIONARY",
"&[&str]",
american
.into_iter()
.map(|kv| (kv.0, format!("&{:?}", kv.1))),
) )
.unwrap(); .unwrap();
for (typo, corrections) in british.into_iter() {
let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", ");
let value = format!("&[{}]", value);
let key = format!("{:?}", typo); dictgen::generate_table(
writeln!(file, " ({}, {}),", key, &value).unwrap(); file,
} "BRITISH_DICTIONARY",
writeln!(file, "];").unwrap(); "&[&str]",
british.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))),
)
.unwrap();
} }
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]

File diff suppressed because it is too large Load diff

View file

@ -17,4 +17,5 @@ codecov = { repository = "crate-ci/typos" }
[dependencies] [dependencies]
unicase = "2.5" unicase = "2.5"
log = "0.4" log = "0.4"
dictgen = { version = "0.1", path = "../dictgen" }
varcon-core = { version = "^2.0", path = "../varcon-core", features = ["flags"] } varcon-core = { version = "^2.0", path = "../varcon-core", features = ["flags"] }

View file

@ -29,3 +29,4 @@ log = "0.4"
env_logger = "0.7" env_logger = "0.7"
clap-verbosity-flag = "0.3" clap-verbosity-flag = "0.3"
itertools = "0.10" itertools = "0.10"
dictgen = { version = "0.1", path = "../../dictgen" }

View file

@ -73,46 +73,27 @@ fn generate_variations<W: std::io::Write>(file: &mut W) {
writeln!(file, "}}").unwrap(); writeln!(file, "}}").unwrap();
writeln!(file).unwrap(); writeln!(file).unwrap();
let mut smallest = usize::MAX;
let mut largest = usize::MIN;
let mut no_invalid = true;
writeln!(
file,
"pub(crate) static VARS_DICTIONARY: &[(crate::EncodedStr, &[(u8, &VariantsMap)])] = &["
)
.unwrap();
let entry_sets = entry_sets(entries.iter()); let entry_sets = entry_sets(entries.iter());
let mut referenced_symbols: HashSet<&str> = HashSet::new(); let mut referenced_symbols: HashSet<&str> = HashSet::new();
for (word, data) in entry_sets.iter() { dictgen::generate_table(
if is_always_valid(data) {
// No need to convert from current form to target form
continue;
}
referenced_symbols.extend(data.iter().map(|(s, _)| s));
let value = generate_link(&data);
let word = unicase::UniCase::new(word);
let key = if word.is_ascii() {
format!("crate::EncodedStr::Ascii({:?})", word)
} else {
format!("crate::EncodedStr::Unicode({:?})", word)
};
writeln!(file, " ({}, {}),", key, &value).unwrap();
smallest = std::cmp::min(smallest, word.len());
largest = std::cmp::max(largest, word.len());
no_invalid &= !is_always_invalid(data);
}
writeln!(file, "];").unwrap();
writeln!(file).unwrap();
writeln!(
file, file,
"pub const WORD_RANGE: std::ops::RangeInclusive<usize> = {}..={};", "VARS_DICTIONARY",
smallest, largest "&[(u8, &VariantsMap)]",
entry_sets.iter().flat_map(|kv| {
let (word, data) = kv;
if is_always_valid(data) {
// No need to convert from current form to target form
None
} else {
referenced_symbols.extend(data.iter().map(|(s, _)| s));
let value = generate_link(&data);
Some((*word, value))
}
}),
) )
.unwrap(); .unwrap();
let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data));
writeln!(file).unwrap(); writeln!(file).unwrap();
writeln!(file, "pub const NO_INVALID: bool = {:?};", no_invalid,).unwrap(); writeln!(file, "pub const NO_INVALID: bool = {:?};", no_invalid,).unwrap();

View file

@ -4,25 +4,3 @@ pub use crate::vars_codegen::*;
pub use varcon_core::Category; pub use varcon_core::Category;
pub use varcon_core::CategorySet; pub use varcon_core::CategorySet;
pub fn find(word: &'_ unicase::UniCase<&str>) -> Option<&'static [(u8, &'static VariantsMap)]> {
VARS_DICTIONARY
.binary_search_by_key(word, |(key, _)| key.convert())
.map(|i| VARS_DICTIONARY[i].1)
.ok()
}
#[derive(Copy, Clone, Debug)]
pub(crate) enum EncodedStr {
//Unicode(&'static str),
Ascii(&'static str),
}
impl EncodedStr {
fn convert(self) -> unicase::UniCase<&'static str> {
match self {
//EncodedStr::Unicode(s) => unicase::UniCase::unicode(s),
EncodedStr::Ascii(s) => unicase::UniCase::ascii(s),
}
}
}

File diff suppressed because it is too large Load diff

View file

@ -20,4 +20,4 @@ disable-release = true
[dependencies] [dependencies]
unicase = "2.5" unicase = "2.5"
log = "0.4" dictgen = { version = "0.1", path = "../dictgen" }

View file

@ -22,3 +22,4 @@ unicase = "2.5"
itertools = "0.10" itertools = "0.10"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" }

View file

@ -30,20 +30,17 @@ fn generate<W: std::io::Write>(file: &mut W) {
env!("CARGO_PKG_NAME") env!("CARGO_PKG_NAME")
) )
.unwrap(); .unwrap();
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
writeln!(file).unwrap(); writeln!(file).unwrap();
let dict = parse_dict(DICT); let dict = parse_dict(DICT);
writeln!(file, "pub static WORD_DICTIONARY: &[(&str, &[&str])] = &[").unwrap(); dictgen::generate_table(
for (typo, corrections) in dict { file,
let value = itertools::join(corrections.iter().map(|s| format!("{:?}", s)), ", "); "WORD_DICTIONARY",
let value = format!("&[{}]", value); "&[&str]",
dict.map(|kv| (kv.0, format!("&{:?}", kv.1))),
let key = format!("{:?}", typo); )
writeln!(file, " ({}, {}),", key, &value).unwrap(); .unwrap();
}
writeln!(file, "];").unwrap();
} }
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]

File diff suppressed because it is too large Load diff

View file

@ -34,14 +34,15 @@ impl BuiltIn {
} }
let word = word_token.token(); let word = word_token.token();
let mut corrections = if let Some(corrections) = self.correct_with_dict(word) { let word_case = unicase::UniCase::new(word);
let mut corrections = if let Some(corrections) = self.correct_with_dict(word_case) {
if corrections.is_empty() { if corrections.is_empty() {
Status::Invalid Status::Invalid
} else { } else {
self.chain_with_vars(corrections) self.chain_with_vars(corrections)
} }
} else { } else {
self.correct_with_vars(word)? self.correct_with_vars(word_case)?
}; };
corrections corrections
.corrections_mut() .corrections_mut()
@ -53,7 +54,7 @@ impl BuiltIn {
#[cfg(feature = "dict")] #[cfg(feature = "dict")]
impl BuiltIn { impl BuiltIn {
// Not using `Status` to avoid the allocations // Not using `Status` to avoid the allocations
fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> { fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
if typos_dict::WORD_RANGE.contains(&word.len()) { if typos_dict::WORD_RANGE.contains(&word.len()) {
map_lookup(&typos_dict::WORD_DICTIONARY, word) map_lookup(&typos_dict::WORD_DICTIONARY, word)
} else { } else {
@ -64,7 +65,7 @@ impl BuiltIn {
#[cfg(not(feature = "dict"))] #[cfg(not(feature = "dict"))]
impl BuiltIn { impl BuiltIn {
fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> { fn correct_with_dict(&self, _word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
None None
} }
} }
@ -75,7 +76,7 @@ impl BuiltIn {
if self.is_vars_enabled() { if self.is_vars_enabled() {
let mut chained: Vec<_> = corrections let mut chained: Vec<_> = corrections
.iter() .iter()
.flat_map(|c| match self.correct_with_vars(c) { .flat_map(|c| match self.correct_with_vars(unicase::UniCase::new(c)) {
Some(Status::Valid) | None => vec![Cow::Borrowed(*c)], Some(Status::Valid) | None => vec![Cow::Borrowed(*c)],
Some(Status::Corrections(vars)) => vars, Some(Status::Corrections(vars)) => vars,
Some(Status::Invalid) => { Some(Status::Invalid) => {
@ -94,10 +95,11 @@ impl BuiltIn {
} }
} }
fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> { fn correct_with_vars(&self, word: unicase::UniCase<&str>) -> Option<Status<'static>> {
if self.is_vars_enabled() && typos_vars::WORD_RANGE.contains(&word.len()) { if self.is_vars_enabled() {
let word_case = unicase::UniCase::new(word); typos_vars::VARS_DICTIONARY
typos_vars::find(&word_case).map(|variants| self.select_variant(variants)) .find(&word)
.map(|variants| self.select_variant(variants))
} else { } else {
None None
} }
@ -158,7 +160,7 @@ impl BuiltIn {
Status::Corrections(corrections.iter().map(|c| Cow::Borrowed(*c)).collect()) Status::Corrections(corrections.iter().map(|c| Cow::Borrowed(*c)).collect())
} }
fn correct_with_vars(&self, _word: &str) -> Option<Status<'static>> { fn correct_with_vars(&self, _word: unicase::UniCase<&str>) -> Option<Status<'static>> {
None None
} }
} }
@ -173,7 +175,10 @@ impl typos::Dictionary for BuiltIn {
} }
} }
fn map_lookup<V: Clone>(map: &'static phf::Map<UniCase<&'static str>, V>, key: &str) -> Option<V> { fn map_lookup<V: Clone>(
map: &'static phf::Map<UniCase<&'static str>, V>,
key: unicase::UniCase<&str>,
) -> Option<V> {
// This transmute should be safe as `get` will not store the reference with // This transmute should be safe as `get` will not store the reference with
// the expanded lifetime. This is due to `Borrow` being overly strict and // the expanded lifetime. This is due to `Borrow` being overly strict and
// can't have an impl for `&'static str` to `Borrow<&'a str>`. // can't have an impl for `&'static str` to `Borrow<&'a str>`.
@ -181,8 +186,8 @@ fn map_lookup<V: Clone>(map: &'static phf::Map<UniCase<&'static str>, V>, key: &
// //
// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548 // See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
unsafe { unsafe {
let key = ::std::mem::transmute::<_, &'static str>(key); let key = ::std::mem::transmute::<_, unicase::UniCase<&'static str>>(key);
map.get(&UniCase::new(key)).cloned() map.get(&key).cloned()
} }
} }