refactor(dict): Allow 0..n corrections in BuiltIn

The main use case is taking an existing entry such as `ther` -> `there` and adding `the` and `their` as further corrections.
2024-11-28 20:11:05 -05:00 · 2021-05-15 19:06:04 -05:00 · 2021-05-15 19:06:04 -05:00 · fb0dac4297
commit fb0dac4297
parent 444d2cca91
6 changed files with 33747 additions and 33668 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1526,6 +1526,7 @@ version = "1.2.0"
 dependencies = [
 "codegenrs",
 "csv",
 "itertools 0.10.0",
 "phf",
 "phf_codegen",
 "structopt",
--- a/crates/typos-dict/codegen/Cargo.toml
+++ b/crates/typos-dict/codegen/Cargo.toml
@ -18,6 +18,7 @@ codecov = { repository = "crate-ci/typos" }
 phf = { version = "0.8", features = ["unicase"] }
 phf_codegen = "0.8"
 csv = "1.1"
 itertools = "0.10"
 unicase = "2.5"
 codegenrs = "1.0"
 structopt = "0.3"
--- a/crates/typos-dict/codegen/src/main.rs
+++ b/crates/typos-dict/codegen/src/main.rs
@ -18,20 +18,26 @@ fn generate<W: std::io::Write>(file: &mut W) {
    writeln!(
        file,
-        "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
+        "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
    )
    .unwrap();
    let mut builder = phf_codegen::Map::new();
    let records: Vec<_> = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(DICT)
        .records()
        .map(|r| r.unwrap())
        .collect();
    for record in &records {
-        smallest = std::cmp::min(smallest, record[0].len());
+        let mut record_fields = record.iter();
-        largest = std::cmp::max(largest, record[0].len());
+        let key = record_fields.next().unwrap();
-        let value = format!(r#""{}""#, &record[1]);
+        smallest = std::cmp::min(smallest, key.len());
        largest = std::cmp::max(largest, key.len());
        let value = format!(
            "&[{}]",
            itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
        );
        builder.entry(unicase::UniCase::new(&record[0]), &value);
    }
    let codegenned = builder.build();
--- a/crates/typos-dict/src/dict_codegen.rs
+++ b/crates/typos-dict/src/dict_codegen.rs
--- a/crates/typos-dict/verify/src/main.rs
+++ b/crates/typos-dict/verify/src/main.rs
@ -4,26 +4,32 @@ use std::collections::HashSet;
 use structopt::StructOpt;
 // Reads the CSV dictionary in `dict` and writes a rewritten copy to `file`.
 //
 // Each record is a typo followed by its corrections. Both the reader and the
 // writer are built with `flexible(true)`, which lets records carry differing
 // numbers of fields. Parse and write failures panic via `unwrap`.
 fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
-    let mut wtr = csv::Writer::from_writer(file);
+    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
    let disallowed_typos = varcon_words();
    let word_variants = proper_word_variants();
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(dict);
    for record in reader.records() {
        let record = record.unwrap();
-        let typo = &record[0];
+        let mut record_fields = record.iter();
-        let correction = &record[1];
+        let typo = record_fields.next().unwrap();
        // Drop the whole record when the typo (wrapped in `UniCase` for the
        // lookup) is in the disallowed set.
        if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
            continue;
        }
        let mut row = vec![typo];
        // Every remaining field is a correction. When `word_variants` has an
        // entry for it and `find_best_match` returns a pick, that pick replaces
        // the correction; otherwise the correction is kept unchanged.
        for correction in record_fields {
            let correction = word_variants
                .get(correction)
                .and_then(|words| find_best_match(typo, correction, words))
                .unwrap_or(correction);
-        wtr.write_record(&[typo, correction]).unwrap();
+            row.push(correction);
        }
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();
 }
--- a/src/dict.rs
+++ b/src/dict.rs
@ -34,14 +34,11 @@ impl BuiltIn {
        }
        let word = word_token.token();
-        let mut corrections = if let Some(correction) = self.correct_with_dict(word) {
+        let mut corrections = if let Some(corrections) = self.correct_with_dict(word) {
-            match self.correct_with_vars(correction) {
+            if corrections.is_empty() {
-                Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]),
+                Status::Invalid
-                Some(correction @ Status::Corrections(_)) => correction,
+            } else {
-                Some(Status::Invalid) => {
+                self.chain_with_vars(corrections)?
                    unreachable!("correct_with_vars should always have valid suggestions")
                }
                None => Status::Corrections(vec![Cow::Borrowed(correction)]),
            }
        } else {
            self.correct_with_vars(word)?
@ -54,7 +51,7 @@ impl BuiltIn {
    #[cfg(feature = "dict")]
    // Not using `Status` to avoid the allocations
-    fn correct_with_dict(&self, word: &str) -> Option<&'static str> {
+    fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> {
        if typos_dict::WORD_RANGE.contains(&word.len()) {
            map_lookup(&typos_dict::WORD_DICTIONARY, word)
        } else {
@ -63,10 +60,35 @@ impl BuiltIn {
    }
    // Compiled only when the `dict` feature is off: ignores the word and
    // always returns `None`.
    #[cfg(not(feature = "dict"))]
-    fn correct_with_dict(&self, _word: &str) -> Option<&'static str> {
+    fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> {
        None
    }
    /// Expands each dictionary correction through the variant lookup.
    ///
    /// Every entry of `corrections` is passed to `correct_with_vars`. When that
    /// call yields its own list of corrections, the list takes the entry's
    /// place; when it reports the entry as valid or has no answer, the entry is
    /// kept as-is. Unless exactly one suggestion results, the suggestions are
    /// sorted and de-duplicated.
    ///
    /// Always returns `Some(Status::Corrections(..))`.
    ///
    /// # Panics
    ///
    /// Panics if `correct_with_vars` reports `Status::Invalid`, and (in debug
    /// builds) if no suggestion was produced at all.
    #[cfg(feature = "vars")]
    fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Option<Status<'static>> {
        let mut suggestions: Vec<Cow<'static, str>> = Vec::new();
        for &candidate in corrections {
            match self.correct_with_vars(candidate) {
                Some(Status::Corrections(variants)) => suggestions.extend(variants),
                Some(Status::Invalid) => {
                    unreachable!("correct_with_vars should always have valid suggestions")
                }
                Some(Status::Valid) | None => suggestions.push(Cow::Borrowed(candidate)),
            }
        }

        // A lone suggestion is already ordered and unique; skip the extra work.
        let is_single = suggestions.len() == 1;
        if !is_single {
            suggestions.sort_unstable();
            suggestions.dedup();
        }

        debug_assert!(!suggestions.is_empty());
        Some(Status::Corrections(suggestions))
    }
    #[cfg(not(feature = "vars"))]
    fn chain_with_vars(&self, corrections: &[&str]) -> Option<Status<'static>> {
        Status::Corrections(corrections.map(|c| Cow::Borrowed(correction).collect()))
    }
    #[cfg(feature = "vars")]
    fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
        if typos_vars::WORD_RANGE.contains(&word.len()) {