Merge pull request #314 from epage/contrib

docs: Process for adding to dict
2024-11-28 20:11:05 -05:00 · 2021-07-27 14:41:20 -05:00 · 2021-07-27 14:41:20 -05:00 · ce3760d125
commit ce3760d125
parent 18626e5d2d 6037eebfdc
4 changed files with 21987 additions and 21941 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -27,6 +27,31 @@ Already have an idea?  It might be good to first [create an issue][new issue]
 to propose it so we can make sure we are aligned and lower the risk of having
 to re-work some of it and the discouragement that goes along with that.

+### Adding typos
+
+1. Add your typo to our data file `crates/typos-dict/assets/words.csv`
+
+Format: `typo,correction[,correction...]`
+
+2. Verify your change
+
+Run
+```bash
+cargo run --package typos-dict-verify -- --input crates/typos-dict/assets/words.csv --output crates/typos-dict/assets/words.csv
+```
+This auto-cleans your change according to some rules we have, like:
+- Don't prefer specific dialects in the dictionary, leaving those to [`varcon`](http://wordlist.aspell.net/varcon-readme/).
+- Don't mix up corrections and typos
+- etc.
+
+3. Code-gen the dictionary
+
+Run
+```bash
+cargo run --package typos-dict-codegen -- --output crates/typos-dict/src/dict_codegen.rs
+```
+(we do development-time code-gen to speed up builds)
+
 ### Process

 When you first post a PR, we request that the commit history get cleaned
--- a/crates/typos-dict/assets/words.csv
+++ b/crates/typos-dict/assets/words.csv
--- a/crates/typos-dict/src/dict_codegen.rs
+++ b/crates/typos-dict/src/dict_codegen.rs
--- a/crates/typos-dict/verify/src/main.rs
+++ b/crates/typos-dict/verify/src/main.rs
@ -1,34 +1,67 @@
+use std::collections::BTreeMap;
 use std::collections::HashMap;
 use std::collections::HashSet;
+use unicase::UniCase;

 use structopt::StructOpt;

+type Dict = BTreeMap<UniCase<String>, Vec<String>>;
+
 fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
-    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
+    let mut rows = Dict::new();
+    csv::ReaderBuilder::new()
+        .has_headers(false)
+        .flexible(true)
+        .from_reader(dict)
+        .records()
+        .map(Result::unwrap)
+        .for_each(|r| {
+            let mut i = r.iter();
+            let mut typo = i.next().expect("typo").to_owned();
+            typo.make_ascii_lowercase();
+            let typo = UniCase::new(typo);
+            rows.entry(typo).or_insert_with(Vec::new).extend(i.map(|c| {
+                let mut c = c.to_owned();
+                c.make_ascii_lowercase();
+                c
+            }));
+        });

    let disallowed_typos = varcon_words();
    let word_variants = proper_word_variants();
-
-    let mut reader = csv::ReaderBuilder::new()
-        .has_headers(false)
-        .flexible(true)
-        .from_reader(dict);
-    for record in reader.records() {
-        let record = record.unwrap();
-        let mut record_fields = record.iter();
-        let typo = record_fields.next().unwrap();
-        if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
-            continue;
+    let rows: Dict = rows
+        .into_iter()
+        .filter(|(typo, _)| {
+            let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo));
+            if is_disallowed {
+                eprintln!("{:?} is disallowed", typo);
            }
-
-        let mut row = vec![typo];
-        for correction in record_fields {
+            !is_disallowed
+        })
+        .map(|(typo, corrections)| {
+            let mut new_corrections = vec![];
+            for correction in corrections {
                let correction = word_variants
-                .get(correction)
-                .and_then(|words| find_best_match(typo, correction, words))
-                .unwrap_or(correction);
-            row.push(correction);
+                    .get(correction.as_str())
+                    .and_then(|words| find_best_match(&typo, correction.as_str(), words))
+                    .unwrap_or(&correction);
+                new_corrections.push(correction.to_owned());
            }
+            (typo, new_corrections)
+        })
+        .collect();
+
+    let corrections: std::collections::HashSet<_> =
+        rows.values().flatten().map(ToOwned::to_owned).collect();
+    let rows: Vec<_> = rows
+        .into_iter()
+        .filter(|(typo, _)| !corrections.contains(typo.as_str()))
+        .collect();
+
+    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
+    for (typo, corrections) in rows {
+        let mut row = corrections;
+        row.insert(0, typo.as_str().to_owned());
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();