Merge pull request #314 from epage/contrib

docs: Process for adding to dict
This commit is contained in:
Ed Page 2021-07-27 14:41:20 -05:00 committed by GitHub
commit ce3760d125
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 21987 additions and 21941 deletions

View file

@ -27,6 +27,31 @@ Already have an idea? It might be good to first [create an issue][new issue]
to propose it so we can make sure we are aligned and lower the risk of having to propose it so we can make sure we are aligned and lower the risk of having
to re-work some of it and the discouragement that goes along with that. to re-work some of it and the discouragement that goes along with that.
### Adding typos
1. Add your typo to our data file `crates/typos-dict/assets/words.csv`
Format: `typo,correction[,correction...]`
2. Verify your change
Run
```bash
cargo run --package typos-dict-verify -- --input crates/typos-dict/assets/words.csv --output crates/typos-dict/assets/words.csv
```
Auto-cleans up your change according to some rules we have like:
- Don't prefer specific dialects in the dictionary, leaving those to [`varcon`](http://wordlist.aspell.net/varcon-readme/).
- Don't mix up corrections and typos (a word used as a correction can't also be listed as a typo)
- etc
3. Code-gen the dictionary
Run
```bash
cargo run --package typos-dict-codegen -- --output crates/typos-dict/src/dict_codegen.rs
```
(we do development-time code-gen to speed up builds)
### Process ### Process
When you first post a PR, we request that the commit history get cleaned When you first post a PR, we request that the commit history get cleaned

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,34 +1,67 @@
use std::collections::BTreeMap;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet; use std::collections::HashSet;
use unicase::UniCase;
use structopt::StructOpt; use structopt::StructOpt;
type Dict = BTreeMap<UniCase<String>, Vec<String>>;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file); let mut rows = Dict::new();
csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(dict)
.records()
.map(Result::unwrap)
.for_each(|r| {
let mut i = r.iter();
let mut typo = i.next().expect("typo").to_owned();
typo.make_ascii_lowercase();
let typo = UniCase::new(typo);
rows.entry(typo).or_insert_with(Vec::new).extend(i.map(|c| {
let mut c = c.to_owned();
c.make_ascii_lowercase();
c
}));
});
let disallowed_typos = varcon_words(); let disallowed_typos = varcon_words();
let word_variants = proper_word_variants(); let word_variants = proper_word_variants();
let rows: Dict = rows
.into_iter()
.filter(|(typo, _)| {
let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo));
if is_disallowed {
eprintln!("{:?} is disallowed", typo);
}
!is_disallowed
})
.map(|(typo, corrections)| {
let mut new_corrections = vec![];
for correction in corrections {
let correction = word_variants
.get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction);
new_corrections.push(correction.to_owned());
}
(typo, new_corrections)
})
.collect();
let mut reader = csv::ReaderBuilder::new() let corrections: std::collections::HashSet<_> =
.has_headers(false) rows.values().flatten().map(ToOwned::to_owned).collect();
.flexible(true) let rows: Vec<_> = rows
.from_reader(dict); .into_iter()
for record in reader.records() { .filter(|(typo, _)| !corrections.contains(typo.as_str()))
let record = record.unwrap(); .collect();
let mut record_fields = record.iter();
let typo = record_fields.next().unwrap();
if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
continue;
}
let mut row = vec![typo]; let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
for correction in record_fields { for (typo, corrections) in rows {
let correction = word_variants let mut row = corrections;
.get(correction) row.insert(0, typo.as_str().to_owned());
.and_then(|words| find_best_match(typo, correction, words))
.unwrap_or(correction);
row.push(correction);
}
wtr.write_record(&row).unwrap(); wtr.write_record(&row).unwrap();
} }
wtr.flush().unwrap(); wtr.flush().unwrap();