Merge pull request #314 from epage/contrib

docs: Process for adding to dict
This commit is contained in:
Ed Page 2021-07-27 14:41:20 -05:00 committed by GitHub
commit ce3760d125
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 21987 additions and 21941 deletions

View file

@ -27,6 +27,31 @@ Already have an idea? It might be good to first [create an issue][new issue]
to propose it so we can make sure we are aligned and lower the risk of having
to re-work some of it and the discouragement that goes along with that.
### Adding typos
1. Add your typo to our data file `crates/typos-dict/assets/words.csv`
Format: `typo,correction[,correction...]`
2. Verify your change
Run
```bash
cargo run --package typos-dict-verify -- --input crates/typos-dict/assets/words.csv --output crates/typos-dict/assets/words.csv
```
This automatically cleans up your change according to rules we have, such as:
- Don't prefer specific dialects in the dictionary, leaving those to [`varcon`](http://wordlist.aspell.net/varcon-readme/).
- Don't mix up corrections and typos (a word used as a correction is dropped if it is also listed as a typo)
- etc
3. Code-gen the dictionary
Run
```bash
cargo run --package typos-dict-codegen -- --output crates/typos-dict/src/dict_codegen.rs
```
(we do development-time code-gen to speed up builds)
### Process
When you first post a PR, we request that the commit history get cleaned

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,34 +1,67 @@
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
use unicase::UniCase;
use structopt::StructOpt;
/// In-memory form of the typo dictionary: each typo (wrapped in `UniCase`)
/// maps to the list of corrections collected for it.
///
/// A `BTreeMap` keeps the entries ordered by key, so iterating the map
/// yields rows in a stable, sorted order.
type Dict = BTreeMap<UniCase<String>, Vec<String>>;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
let mut rows = Dict::new();
csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(dict)
.records()
.map(Result::unwrap)
.for_each(|r| {
let mut i = r.iter();
let mut typo = i.next().expect("typo").to_owned();
typo.make_ascii_lowercase();
let typo = UniCase::new(typo);
rows.entry(typo).or_insert_with(Vec::new).extend(i.map(|c| {
let mut c = c.to_owned();
c.make_ascii_lowercase();
c
}));
});
let disallowed_typos = varcon_words();
let word_variants = proper_word_variants();
let rows: Dict = rows
.into_iter()
.filter(|(typo, _)| {
let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo));
if is_disallowed {
eprintln!("{:?} is disallowed", typo);
}
!is_disallowed
})
.map(|(typo, corrections)| {
let mut new_corrections = vec![];
for correction in corrections {
let correction = word_variants
.get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction);
new_corrections.push(correction.to_owned());
}
(typo, new_corrections)
})
.collect();
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(dict);
for record in reader.records() {
let record = record.unwrap();
let mut record_fields = record.iter();
let typo = record_fields.next().unwrap();
if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
continue;
}
let corrections: std::collections::HashSet<_> =
rows.values().flatten().map(ToOwned::to_owned).collect();
let rows: Vec<_> = rows
.into_iter()
.filter(|(typo, _)| !corrections.contains(typo.as_str()))
.collect();
let mut row = vec![typo];
for correction in record_fields {
let correction = word_variants
.get(correction)
.and_then(|words| find_best_match(typo, correction, words))
.unwrap_or(correction);
row.push(correction);
}
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
for (typo, corrections) in rows {
let mut row = corrections;
row.insert(0, typo.as_str().to_owned());
wtr.write_record(&row).unwrap();
}
wtr.flush().unwrap();