test: Ensure words.csv stays sorted

This commit is contained in:
Ed Page 2021-07-27 14:09:51 -05:00
parent 41048d15b3
commit 0008713395
2 changed files with 21650 additions and 21655 deletions

File diff suppressed because it is too large. Load diff

View file

@@ -1,63 +1,63 @@
use std::collections::BTreeMap;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet; use std::collections::HashSet;
use unicase::UniCase;
use structopt::StructOpt; use structopt::StructOpt;
type Dict = BTreeMap<UniCase<String>, Vec<String>>;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let rows: Vec<Vec<_>> = csv::ReaderBuilder::new() let mut rows = Dict::new();
csv::ReaderBuilder::new()
.has_headers(false) .has_headers(false)
.flexible(true) .flexible(true)
.from_reader(dict) .from_reader(dict)
.records() .records()
.map(Result::unwrap) .map(Result::unwrap)
.map(|r| { .for_each(|r| {
let row: Vec<String> = r.iter().map(ToOwned::to_owned).collect(); let mut i = r.iter();
row let typo = UniCase::new(i.next().expect("typo").to_owned());
}) rows.entry(typo)
.collect(); .or_insert_with(|| Vec::new())
.extend(i.map(ToOwned::to_owned));
});
let disallowed_typos = varcon_words(); let disallowed_typos = varcon_words();
let word_variants = proper_word_variants(); let word_variants = proper_word_variants();
let rows: Vec<_> = rows let rows: Dict = rows
.into_iter() .into_iter()
.filter(|r| { .filter(|(typo, _)| {
let typo = &r[0];
let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo)); let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo));
if is_disallowed { if is_disallowed {
eprintln!("{:?} is disallowed", typo); eprintln!("{:?} is disallowed", typo);
} }
!is_disallowed !is_disallowed
}) })
.map(|r| { .map(|(typo, corrections)| {
let mut fields = r.into_iter(); let mut new_corrections = vec![];
let typo = fields.next().expect("at least a typo"); for correction in corrections {
let mut row = vec![typo.clone()];
for correction in fields {
let correction = word_variants let correction = word_variants
.get(correction.as_str()) .get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words)) .and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction); .unwrap_or(&correction);
row.push(correction.to_owned()); new_corrections.push(correction.to_owned());
} }
row (typo, new_corrections)
}) })
.collect(); .collect();
let corrections: std::collections::HashSet<_> = rows let corrections: std::collections::HashSet<_> =
.iter() rows.values().flatten().map(ToOwned::to_owned).collect();
.flat_map(|r| {
let mut i = r.iter();
i.next();
i.map(ToOwned::to_owned)
})
.collect();
let rows: Vec<_> = rows let rows: Vec<_> = rows
.into_iter() .into_iter()
.filter(|r| !corrections.contains(&r[0])) .filter(|(typo, _)| !corrections.contains(typo.as_str()))
.collect(); .collect();
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file); let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
for row in rows { for (typo, corrections) in rows {
let mut row = corrections;
row.insert(0, typo.as_str().to_owned());
wtr.write_record(&row).unwrap(); wtr.write_record(&row).unwrap();
} }
wtr.flush().unwrap(); wtr.flush().unwrap();