test: Prevent correcting corrections

This commit is contained in:
Ed Page 2021-07-27 13:15:12 -05:00
parent fc4ec0e4a1
commit 41048d15b3

View file

@@ -4,31 +4,60 @@ use std::collections::HashSet;
use structopt::StructOpt; use structopt::StructOpt;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file); let rows: Vec<Vec<_>> = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(dict)
.records()
.map(Result::unwrap)
.map(|r| {
let row: Vec<String> = r.iter().map(ToOwned::to_owned).collect();
row
})
.collect();
let disallowed_typos = varcon_words(); let disallowed_typos = varcon_words();
let word_variants = proper_word_variants(); let word_variants = proper_word_variants();
let rows: Vec<_> = rows
.into_iter()
.filter(|r| {
let typo = &r[0];
let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo));
if is_disallowed {
eprintln!("{:?} is disallowed", typo);
}
!is_disallowed
})
.map(|r| {
let mut fields = r.into_iter();
let typo = fields.next().expect("at least a typo");
let mut row = vec![typo.clone()];
for correction in fields {
let correction = word_variants
.get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction);
row.push(correction.to_owned());
}
row
})
.collect();
let mut reader = csv::ReaderBuilder::new() let corrections: std::collections::HashSet<_> = rows
.has_headers(false) .iter()
.flexible(true) .flat_map(|r| {
.from_reader(dict); let mut i = r.iter();
for record in reader.records() { i.next();
let record = record.unwrap(); i.map(ToOwned::to_owned)
let mut record_fields = record.iter(); })
let typo = record_fields.next().unwrap(); .collect();
if disallowed_typos.contains(&unicase::UniCase::new(typo)) { let rows: Vec<_> = rows
continue; .into_iter()
} .filter(|r| !corrections.contains(&r[0]))
.collect();
let mut row = vec![typo]; let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
for correction in record_fields { for row in rows {
let correction = word_variants
.get(correction)
.and_then(|words| find_best_match(typo, correction, words))
.unwrap_or(correction);
row.push(correction);
}
wtr.write_record(&row).unwrap(); wtr.write_record(&row).unwrap();
} }
wtr.flush().unwrap(); wtr.flush().unwrap();