// typos/crates/typos-dict/tests/verify.rs
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
2021-07-27 15:09:51 -04:00
use unicase::UniCase;
2021-07-27 15:09:51 -04:00
type Dict = BTreeMap<UniCase<String>, Vec<String>>;
2022-08-01 15:45:58 -04:00
/// Regenerates the dictionary from `assets/words.csv` and hands the result, together with the
/// asset path, to `snapbox::assert_eq_path`.
#[test]
fn verify() {
    let asset_path = "assets/words.csv";
    let original = std::fs::read(asset_path).unwrap();

    let mut regenerated = Vec::new();
    generate(&mut regenerated, &original);

    snapbox::assert_eq_path(asset_path, String::from_utf8(regenerated).unwrap());
}
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
2021-07-27 15:09:51 -04:00
let mut rows = Dict::new();
csv::ReaderBuilder::new()
2021-07-27 14:15:12 -04:00
.has_headers(false)
.flexible(true)
.from_reader(dict)
.records()
.map(Result::unwrap)
2021-07-27 15:09:51 -04:00
.for_each(|r| {
let mut i = r.iter();
let mut typo = i.next().expect("typo").to_owned();
typo.make_ascii_lowercase();
let typo = UniCase::new(typo);
2021-07-27 15:28:16 -04:00
rows.entry(typo).or_insert_with(Vec::new).extend(i.map(|c| {
let mut c = c.to_owned();
c.make_ascii_lowercase();
c
}));
2021-07-27 15:09:51 -04:00
});
2021-07-27 15:40:34 -04:00
let rows: Dict = rows
.into_iter()
.filter(|(t, _)| is_word(t))
.filter_map(|(t, c)| {
let new_c: Vec<_> = c.into_iter().filter(|c| is_word(c)).collect();
if new_c.is_empty() {
None
} else {
Some((t, new_c))
}
})
.collect();
let disallowed_typos = varcon_words();
let word_variants = proper_word_variants();
2021-07-27 15:09:51 -04:00
let rows: Dict = rows
2021-07-27 14:15:12 -04:00
.into_iter()
2021-07-27 15:09:51 -04:00
.filter(|(typo, _)| {
2021-07-27 14:15:12 -04:00
let is_disallowed = disallowed_typos.contains(&unicase::UniCase::new(typo));
if is_disallowed {
eprintln!("{:?} is disallowed", typo);
}
!is_disallowed
})
2021-07-27 15:09:51 -04:00
.map(|(typo, corrections)| {
let mut new_corrections = vec![];
for correction in corrections {
2021-07-27 14:15:12 -04:00
let correction = word_variants
.get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction);
2021-07-27 15:09:51 -04:00
new_corrections.push(correction.to_owned());
2021-07-27 14:15:12 -04:00
}
2021-07-27 15:09:51 -04:00
(typo, new_corrections)
2021-07-27 14:15:12 -04:00
})
.collect();
2021-07-27 15:09:51 -04:00
let corrections: std::collections::HashSet<_> =
rows.values().flatten().map(ToOwned::to_owned).collect();
2021-07-27 14:15:12 -04:00
let rows: Vec<_> = rows
.into_iter()
2021-07-27 15:09:51 -04:00
.filter(|(typo, _)| !corrections.contains(typo.as_str()))
2021-07-27 14:15:12 -04:00
.collect();
2021-07-27 14:15:12 -04:00
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
2021-07-27 15:09:51 -04:00
for (typo, corrections) in rows {
let mut row = corrections;
row.insert(0, typo.as_str().to_owned());
wtr.write_record(&row).unwrap();
}
wtr.flush().unwrap();
}
2021-07-27 15:40:34 -04:00
/// Returns `true` when `word` is non-empty and consists solely of alphabetic characters
/// (as defined by `char::is_alphabetic`).
///
/// The explicit emptiness check matters because `Iterator::all` is vacuously true for an
/// empty string, which would let blank CSV fields through as "words".
fn is_word(word: &str) -> bool {
    !word.is_empty() && word.chars().all(char::is_alphabetic)
}
/// Collects the `word` of every variant of every entry in `varcon::VARCON` into a
/// case-insensitive set.
fn varcon_words() -> HashSet<unicase::UniCase<&'static str>> {
    // Even include improper ones because we should be letting varcon handle that rather than our
    // dictionary
    let mut words = HashSet::new();
    for cluster in varcon::VARCON.iter() {
        for entry in cluster.entries.iter() {
            for variant in entry.variants.iter() {
                words.insert(unicase::UniCase::new(variant.word));
            }
        }
    }
    words
}
/// Builds a lookup from a word to the other spellings listed alongside it in `varcon::VARCON`.
///
/// Within each entry, only variants that have at least one type whose tag is not
/// `varcon::Tag::Improper` are considered. Every such word is mapped to all the *other* such
/// words of the same entry; a word never appears in its own set. Sets accumulate across
/// entries when a word occurs in more than one.
fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
    let mut lookup: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
    for cluster in varcon::VARCON.iter() {
        for entry in cluster.entries.iter() {
            let spellings: HashSet<&'static str> = entry
                .variants
                .iter()
                .filter(|v| v.types.iter().any(|t| t.tag != Some(varcon::Tag::Improper)))
                .map(|v| v.word)
                .collect();
            for &spelling in spellings.iter() {
                lookup.entry(spelling).or_default().extend(
                    spellings
                        .iter()
                        .copied()
                        .filter(|other| *other != spelling),
                );
            }
        }
    }
    lookup
}
/// Picks the entry of `word_variants` that is strictly closer (by edit distance) to `typo`
/// than `correction` is.
///
/// Returns `None` when no variant beats `correction`. When several variants qualify, the one
/// with the smallest distance wins; ties on distance are broken by the lexicographically
/// smallest word, so the result does not depend on `HashSet` iteration order.
///
/// # Panics
///
/// Panics if `word_variants` contains `correction` itself.
fn find_best_match<'c>(
    typo: &'c str,
    correction: &'c str,
    word_variants: &HashSet<&'static str>,
) -> Option<&'c str> {
    assert!(!word_variants.contains(correction));
    let current = edit_distance::edit_distance(typo, correction);
    // `min` over `(distance, word)` tuples yields exactly what sorting and taking the first
    // element would, without the intermediate allocation.
    word_variants
        .iter()
        .map(|variant| (edit_distance::edit_distance(typo, variant), *variant))
        .filter(|(distance, _)| *distance < current)
        .min()
        .map(|(_, variant)| variant)
}