2021-07-27 15:09:51 -04:00
|
|
|
use std::collections::BTreeMap;
|
2023-06-08 09:54:36 -04:00
|
|
|
use std::collections::BTreeSet;
|
2020-05-27 21:46:41 -04:00
|
|
|
use std::collections::HashMap;
|
|
|
|
use std::collections::HashSet;
|
2021-07-27 15:09:51 -04:00
|
|
|
use unicase::UniCase;
|
2020-05-27 21:46:41 -04:00
|
|
|
|
2023-06-08 09:54:36 -04:00
|
|
|
/// Typo dictionary: maps a typo (compared case-insensitively via `UniCase`) to the ordered set
/// of its corrections.
type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;
|
2021-07-27 15:09:51 -04:00
|
|
|
|
2022-08-01 15:45:58 -04:00
|
|
|
/// Runs `generate` over `assets/words.csv` and hands the result to `snapbox::assert_eq_path`
/// together with that same path, so the checked-in file is compared against its own
/// regenerated form.
///
/// Panics if the asset cannot be read or the generated output is not valid UTF-8.
#[test]
fn verify() {
    let asset_path = "assets/words.csv";
    let data = std::fs::read(asset_path).unwrap();

    // `generate` writes into any `std::io::Write`; an in-memory buffer keeps the test from
    // touching the asset directly.
    let mut content = vec![];
    generate(&mut content, &data);

    let content = String::from_utf8(content).unwrap();
    snapbox::assert_eq_path(asset_path, content);
}
|
|
|
|
|
2020-05-27 21:46:41 -04:00
|
|
|
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
2021-07-27 15:09:51 -04:00
|
|
|
let mut rows = Dict::new();
|
|
|
|
csv::ReaderBuilder::new()
|
2021-07-27 14:15:12 -04:00
|
|
|
.has_headers(false)
|
|
|
|
.flexible(true)
|
|
|
|
.from_reader(dict)
|
|
|
|
.records()
|
|
|
|
.map(Result::unwrap)
|
2021-07-27 15:09:51 -04:00
|
|
|
.for_each(|r| {
|
|
|
|
let mut i = r.iter();
|
2021-07-27 15:11:52 -04:00
|
|
|
let mut typo = i.next().expect("typo").to_owned();
|
|
|
|
typo.make_ascii_lowercase();
|
|
|
|
let typo = UniCase::new(typo);
|
2023-06-08 09:54:36 -04:00
|
|
|
rows.entry(typo)
|
|
|
|
.or_insert_with(BTreeSet::new)
|
|
|
|
.extend(i.map(|c| {
|
|
|
|
let mut c = c.to_owned();
|
|
|
|
c.make_ascii_lowercase();
|
|
|
|
c
|
|
|
|
}));
|
2021-07-27 15:09:51 -04:00
|
|
|
});
|
2020-05-27 21:46:41 -04:00
|
|
|
|
2021-07-27 15:40:34 -04:00
|
|
|
let rows: Dict = rows
|
|
|
|
.into_iter()
|
|
|
|
.filter(|(t, _)| is_word(t))
|
|
|
|
.filter_map(|(t, c)| {
|
2023-06-08 09:54:36 -04:00
|
|
|
let new_c: BTreeSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
|
2021-07-27 15:40:34 -04:00
|
|
|
if new_c.is_empty() {
|
|
|
|
None
|
|
|
|
} else {
|
|
|
|
Some((t, new_c))
|
|
|
|
}
|
|
|
|
})
|
|
|
|
.collect();
|
|
|
|
|
2022-12-06 11:47:08 -05:00
|
|
|
let varcon_words = varcon_words();
|
|
|
|
let allowed_words = allowed_words();
|
2021-05-15 20:29:27 -04:00
|
|
|
let word_variants = proper_word_variants();
|
2023-06-08 09:54:36 -04:00
|
|
|
let rows: Vec<_> = rows
|
2021-07-27 14:15:12 -04:00
|
|
|
.into_iter()
|
2021-07-27 15:09:51 -04:00
|
|
|
.filter(|(typo, _)| {
|
2022-12-06 11:47:08 -05:00
|
|
|
let is_disallowed = varcon_words.contains(&unicase::UniCase::new(typo));
|
2021-07-27 14:15:12 -04:00
|
|
|
if is_disallowed {
|
2022-12-06 11:47:08 -05:00
|
|
|
eprintln!("{:?} is disallowed; in varcon", typo);
|
2021-07-27 14:15:12 -04:00
|
|
|
}
|
|
|
|
!is_disallowed
|
|
|
|
})
|
2022-12-06 11:47:08 -05:00
|
|
|
.filter(|(typo, _)| {
|
|
|
|
if let Some(reason) = allowed_words.get(typo.as_ref()) {
|
|
|
|
eprintln!("{:?} is disallowed; {}", typo, reason);
|
|
|
|
false
|
|
|
|
} else {
|
|
|
|
true
|
|
|
|
}
|
|
|
|
})
|
2021-07-27 15:09:51 -04:00
|
|
|
.map(|(typo, corrections)| {
|
2023-06-08 09:54:36 -04:00
|
|
|
let mut new_corrections = BTreeSet::new();
|
2021-07-27 15:09:51 -04:00
|
|
|
for correction in corrections {
|
2021-07-27 14:15:12 -04:00
|
|
|
let correction = word_variants
|
|
|
|
.get(correction.as_str())
|
|
|
|
.and_then(|words| find_best_match(&typo, correction.as_str(), words))
|
|
|
|
.unwrap_or(&correction);
|
2023-06-08 09:54:36 -04:00
|
|
|
new_corrections.insert(correction.to_owned());
|
2021-07-27 14:15:12 -04:00
|
|
|
}
|
2021-07-27 15:09:51 -04:00
|
|
|
(typo, new_corrections)
|
2021-07-27 14:15:12 -04:00
|
|
|
})
|
|
|
|
.collect();
|
2023-06-08 09:54:36 -04:00
|
|
|
let mut dict = Dict::new();
|
|
|
|
for (bad, good) in rows {
|
|
|
|
let current = dict.entry(bad).or_default();
|
|
|
|
current.extend(good);
|
|
|
|
}
|
2020-05-27 21:46:41 -04:00
|
|
|
|
2023-06-08 10:23:10 -04:00
|
|
|
let corrections: HashMap<_, _> = dict
|
|
|
|
.iter()
|
|
|
|
.flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
|
|
|
|
.collect();
|
2023-06-08 09:54:36 -04:00
|
|
|
let rows: Vec<_> = dict
|
2021-07-27 14:15:12 -04:00
|
|
|
.into_iter()
|
2023-06-08 10:23:10 -04:00
|
|
|
.filter(|(typo, _)| {
|
|
|
|
if let Some(correction) = corrections.get(typo.as_str()) {
|
|
|
|
eprintln!("{typo} <-> {correction} cycle detected");
|
|
|
|
false
|
|
|
|
} else {
|
|
|
|
true
|
|
|
|
}
|
|
|
|
})
|
2021-07-27 14:15:12 -04:00
|
|
|
.collect();
|
2021-05-15 20:06:04 -04:00
|
|
|
|
2021-07-27 14:15:12 -04:00
|
|
|
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
|
2021-07-27 15:09:51 -04:00
|
|
|
for (typo, corrections) in rows {
|
2023-06-08 09:54:36 -04:00
|
|
|
let mut row = vec![typo.as_str().to_owned()];
|
|
|
|
row.extend(corrections);
|
2021-05-15 20:06:04 -04:00
|
|
|
wtr.write_record(&row).unwrap();
|
2020-05-27 21:46:41 -04:00
|
|
|
}
|
|
|
|
wtr.flush().unwrap();
|
|
|
|
}
|
|
|
|
|
2021-07-27 15:40:34 -04:00
|
|
|
/// Returns `true` when every character of `word` is alphabetic (per `char::is_alphabetic`).
///
/// An empty string has no offending character and therefore also yields `true`.
fn is_word(word: &str) -> bool {
    word.chars().all(char::is_alphabetic)
}
|
|
|
|
|
2021-05-15 20:29:27 -04:00
|
|
|
fn varcon_words() -> HashSet<unicase::UniCase<&'static str>> {
|
|
|
|
// Even include improper ones because we should be letting varcon handle that rather than our
|
|
|
|
// dictionary
|
2020-05-27 21:46:41 -04:00
|
|
|
varcon::VARCON
|
|
|
|
.iter()
|
|
|
|
.flat_map(|c| c.entries.iter())
|
|
|
|
.flat_map(|e| e.variants.iter())
|
|
|
|
.map(|v| unicase::UniCase::new(v.word))
|
|
|
|
.collect()
|
|
|
|
}
|
|
|
|
|
2021-05-15 20:29:27 -04:00
|
|
|
fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
|
2020-05-27 21:46:41 -04:00
|
|
|
let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
|
|
|
|
for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) {
|
|
|
|
let variants: HashSet<_> = entry
|
|
|
|
.variants
|
|
|
|
.iter()
|
|
|
|
.filter(|v| v.types.iter().any(|t| t.tag != Some(varcon::Tag::Improper)))
|
|
|
|
.map(|v| v.word)
|
|
|
|
.collect();
|
|
|
|
for variant in variants.iter() {
|
|
|
|
let set = words.entry(variant).or_insert_with(HashSet::new);
|
|
|
|
set.extend(variants.iter().filter(|v| *v != variant));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
words
|
|
|
|
}
|
|
|
|
|
|
|
|
fn find_best_match<'c>(
|
|
|
|
typo: &'c str,
|
|
|
|
correction: &'c str,
|
2021-05-15 20:29:27 -04:00
|
|
|
word_variants: &HashSet<&'static str>,
|
2020-05-27 21:46:41 -04:00
|
|
|
) -> Option<&'c str> {
|
2021-05-15 20:29:27 -04:00
|
|
|
assert!(!word_variants.contains(correction));
|
2020-05-27 21:46:41 -04:00
|
|
|
let current = edit_distance::edit_distance(typo, correction);
|
2021-05-15 20:29:27 -04:00
|
|
|
let mut matches: Vec<_> = word_variants
|
2020-05-27 21:46:41 -04:00
|
|
|
.iter()
|
|
|
|
.map(|r| (edit_distance::edit_distance(typo, r), *r))
|
|
|
|
.filter(|(d, _)| *d < current)
|
|
|
|
.collect();
|
|
|
|
matches.sort_unstable();
|
|
|
|
matches.into_iter().next().map(|(_, r)| r)
|
|
|
|
}
|
2022-12-06 11:47:08 -05:00
|
|
|
|
|
|
|
fn allowed_words() -> std::collections::HashMap<String, String> {
|
|
|
|
let allowed_path = "assets/allowed.csv";
|
|
|
|
let data = std::fs::read(allowed_path).unwrap();
|
|
|
|
csv::ReaderBuilder::new()
|
|
|
|
.has_headers(false)
|
|
|
|
.flexible(true)
|
|
|
|
.from_reader(data.as_slice())
|
|
|
|
.records()
|
|
|
|
.map(Result::unwrap)
|
|
|
|
.map(|r| {
|
|
|
|
let mut i = r.iter();
|
|
|
|
let mut typo = i.next().expect("typo").to_owned();
|
|
|
|
typo.make_ascii_lowercase();
|
|
|
|
let reason = i.next().expect("reason").to_owned();
|
|
|
|
(typo, reason)
|
|
|
|
})
|
|
|
|
.collect()
|
|
|
|
}
|