use indexmap::IndexSet;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
use unicase::UniCase;

type Dict = BTreeMap<UniCase<String>, IndexSet<String>>;
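
// Round-trip snapshot test: `assets/words.csv` must already be in the
// normalized form produced by `process`.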
#[test]
fn verify() {
    let typos_dict = parse_dict("assets/words.csv");
    let new_dict = process(typos_dict);

    let mut content = vec![];

    let mut wtr = csv::WriterBuilder::new()
        .flexible(true)
        .from_writer(&mut content);
    for (typo, corrections) in new_dict {
        let mut row = vec![typo.as_str().to_owned()];
        row.extend(corrections);
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();
    drop(wtr);

    let content = String::from_utf8(content).unwrap();
    snapbox::assert_data_eq!(content, snapbox::file!["../assets/words.csv"].raw());
}
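
/// Parses a header-less `typo,correction,...` CSV file into `(typo, corrections)`
/// rows, preserving file order.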
fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> {
    let data = std::fs::read(path).unwrap();

    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(&*data);

    reader
        .records()
        .map(Result::unwrap)
        .map(|record| {
            let mut iter = record.into_iter();
            let typo = iter.next().expect("typo");
            (
                typo.to_owned(),
                iter.map(ToOwned::to_owned).collect::<Vec<_>>(),
            )
        })
        .collect()
}
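
/// Builds a `Dict` from raw rows, lowercasing typos and corrections and merging
/// the corrections of duplicate typos.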
fn dict_from_iter<S: Into<String>>(
    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
) -> Dict {
    let mut dict = Dict::new();

    for (typo, corrections) in iter {
        let typo = UniCase::new(typo.into().to_ascii_lowercase());

        // duplicate entries are merged
        dict.entry(typo)
            .or_default()
            .extend(corrections.into_iter().map(|c| {
                let mut c = c.into();
                c.make_ascii_lowercase();
                c
            }));
    }

    dict
}
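
/// Normalizes raw dictionary rows: drops entries that are not plain words,
/// drops typos that varcon or the allow lists consider real words, rewrites
/// each correction to the varcon variant closest to the typo, and removes
/// typo <-> correction cycles.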
fn process<S: Into<String>>(
    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
) -> Dict {
    let dict = dict_from_iter(iter);

    // Keep only purely alphabetic typos and corrections.
    let rows: Dict = dict
        .into_iter()
        .filter(|(t, _)| is_word(t))
        .map(|(t, c)| {
            let new_c: IndexSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
            (t, new_c)
        })
        .collect();

    let varcon_words = varcon_words();
    let allowed_words = allowed_words();
    let word_variants = proper_word_variants();
    let rows: Vec<_> = rows
        .into_iter()
        // A "typo" that varcon knows as a legitimate spelling variant is a real word.
        .filter(|(typo, _)| {
            let is_disallowed = varcon_words.contains(&UniCase::new(typo));
            if is_disallowed {
                eprintln!("{:?} is disallowed; in varcon", typo);
            }
            !is_disallowed
        })
        // Likewise for anything on the explicit allow lists.
        .filter(|(typo, _)| {
            if let Some(reason) = allowed_words.get(typo.as_ref()) {
                eprintln!("{:?} is disallowed; {}", typo, reason);
                false
            } else {
                true
            }
        })
        // Rewrite each correction to the varcon variant closest to the typo,
        // when one is strictly closer than the correction itself.
        .map(|(typo, corrections)| {
            let mut new_corrections = IndexSet::new();
            for correction in corrections {
                let correction = word_variants
                    .get(correction.as_str())
                    .and_then(|words| find_best_match(&typo, correction.as_str(), words))
                    .unwrap_or(&correction);
                new_corrections.insert(correction.to_owned());
            }
            (typo, new_corrections)
        })
        .collect();
    let mut dict = Dict::new();
    for (bad, good) in rows {
        let current = dict.entry(bad).or_default();
        current.extend(good);
    }

    // Drop entries whose typo also appears as a correction of another entry.
    let corrections: HashMap<_, _> = dict
        .iter()
        .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
        .collect();
    dict.into_iter()
        .filter(|(typo, _)| {
            if let Some(correction) = corrections.get(typo.as_str()) {
                eprintln!("{typo} <-> {correction} cycle detected");
                false
            } else {
                true
            }
        })
        .collect()
}

#[test]
fn test_preserve_correction_order() {
    let dict = process([("foo", ["xyz", "abc"])]);
    let mut corrections = dict.get(&UniCase::new("foo".into())).unwrap().iter();
    assert_eq!(corrections.next().unwrap(), "xyz");
    assert_eq!(corrections.next().unwrap(), "abc");
}

#[test]
fn test_merge_duplicates() {
    assert_eq!(
        process([("foo", ["bar"]), ("foo", ["baz"])]),
        dict_from_iter([("foo", ["bar", "baz"])])
    );
}

#[test]
fn test_duplicate_correction_removal() {
    let dict = process([("foo", ["bar", "bar"])]);
    assert_eq!(dict, dict_from_iter([("foo", ["bar"])]));
}

#[test]
fn test_cycle_removal() {
    assert!(process([("foo", ["foobar"]), ("foobar", ["foo"])]).is_empty());
}

#[test]
fn test_varcon_removal() {
    assert!(process([("colour", ["color"])]).is_empty());
}

#[test]
fn test_varcon_best_match() {
    assert_eq!(
        process([(
            "neighourhood", // note the missing 'b'
            ["neighborhood"],
        )]),
        dict_from_iter([(
            "neighourhood",
            ["neighbourhood"] // note that 'bor' has become 'bour' to match the typo
        )])
    );
}
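
/// Only purely alphabetic entries count as words.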
fn is_word(word: &str) -> bool {
    word.chars().all(|c| c.is_alphabetic())
}
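
/// Every spelling variant varcon knows about, compared case-insensitively.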
fn varcon_words() -> HashSet<UniCase<&'static str>> {
    // Even include improper ones because we should be letting varcon handle that rather than our
    // dictionary
    varcon::VARCON
        .iter()
        .flat_map(|c| c.entries.iter())
        .flat_map(|e| e.variants.iter())
        .map(|v| UniCase::new(v.word))
        .collect()
}
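
/// Maps each proper (non-`Improper`) varcon variant to its sibling variants.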
fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
    let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
    for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) {
        let variants: HashSet<_> = entry
            .variants
            .iter()
            .filter(|v| v.types.iter().any(|t| t.tag != Some(varcon::Tag::Improper)))
            .map(|v| v.word)
            .collect();
        for variant in variants.iter() {
            let set = words.entry(variant).or_default();
            set.extend(variants.iter().filter(|v| *v != variant));
        }
    }
    words
}
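
/// Picks the variant of `correction` with the smallest edit distance to `typo`,
/// returning `None` when no variant beats `correction` itself.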
fn find_best_match<'c>(
    typo: &'c str,
    correction: &'c str,
    word_variants: &HashSet<&'static str>,
) -> Option<&'c str> {
    assert!(!word_variants.contains(correction));
    #[allow(clippy::single_match)]
    match (typo, correction) {
        // Picking the worst option due to a letter swap being an edit distance of two
        ("alinging", "aligning") => {
            return None;
        }
        _ => {}
    }
    let current = edit_distance::edit_distance(typo, correction);
    let mut matches: Vec<_> = word_variants
        .iter()
        .map(|r| (edit_distance::edit_distance(typo, r), *r))
        .filter(|(d, _)| *d < current)
        .collect();
    matches.sort_unstable();
    matches.into_iter().next().map(|(_, r)| r)
}
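
/// Words that must never be flagged as typos, mapped to the reason they are
/// allowed: all of `assets/english.csv` plus the local `assets/allowed.csv`.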
fn allowed_words() -> HashMap<String, String> {
    let allowed_path = "assets/english.csv";
    let english_data = std::fs::read(allowed_path).unwrap();
    let mut allowed_english = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(english_data.as_slice());
    let allowed_english = allowed_english.records().map(Result::unwrap).map(|r| {
        let mut i = r.iter();
        let mut typo = i.next().expect("typo").to_owned();
        typo.make_ascii_lowercase();
        (typo, String::from("english word"))
    });

    let allowed_path = "assets/allowed.csv";
    let local_data = std::fs::read(allowed_path).unwrap();
    let mut allowed_local = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(local_data.as_slice());
    let allowed_local = allowed_local.records().map(Result::unwrap).map(|r| {
        let mut i = r.iter();
        let mut typo = i.next().expect("typo").to_owned();
        typo.make_ascii_lowercase();
        let reason = i.next().expect("reason").to_owned();
        (typo, reason)
    });

    allowed_english.chain(allowed_local).collect()
}