//! Verification test for the `typos-dict` dictionary
//! (`typos/crates/typos-dict/tests/verify.rs`).
use indexmap::IndexSet;
2021-07-27 15:09:51 -04:00
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
2021-07-27 15:09:51 -04:00
use unicase::UniCase;
type Dict = BTreeMap<UniCase<String>, IndexSet<String>>;
2021-07-27 15:09:51 -04:00
2022-08-01 15:45:58 -04:00
#[test]
fn verify() {
    // Re-process the checked-in dictionary and assert it round-trips, i.e. the
    // file on disk is already in canonical, fully-processed form.
    let typos_dict = parse_dict("assets/words.csv");
    let new_dict = process(typos_dict);

    let mut content = vec![];
    let mut wtr = csv::WriterBuilder::new()
        .flexible(true)
        .from_writer(&mut content);
    for (typo, corrections) in new_dict {
        // One row per typo: `typo,correction1,correction2,...`
        let mut row = vec![typo.as_str().to_owned()];
        row.extend(corrections);
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();
    drop(wtr);

    let content = String::from_utf8(content).unwrap();
    snapbox::assert_data_eq!(content, snapbox::file!["../assets/words.csv"].raw());
}
fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> {
let data = std::fs::read(path).unwrap();
let mut reader = csv::ReaderBuilder::new()
2021-07-27 14:15:12 -04:00
.has_headers(false)
.flexible(true)
.from_reader(&*data);
reader
2021-07-27 14:15:12 -04:00
.records()
.map(Result::unwrap)
.map(|record| {
let mut iter = record.into_iter();
let typo = iter.next().expect("typo");
(
typo.to_owned(),
iter.map(ToOwned::to_owned).collect::<Vec<_>>(),
)
})
.collect()
}
/// Build a [`Dict`] from `(typo, corrections)` pairs, lowercasing everything
/// and merging corrections when the same typo appears more than once.
fn dict_from_iter<S: Into<String>>(
    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
) -> Dict {
    let mut dict = Dict::new();
    for (typo, corrections) in iter {
        // Typos are matched case-insensitively; normalize to ASCII lowercase.
        let key = UniCase::new(typo.into().to_ascii_lowercase());
        let normalized = corrections.into_iter().map(|correction| {
            let mut correction = correction.into();
            correction.make_ascii_lowercase();
            correction
        });
        // duplicate entries are merged
        dict.entry(key).or_default().extend(normalized);
    }
    dict
}
fn process<S: Into<String>>(
iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
) -> Dict {
let dict = dict_from_iter(iter);
let rows: Dict = dict
2021-07-27 15:40:34 -04:00
.into_iter()
.filter(|(t, _)| is_word(t))
.map(|(t, c)| {
let new_c: IndexSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
(t, new_c)
2021-07-27 15:40:34 -04:00
})
.collect();
let varcon_words = varcon_words();
let allowed_words = allowed_words();
let word_variants = proper_word_variants();
2023-06-08 09:54:36 -04:00
let rows: Vec<_> = rows
2021-07-27 14:15:12 -04:00
.into_iter()
2021-07-27 15:09:51 -04:00
.filter(|(typo, _)| {
2024-05-02 12:59:32 -04:00
let is_disallowed = varcon_words.contains(&UniCase::new(typo));
2021-07-27 14:15:12 -04:00
if is_disallowed {
2024-07-26 17:08:02 -04:00
eprintln!("{typo:?} is disallowed; in varcon");
2021-07-27 14:15:12 -04:00
}
!is_disallowed
})
.filter(|(typo, _)| {
if let Some(reason) = allowed_words.get(typo.as_ref()) {
2024-07-26 17:08:02 -04:00
eprintln!("{typo:?} is disallowed; {reason}");
false
} else {
true
}
})
2021-07-27 15:09:51 -04:00
.map(|(typo, corrections)| {
let mut new_corrections = IndexSet::new();
2021-07-27 15:09:51 -04:00
for correction in corrections {
2021-07-27 14:15:12 -04:00
let correction = word_variants
.get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction);
2023-06-08 09:54:36 -04:00
new_corrections.insert(correction.to_owned());
2021-07-27 14:15:12 -04:00
}
2021-07-27 15:09:51 -04:00
(typo, new_corrections)
2021-07-27 14:15:12 -04:00
})
.collect();
2023-06-08 09:54:36 -04:00
let mut dict = Dict::new();
for (bad, good) in rows {
let current = dict.entry(bad).or_default();
current.extend(good);
}
2023-06-08 10:23:10 -04:00
let corrections: HashMap<_, _> = dict
.iter()
.flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
.collect();
dict.into_iter()
2023-06-08 10:23:10 -04:00
.filter(|(typo, _)| {
if let Some(correction) = corrections.get(typo.as_str()) {
eprintln!("{typo} <-> {correction} cycle detected");
false
} else {
true
}
})
.collect()
}
#[test]
fn test_preserve_correction_order() {
    // Corrections must come back in insertion order, not sorted.
    let dict = process([("foo", ["xyz", "abc"])]);
    let corrections: Vec<_> = dict
        .get(&UniCase::new("foo".into()))
        .unwrap()
        .iter()
        .collect();
    assert_eq!(corrections, ["xyz", "abc"]);
}
#[test]
fn test_merge_duplicates() {
    // Two rows for the same typo collapse into one entry with both corrections.
    let merged = process([("foo", ["bar"]), ("foo", ["baz"])]);
    let expected = dict_from_iter([("foo", ["bar", "baz"])]);
    assert_eq!(merged, expected);
}
#[test]
fn test_duplicate_correction_removal() {
    // Repeating the same correction leaves only one copy.
    let deduped = process([("foo", ["bar", "bar"])]);
    assert_eq!(deduped, dict_from_iter([("foo", ["bar"])]));
}
#[test]
fn test_cycle_removal() {
    // A pair of entries correcting to each other is dropped entirely.
    let cyclic = process([("foo", ["foobar"]), ("foobar", ["foo"])]);
    assert!(cyclic.is_empty());
}
#[test]
fn test_varcon_removal() {
    // "colour" is a varcon locale variant, so it must not be treated as a typo.
    let filtered = process([("colour", ["color"])]);
    assert!(filtered.is_empty());
}
#[test]
fn test_varcon_best_match() {
    // note the missing 'b' in the typo
    let actual = process([("neighourhood", ["neighborhood"])]);
    // note that 'bor' has become 'bour' to match the typo
    let expected = dict_from_iter([("neighourhood", ["neighbourhood"])]);
    assert_eq!(actual, expected);
}
/// True when every character of `word` is alphabetic (vacuously true for "").
fn is_word(word: &str) -> bool {
    for c in word.chars() {
        if !c.is_alphabetic() {
            return false;
        }
    }
    true
}
2024-04-26 22:14:01 -04:00
fn varcon_words() -> HashSet<UniCase<&'static str>> {
// Even include improper ones because we should be letting varcon handle that rather than our
// dictionary
varcon::VARCON
.iter()
.filter(|c| c.verified)
.flat_map(|c| c.entries.iter())
.flat_map(|e| e.variants.iter())
2024-05-02 12:59:32 -04:00
.map(|v| UniCase::new(v.word))
.collect()
}
/// Map each proper (non-Improper) varcon word to the set of its sibling
/// variants from the same entry, e.g. "color" -> {"colour", ...}.
fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
    let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
    for entry in varcon::VARCON
        .iter()
        .filter(|c| c.verified)
        .flat_map(|c| c.entries.iter())
    {
        let variants: HashSet<_> = entry
            .variants
            .iter()
            .filter(|v| v.types.iter().any(|t| t.tag != Some(varcon::Tag::Improper)))
            .map(|v| v.word)
            .collect();
        // Each variant maps to all of its siblings, excluding itself.
        for variant in variants.iter() {
            let set = words.entry(variant).or_default();
            set.extend(variants.iter().filter(|v| *v != variant));
        }
    }
    words
}
/// Among `correction`'s locale variants, pick the one with the smallest edit
/// distance to `typo`, provided it beats `correction` itself; ties break
/// lexicographically. Returns `None` when no variant is strictly closer.
fn find_best_match<'c>(
    typo: &'c str,
    correction: &'c str,
    word_variants: &HashSet<&'static str>,
) -> Option<&'c str> {
    assert!(!word_variants.contains(correction));
    // Picking the worst option due to a letter swap being an edit distance of two
    if matches!(
        (typo, correction),
        ("alinging", "aligning") | ("alingment", "alignment") | ("alingments", "alignments")
    ) {
        return None;
    }
    let baseline = edit_distance::edit_distance(typo, correction);
    word_variants
        .iter()
        .map(|variant| (edit_distance::edit_distance(typo, variant), *variant))
        .filter(|&(distance, _)| distance < baseline)
        .min()
        .map(|(_, variant)| variant)
}
2024-04-26 22:14:01 -04:00
fn allowed_words() -> HashMap<String, String> {
let allowed_path = "assets/english.csv";
let english_data = std::fs::read(allowed_path).unwrap();
let mut allowed_english = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(english_data.as_slice());
let allowed_english = allowed_english.records().map(Result::unwrap).map(|r| {
let mut i = r.iter();
let mut typo = i.next().expect("typo").to_owned();
typo.make_ascii_lowercase();
(typo, String::from("english word"))
});
let allowed_path = "assets/allowed.csv";
let local_data = std::fs::read(allowed_path).unwrap();
let mut allowed_local = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.from_reader(local_data.as_slice());
let allowed_local = allowed_local.records().map(Result::unwrap).map(|r| {
let mut i = r.iter();
let mut typo = i.next().expect("typo").to_owned();
typo.make_ascii_lowercase();
let reason = i.next().expect("reason").to_owned();
(typo, reason)
});
allowed_english.chain(allowed_local).collect()
}