diff --git a/crates/typos-dict/tests/verify.rs b/crates/typos-dict/tests/verify.rs index 2b6b2e1..b824a85 100644 --- a/crates/typos-dict/tests/verify.rs +++ b/crates/typos-dict/tests/verify.rs @@ -9,38 +9,76 @@ type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>; #[test] fn verify() { let asset_path = "assets/words.csv"; - let data = std::fs::read(asset_path).unwrap(); + let typos_dict = parse_dict(asset_path); + let new_dict = process(typos_dict); let mut content = vec![]; - generate(&mut content, &data); + + let mut wtr = csv::WriterBuilder::new() + .flexible(true) + .from_writer(&mut content); + for (typo, corrections) in new_dict { + let mut row = vec![typo.as_str().to_owned()]; + row.extend(corrections); + wtr.write_record(&row).unwrap(); + } + wtr.flush().unwrap(); + drop(wtr); let content = String::from_utf8(content).unwrap(); snapbox::assert_eq_path(asset_path, content); } -fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { - let mut rows = Dict::new(); - csv::ReaderBuilder::new() +fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> { + let data = std::fs::read(path).unwrap(); + + let mut reader = csv::ReaderBuilder::new() .has_headers(false) .flexible(true) - .from_reader(dict) - .records() - .map(Result::unwrap) - .for_each(|r| { - let mut i = r.iter(); - let mut typo = i.next().expect("typo").to_owned(); - typo.make_ascii_lowercase(); - let typo = UniCase::new(typo); - rows.entry(typo) - .or_insert_with(BTreeSet::new) - .extend(i.map(|c| { - let mut c = c.to_owned(); - c.make_ascii_lowercase(); - c - })); - }); + .from_reader(&*data); - let rows: Dict = rows + reader + .records() + .into_iter() + .map(Result::unwrap) + .map(|record| { + let mut iter = record.into_iter(); + let typo = iter.next().expect("typo"); + ( + typo.to_owned(), + iter.map(ToOwned::to_owned).collect::<Vec<_>>(), + ) + }) + .collect() +} + +fn dict_from_iter<S: Into<String>>( + iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>, +) -> Dict { + let mut dict = Dict::new(); + + for (typo, corrections) in iter { + let typo = 
UniCase::new(typo.into().to_ascii_lowercase()); + + // duplicate entries are merged + dict.entry(typo) + .or_insert_with(BTreeSet::new) + .extend(corrections.into_iter().map(|c| { + let mut c = c.into(); + c.make_ascii_lowercase(); + c + })); + } + + dict +} + +fn process<S: Into<String>>( + iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>, +) -> Dict { + let dict = dict_from_iter(iter); + + let rows: Dict = dict .into_iter() .filter(|(t, _)| is_word(t)) .filter_map(|(t, c)| { @@ -95,8 +133,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { .iter() .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned()))) .collect(); - let rows: Vec<_> = dict - .into_iter() + dict.into_iter() .filter(|(typo, _)| { if let Some(correction) = corrections.get(typo.as_str()) { eprintln!("{typo} <-> {correction} cycle detected"); @@ -105,15 +142,45 @@ false + } else { + true + } + }) - .collect(); + .collect() +} - let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file); - for (typo, corrections) in rows { - let mut row = vec![typo.as_str().to_owned()]; - row.extend(corrections); - wtr.write_record(&row).unwrap(); - } - wtr.flush().unwrap(); +#[test] +fn test_merge_duplicates() { + assert_eq!( + process([("foo", ["bar"]), ("foo", ["baz"])]), + dict_from_iter([("foo", ["bar", "baz"])]) + ); +} + +#[test] +fn test_duplicate_correction_removal() { + let dict = process([("foo", ["bar", "bar"])]); + assert_eq!(dict, dict_from_iter([("foo", ["bar"])])); +} + +#[test] +fn test_cycle_removal() { + assert!(process([("foo", ["bar"]), ("bar", ["foo"])]).is_empty()); +} + +#[test] +fn test_varcon_removal() { + assert!(process([("colour", ["color"])]).is_empty()); +} + +#[test] +fn test_varcon_best_match() { + assert_eq!( + process([( + "neighourhood", // note the missing 'b' + ["neighborhood"], + )]), + dict_from_iter([( + "neighourhood", + ["neighbourhood"] // note that 'bor' has become 'bour' to match the typo + )]) + ); } fn is_word(word: &str) -> bool {