From 49a0eaab7b91ff29b3354997e80cf4c388961523 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Sat, 24 Jun 2023 11:14:40 +0200 Subject: [PATCH 1/2] refactor: Make dict processing logic testable Previously all the dictionary cleanup logic was in the function: fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) which parsed the provided buffer as CSV and also took care of writing the processed dictionary back as CSV. This commit factors out the CSV handling, leaving a `process` function behind so that it can be easily tested in the following commit. --- crates/typos-dict/tests/verify.rs | 95 ++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/crates/typos-dict/tests/verify.rs b/crates/typos-dict/tests/verify.rs index 2b6b2e1..abf4880 100644 --- a/crates/typos-dict/tests/verify.rs +++ b/crates/typos-dict/tests/verify.rs @@ -9,38 +9,76 @@ type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>; #[test] fn verify() { let asset_path = "assets/words.csv"; - let data = std::fs::read(asset_path).unwrap(); + let typos_dict = parse_dict(asset_path); + let new_dict = process(typos_dict); let mut content = vec![]; - generate(&mut content, &data); + + let mut wtr = csv::WriterBuilder::new() + .flexible(true) + .from_writer(&mut content); + for (typo, corrections) in new_dict { + let mut row = vec![typo.as_str().to_owned()]; + row.extend(corrections); + wtr.write_record(&row).unwrap(); + } + wtr.flush().unwrap(); + drop(wtr); let content = String::from_utf8(content).unwrap(); snapbox::assert_eq_path(asset_path, content); } -fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { - let mut rows = Dict::new(); - csv::ReaderBuilder::new() +fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> { + let data = std::fs::read(path).unwrap(); + + let mut reader = csv::ReaderBuilder::new() .has_headers(false) .flexible(true) - .from_reader(dict) - .records() - .map(Result::unwrap) - .for_each(|r| { - let mut i = r.iter(); - let mut typo = i.next().expect("typo").to_owned(); - 
typo.make_ascii_lowercase(); - let typo = UniCase::new(typo); - rows.entry(typo) - .or_insert_with(BTreeSet::new) - .extend(i.map(|c| { - let mut c = c.to_owned(); - c.make_ascii_lowercase(); - c - })); - }); + .from_reader(&*data); - let rows: Dict = rows + reader + .records() + .into_iter() + .map(Result::unwrap) + .map(|record| { + let mut iter = record.into_iter(); + let typo = iter.next().expect("typo"); + ( + typo.to_owned(), + iter.map(ToOwned::to_owned).collect::<Vec<_>>(), + ) + }) + .collect() +} + +fn dict_from_iter<S: Into<String>>( + iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>, +) -> Dict { + let mut dict = Dict::new(); + + for (typo, corrections) in iter { + let typo = UniCase::new(typo.into().to_ascii_lowercase()); + + // duplicate entries are merged + dict.entry(typo) + .or_insert_with(BTreeSet::new) + .extend(corrections.into_iter().map(|c| { + let mut c = c.into(); + c.make_ascii_lowercase(); + c + })); + } + + dict +} + +fn process<S: Into<String>>( + iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>, +) -> Dict { + let dict = dict_from_iter(iter); + + let rows: Dict = dict .into_iter() .filter(|(t, _)| is_word(t)) .filter_map(|(t, c)| { @@ -95,8 +133,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { .iter() .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned()))) .collect(); - let rows: Vec<_> = dict - .into_iter() + dict.into_iter() .filter(|(typo, _)| { if let Some(correction) = corrections.get(typo.as_str()) { eprintln!("{typo} <-> {correction} cycle detected"); @@ -105,15 +142,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { true } }) - .collect(); - - let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file); - for (typo, corrections) in rows { - let mut row = vec![typo.as_str().to_owned()]; - row.extend(corrections); - wtr.write_record(&row).unwrap(); - } - wtr.flush().unwrap(); + .collect() } fn is_word(word: &str) -> bool { From 89d5a97a8a2572edb1094442a6f73462b46b4ad8 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Sat, 24 Jun 2023 12:21:29 +0200 Subject: [PATCH 2/2] test: Add 
some tests for dict processing logic --- crates/typos-dict/tests/verify.rs | 38 +++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/crates/typos-dict/tests/verify.rs b/crates/typos-dict/tests/verify.rs index abf4880..b824a85 100644 --- a/crates/typos-dict/tests/verify.rs +++ b/crates/typos-dict/tests/verify.rs @@ -145,6 +145,44 @@ fn process<S: Into<String>>( .collect() } +#[test] +fn test_merge_duplicates() { + assert_eq!( + process([("foo", ["bar"]), ("foo", ["baz"])]), + dict_from_iter([("foo", ["bar", "baz"])]) + ); +} + +#[test] +fn test_duplicate_correction_removal() { + let dict = process([("foo", ["bar", "bar"])]); + assert_eq!(dict, dict_from_iter([("foo", ["bar"])])); +} + +#[test] +fn test_cycle_removal() { + assert!(process([("foo", ["bar"]), ("bar", ["foo"])]).is_empty()); +} + +#[test] +fn test_varcon_removal() { + assert!(process([("colour", ["color"])]).is_empty()); +} + +#[test] +fn test_varcon_best_match() { + assert_eq!( + process([( + "neighourhood", // note the missing 'b' + ["neighborhood"], + )]), + dict_from_iter([( + "neighourhood", + ["neighbourhood"] // note that 'bor' has become 'bour' to match the typo + )]) + ); +} + fn is_word(word: &str) -> bool { word.chars().all(|c| c.is_alphabetic()) }