refactor: Make dict processing logic testable

Previously, all the dictionary cleanup logic lived in a single function, `fn generate<W: std::io::Write>(file: &mut W, dict: &[u8])`, which parsed the provided buffer as CSV and also took care of writing the processed dictionary back as CSV. This commit factors out the CSV handling, leaving a `process` function behind so that it can be easily tested in the following commit.
2024-12-23 08:02:15 -05:00 · 2023-06-24 11:14:40 +02:00 · 2023-06-24 11:14:40 +02:00 · 49a0eaab7b
commit 49a0eaab7b
parent 2fffb1bb2b
1 changed files with 62 additions and 33 deletions
--- a/crates/typos-dict/tests/verify.rs
+++ b/crates/typos-dict/tests/verify.rs
@ -9,38 +9,76 @@ type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;
 #[test]
 fn verify() {
    let asset_path = "assets/words.csv";
-    let data = std::fs::read(asset_path).unwrap();
+    let typos_dict = parse_dict(asset_path);
    let new_dict = process(typos_dict);
    let mut content = vec![];
-    generate(&mut content, &data);
+
    let mut wtr = csv::WriterBuilder::new()
        .flexible(true)
        .from_writer(&mut content);
    for (typo, corrections) in new_dict {
        let mut row = vec![typo.as_str().to_owned()];
        row.extend(corrections);
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();
    drop(wtr);
    let content = String::from_utf8(content).unwrap();
    snapbox::assert_eq_path(asset_path, content);
 }
-fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
+fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> {
-    let mut rows = Dict::new();
+    let data = std::fs::read(path).unwrap();
-    csv::ReaderBuilder::new()
+
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
-        .from_reader(dict)
+        .from_reader(&*data);
        .records()
        .map(Result::unwrap)
        .for_each(|r| {
            let mut i = r.iter();
            let mut typo = i.next().expect("typo").to_owned();
            typo.make_ascii_lowercase();
            let typo = UniCase::new(typo);
            rows.entry(typo)
                .or_insert_with(BTreeSet::new)
                .extend(i.map(|c| {
                    let mut c = c.to_owned();
                    c.make_ascii_lowercase();
                    c
                }));
        });
-    let rows: Dict = rows
+    reader
        .records()
        .into_iter()
        .map(Result::unwrap)
        .map(|record| {
            let mut iter = record.into_iter();
            let typo = iter.next().expect("typo");
            (
                typo.to_owned(),
                iter.map(ToOwned::to_owned).collect::<Vec<_>>(),
            )
        })
        .collect()
 }
 /// Collects `(typo, corrections)` pairs into a [`Dict`].
 ///
 /// Every typo and correction is converted to a `String` and ASCII-lowercased. Pairs that
 /// map to the same key have their corrections merged into one set.
 fn dict_from_iter<S: Into<String>>(
    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
 ) -> Dict {
    // Takes ownership of the text and lowercases its ASCII letters in place.
    fn lowered<T: Into<String>>(text: T) -> String {
        let mut owned = text.into();
        owned.make_ascii_lowercase();
        owned
    }

    iter.into_iter()
        .fold(Dict::new(), |mut merged, (typo, corrections)| {
            let key = UniCase::new(lowered(typo));
            // A repeated typo extends the existing set rather than replacing it.
            merged
                .entry(key)
                .or_default()
                .extend(corrections.into_iter().map(lowered));
            merged
        })
 }
 fn process<S: Into<String>>(
    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
 ) -> Dict {
    let dict = dict_from_iter(iter);
    let rows: Dict = dict
        .into_iter()
        .filter(|(t, _)| is_word(t))
        .filter_map(|(t, c)| {
@ -95,8 +133,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
        .iter()
        .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
        .collect();
-    let rows: Vec<_> = dict
+    dict.into_iter()
        .into_iter()
        .filter(|(typo, _)| {
            if let Some(correction) = corrections.get(typo.as_str()) {
                eprintln!("{typo} <-> {correction} cycle detected");
@ -105,15 +142,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
                true
            }
        })
-        .collect();
+        .collect()
    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
    for (typo, corrections) in rows {
        let mut row = vec![typo.as_str().to_owned()];
        row.extend(corrections);
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();
 }
 fn is_word(word: &str) -> bool {