refactor: Make dict processing logic testable

Previously, all the dictionary cleanup logic was in the function:

    fn generate<W: std::io::Write>(file: &mut W, dict: &[u8])

which parsed the provided buffer as CSV and also took care of writing
the processed dictionary back as CSV.  This commit factors out the CSV
handling, leaving a `process` function behind so that it can be easily
tested in the following commit.
Author: Martin Fischer
Date:   2023-06-24 11:14:40 +02:00
Parent: 2fffb1bb2b
Commit: 49a0eaab7b

@@ -9,38 +9,76 @@ type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;
 #[test]
 fn verify() {
     let asset_path = "assets/words.csv";
-    let data = std::fs::read(asset_path).unwrap();
+    let typos_dict = parse_dict(asset_path);
+    let new_dict = process(typos_dict);
 
     let mut content = vec![];
-    generate(&mut content, &data);
+
+    let mut wtr = csv::WriterBuilder::new()
+        .flexible(true)
+        .from_writer(&mut content);
+    for (typo, corrections) in new_dict {
+        let mut row = vec![typo.as_str().to_owned()];
+        row.extend(corrections);
+        wtr.write_record(&row).unwrap();
+    }
+    wtr.flush().unwrap();
+    drop(wtr);
 
     let content = String::from_utf8(content).unwrap();
     snapbox::assert_eq_path(asset_path, content);
 }
 
-fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
-    let mut rows = Dict::new();
-    csv::ReaderBuilder::new()
+fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> {
+    let data = std::fs::read(path).unwrap();
+    let mut reader = csv::ReaderBuilder::new()
         .has_headers(false)
         .flexible(true)
-        .from_reader(dict)
-        .records()
-        .map(Result::unwrap)
-        .for_each(|r| {
-            let mut i = r.iter();
-            let mut typo = i.next().expect("typo").to_owned();
-            typo.make_ascii_lowercase();
-            let typo = UniCase::new(typo);
-            rows.entry(typo)
-                .or_insert_with(BTreeSet::new)
-                .extend(i.map(|c| {
-                    let mut c = c.to_owned();
-                    c.make_ascii_lowercase();
-                    c
-                }));
-        });
-
-    let rows: Dict = rows
+        .from_reader(&*data);
+    reader
+        .records()
+        .map(Result::unwrap)
+        .map(|record| {
+            let mut iter = record.into_iter();
+            let typo = iter.next().expect("typo");
+            (
+                typo.to_owned(),
+                iter.map(ToOwned::to_owned).collect::<Vec<_>>(),
+            )
+        })
+        .collect()
+}
+
+fn dict_from_iter<S: Into<String>>(
+    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
+) -> Dict {
+    let mut dict = Dict::new();
+    for (typo, corrections) in iter {
+        let typo = UniCase::new(typo.into().to_ascii_lowercase());
+        // duplicate entries are merged
+        dict.entry(typo)
+            .or_insert_with(BTreeSet::new)
+            .extend(corrections.into_iter().map(|c| {
+                let mut c = c.into();
+                c.make_ascii_lowercase();
+                c
+            }));
+    }
+    dict
+}
+
+fn process<S: Into<String>>(
+    iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
+) -> Dict {
+    let dict = dict_from_iter(iter);
+    let rows: Dict = dict
         .into_iter()
         .filter(|(t, _)| is_word(t))
         .filter_map(|(t, c)| {
@@ -95,8 +133,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
         .iter()
         .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
         .collect();
-    let rows: Vec<_> = dict
-        .into_iter()
+    dict.into_iter()
         .filter(|(typo, _)| {
             if let Some(correction) = corrections.get(typo.as_str()) {
                 eprintln!("{typo} <-> {correction} cycle detected");
@@ -105,15 +142,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
                 true
             }
         })
-        .collect();
-
-    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
-    for (typo, corrections) in rows {
-        let mut row = vec![typo.as_str().to_owned()];
-        row.extend(corrections);
-        wtr.write_record(&row).unwrap();
-    }
-    wtr.flush().unwrap();
+        .collect()
 }
 
 fn is_word(word: &str) -> bool {
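
With the CSV parsing and serialization factored out, `process` can be fed plain
in-memory tuples. A minimal sketch of the kind of unit test this enables
(hypothetical entries, not the test the follow-up commit actually adds; it
assumes `is_word` accepts "teh" and that the elided filtering steps keep the
entry, and it relies on the file's existing `UniCase` import):

    #[test]
    fn process_merges_duplicate_typos() {
        // Hypothetical input: "TEH" duplicates "teh" and should be merged
        // case-insensitively, with its correction lowercased along the way.
        let dict = process([("teh", vec!["the"]), ("TEH", vec!["The"])]);
        let corrections = dict
            .get(&UniCase::new("teh".to_owned()))
            .expect("entry should survive processing");
        assert!(corrections.contains("the"));
    }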