mirror of
https://github.com/crate-ci/typos.git
synced 2024-12-23 08:02:15 -05:00
refactor: Make dict processing logic testable
Previously, all of the dictionary cleanup logic lived in the function `fn generate<W: std::io::Write>(file: &mut W, dict: &[u8])`, which parsed the provided buffer as CSV and also took care of writing the processed dictionary back as CSV. This commit factors out the CSV handling, leaving a `process` function behind so that it can be easily tested in the following commit.
This commit is contained in:
parent
2fffb1bb2b
commit
49a0eaab7b
1 changed file with 62 additions and 33 deletions
|
@@ -9,38 +9,76 @@ type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;
|
||||||
#[test]
|
#[test]
|
||||||
fn verify() {
|
fn verify() {
|
||||||
let asset_path = "assets/words.csv";
|
let asset_path = "assets/words.csv";
|
||||||
let data = std::fs::read(asset_path).unwrap();
|
let typos_dict = parse_dict(asset_path);
|
||||||
|
let new_dict = process(typos_dict);
|
||||||
|
|
||||||
let mut content = vec![];
|
let mut content = vec![];
|
||||||
generate(&mut content, &data);
|
|
||||||
|
let mut wtr = csv::WriterBuilder::new()
|
||||||
|
.flexible(true)
|
||||||
|
.from_writer(&mut content);
|
||||||
|
for (typo, corrections) in new_dict {
|
||||||
|
let mut row = vec![typo.as_str().to_owned()];
|
||||||
|
row.extend(corrections);
|
||||||
|
wtr.write_record(&row).unwrap();
|
||||||
|
}
|
||||||
|
wtr.flush().unwrap();
|
||||||
|
drop(wtr);
|
||||||
|
|
||||||
let content = String::from_utf8(content).unwrap();
|
let content = String::from_utf8(content).unwrap();
|
||||||
snapbox::assert_eq_path(asset_path, content);
|
snapbox::assert_eq_path(asset_path, content);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
fn parse_dict(path: &str) -> Vec<(String, Vec<String>)> {
|
||||||
let mut rows = Dict::new();
|
let data = std::fs::read(path).unwrap();
|
||||||
csv::ReaderBuilder::new()
|
|
||||||
|
let mut reader = csv::ReaderBuilder::new()
|
||||||
.has_headers(false)
|
.has_headers(false)
|
||||||
.flexible(true)
|
.flexible(true)
|
||||||
.from_reader(dict)
|
.from_reader(&*data);
|
||||||
.records()
|
|
||||||
.map(Result::unwrap)
|
|
||||||
.for_each(|r| {
|
|
||||||
let mut i = r.iter();
|
|
||||||
let mut typo = i.next().expect("typo").to_owned();
|
|
||||||
typo.make_ascii_lowercase();
|
|
||||||
let typo = UniCase::new(typo);
|
|
||||||
rows.entry(typo)
|
|
||||||
.or_insert_with(BTreeSet::new)
|
|
||||||
.extend(i.map(|c| {
|
|
||||||
let mut c = c.to_owned();
|
|
||||||
c.make_ascii_lowercase();
|
|
||||||
c
|
|
||||||
}));
|
|
||||||
});
|
|
||||||
|
|
||||||
let rows: Dict = rows
|
reader
|
||||||
|
.records()
|
||||||
|
.into_iter()
|
||||||
|
.map(Result::unwrap)
|
||||||
|
.map(|record| {
|
||||||
|
let mut iter = record.into_iter();
|
||||||
|
let typo = iter.next().expect("typo");
|
||||||
|
(
|
||||||
|
typo.to_owned(),
|
||||||
|
iter.map(ToOwned::to_owned).collect::<Vec<_>>(),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dict_from_iter<S: Into<String>>(
|
||||||
|
iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
|
||||||
|
) -> Dict {
|
||||||
|
let mut dict = Dict::new();
|
||||||
|
|
||||||
|
for (typo, corrections) in iter {
|
||||||
|
let typo = UniCase::new(typo.into().to_ascii_lowercase());
|
||||||
|
|
||||||
|
// duplicate entries are merged
|
||||||
|
dict.entry(typo)
|
||||||
|
.or_insert_with(BTreeSet::new)
|
||||||
|
.extend(corrections.into_iter().map(|c| {
|
||||||
|
let mut c = c.into();
|
||||||
|
c.make_ascii_lowercase();
|
||||||
|
c
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
dict
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process<S: Into<String>>(
|
||||||
|
iter: impl IntoIterator<Item = (S, impl IntoIterator<Item = S>)>,
|
||||||
|
) -> Dict {
|
||||||
|
let dict = dict_from_iter(iter);
|
||||||
|
|
||||||
|
let rows: Dict = dict
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter(|(t, _)| is_word(t))
|
.filter(|(t, _)| is_word(t))
|
||||||
.filter_map(|(t, c)| {
|
.filter_map(|(t, c)| {
|
||||||
|
@@ -95,8 +133,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
||||||
.iter()
|
.iter()
|
||||||
.flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
|
.flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
|
||||||
.collect();
|
.collect();
|
||||||
let rows: Vec<_> = dict
|
dict.into_iter()
|
||||||
.into_iter()
|
|
||||||
.filter(|(typo, _)| {
|
.filter(|(typo, _)| {
|
||||||
if let Some(correction) = corrections.get(typo.as_str()) {
|
if let Some(correction) = corrections.get(typo.as_str()) {
|
||||||
eprintln!("{typo} <-> {correction} cycle detected");
|
eprintln!("{typo} <-> {correction} cycle detected");
|
||||||
|
@@ -105,15 +142,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect()
|
||||||
|
|
||||||
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
|
|
||||||
for (typo, corrections) in rows {
|
|
||||||
let mut row = vec![typo.as_str().to_owned()];
|
|
||||||
row.extend(corrections);
|
|
||||||
wtr.write_record(&row).unwrap();
|
|
||||||
}
|
|
||||||
wtr.flush().unwrap();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_word(word: &str) -> bool {
|
fn is_word(word: &str) -> bool {
|
||||||
|
|
Loading…
Reference in a new issue