typos/crates/typos-dict/tests/verify.rs

use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::collections::HashSet;
use unicase::UniCase;

type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;

/// Snapshot test: regenerates the dictionary from `assets/words.csv` and
/// compares the regenerated text against that same file via
/// `snapbox::assert_eq_path`.
#[test]
fn verify() {
    const WORDS_PATH: &str = "assets/words.csv";

    let raw = std::fs::read(WORDS_PATH).unwrap();

    // Run the cleanup pipeline into an in-memory buffer.
    let mut regenerated = Vec::new();
    generate(&mut regenerated, &raw);
    let regenerated = String::from_utf8(regenerated).unwrap();

    snapbox::assert_eq_path(WORDS_PATH, regenerated);
}

fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
    let mut rows = Dict::new();
    csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(dict)
        .records()
        .map(Result::unwrap)
        .for_each(|r| {
            let mut i = r.iter();
            let mut typo = i.next().expect("typo").to_owned();
            typo.make_ascii_lowercase();
            let typo = UniCase::new(typo);
            rows.entry(typo)
                .or_insert_with(BTreeSet::new)
                .extend(i.map(|c| {
                    let mut c = c.to_owned();
                    c.make_ascii_lowercase();
                    c
                }));
        });

    let rows: Dict = rows
        .into_iter()
        .filter(|(t, _)| is_word(t))
        .filter_map(|(t, c)| {
            let new_c: BTreeSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
            if new_c.is_empty() {
                None
            } else {
                Some((t, new_c))
            }
        })
        .collect();

    let varcon_words = varcon_words();
    let allowed_words = allowed_words();
    let word_variants = proper_word_variants();
    let rows: Vec<_> = rows
        .into_iter()
        .filter(|(typo, _)| {
            let is_disallowed = varcon_words.contains(&unicase::UniCase::new(typo));
            if is_disallowed {
                eprintln!("{:?} is disallowed; in varcon", typo);
            }
            !is_disallowed
        })
        .filter(|(typo, _)| {
            if let Some(reason) = allowed_words.get(typo.as_ref()) {
                eprintln!("{:?} is disallowed; {}", typo, reason);
                false
            } else {
                true
            }
        })
        .map(|(typo, corrections)| {
            let mut new_corrections = BTreeSet::new();
            for correction in corrections {
                let correction = word_variants
                    .get(correction.as_str())
                    .and_then(|words| find_best_match(&typo, correction.as_str(), words))
                    .unwrap_or(&correction);
                new_corrections.insert(correction.to_owned());
            }
            (typo, new_corrections)
        })
        .collect();
    let mut dict = Dict::new();
    for (bad, good) in rows {
        let current = dict.entry(bad).or_default();
        current.extend(good);
    }

    let corrections: HashMap<_, _> = dict
        .iter()
        .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
        .collect();
    let rows: Vec<_> = dict
        .into_iter()
        .filter(|(typo, _)| {
            if let Some(correction) = corrections.get(typo.as_str()) {
                eprintln!("{typo} <-> {correction} cycle detected");
                false
            } else {
                true
            }
        })
        .collect();

    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
    for (typo, corrections) in rows {
        let mut row = vec![typo.as_str().to_owned()];
        row.extend(corrections);
        wtr.write_record(&row).unwrap();
    }
    wtr.flush().unwrap();
}

/// Returns `true` when every character of `word` is alphabetic.
///
/// The empty string has no offending character and therefore yields `true`.
fn is_word(word: &str) -> bool {
    word.chars().all(char::is_alphabetic)
}

/// Collects the word of every variant of every varcon entry into a
/// case-insensitive set.
fn varcon_words() -> HashSet<unicase::UniCase<&'static str>> {
    // Even include improper ones because we should be letting varcon handle that rather than our
    // dictionary
    let mut known = HashSet::new();
    for cluster in varcon::VARCON.iter() {
        for entry in cluster.entries.iter() {
            for variant in entry.variants.iter() {
                known.insert(unicase::UniCase::new(variant.word));
            }
        }
    }
    known
}

/// Maps each varcon word to the other words that share an entry with it.
///
/// Within an entry, only variants that have at least one type whose tag is
/// not `Improper` take part. A word never appears in its own set.
fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
    let mut siblings_by_word: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
    for cluster in varcon::VARCON.iter() {
        for entry in cluster.entries.iter() {
            let proper: HashSet<&'static str> = entry
                .variants
                .iter()
                .filter(|v| v.types.iter().any(|t| t.tag != Some(varcon::Tag::Improper)))
                .map(|v| v.word)
                .collect();
            for &word in &proper {
                siblings_by_word
                    .entry(word)
                    .or_default()
                    .extend(proper.iter().copied().filter(|other| *other != word));
            }
        }
    }
    siblings_by_word
}

/// Picks the variant in `word_variants` that is strictly closer to `typo`
/// (by edit distance) than `correction` is.
///
/// Returns the candidate with the smallest distance, ties broken by string
/// ordering, or `None` when no variant beats `correction`.
///
/// # Panics
///
/// Panics if `correction` is itself contained in `word_variants`.
fn find_best_match<'c>(
    typo: &'c str,
    correction: &'c str,
    word_variants: &HashSet<&'static str>,
) -> Option<&'c str> {
    assert!(!word_variants.contains(correction));
    let baseline = edit_distance::edit_distance(typo, correction);
    word_variants
        .iter()
        .filter_map(|candidate| {
            let distance = edit_distance::edit_distance(typo, candidate);
            if distance < baseline {
                Some((distance, *candidate))
            } else {
                None
            }
        })
        // Tuples order by distance first, then by the word itself.
        .min()
        .map(|(_, candidate)| candidate)
}

/// Loads `assets/allowed.csv` as a map from ASCII-lowercased word (first
/// field) to reason (second field).
///
/// When a word occurs more than once, the last record wins.
///
/// # Panics
///
/// Panics if the file cannot be read, a record is malformed, or a record has
/// fewer than two fields.
fn allowed_words() -> std::collections::HashMap<String, String> {
    let raw = std::fs::read("assets/allowed.csv").unwrap();
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(raw.as_slice());

    let mut allowed = std::collections::HashMap::new();
    for record in reader.records() {
        let record = record.unwrap();
        let mut fields = record.iter();
        let word = fields.next().expect("typo").to_ascii_lowercase();
        let reason = fields.next().expect("reason").to_owned();
        allowed.insert(word, reason);
    }
    allowed
}
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`use std::collections::BTreeMap;`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`use std::collections::BTreeSet;`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`use std::collections::HashMap;`
			`use std::collections::HashSet;`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`use unicase::UniCase;`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00
test: Move codegen to tests 2022-08-01 15:45:58 -04:00			`#[test]`
			`fn verify() {`
			`let asset_path = "assets/words.csv";`
			`let data = std::fs::read(asset_path).unwrap();`

			`let mut content = vec![];`
			`generate(&mut content, &data);`

			`let content = String::from_utf8(content).unwrap();`
			`snapbox::assert_eq_path(asset_path, content);`
			`}`

feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`let mut rows = Dict::new();`
			`csv::ReaderBuilder::new()`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`.has_headers(false)`
			`.flexible(true)`
			`.from_reader(dict)`
			`.records()`
			`.map(Result::unwrap)`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`.for_each(\|r\| {`
			`let mut i = r.iter();`
test: Ensure words are stored lowercase 2021-07-27 15:11:52 -04:00			`let mut typo = i.next().expect("typo").to_owned();`
			`typo.make_ascii_lowercase();`
			`let typo = UniCase::new(typo);`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`rows.entry(typo)`
			`.or_insert_with(BTreeSet::new)`
			`.extend(i.map(\|c\| {`
			`let mut c = c.to_owned();`
			`c.make_ascii_lowercase();`
			`c`
			`}));`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`});`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00
feat(dict): Add more corrections 2021-07-27 15:40:34 -04:00			`let rows: Dict = rows`
			`.into_iter()`
			`.filter(\|(t, _)\| is_word(t))`
			`.filter_map(\|(t, c)\| {`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`let new_c: BTreeSet<_> = c.into_iter().filter(\|c\| is_word(c)).collect();`
feat(dict): Add more corrections 2021-07-27 15:40:34 -04:00			`if new_c.is_empty() {`
			`None`
			`} else {`
			`Some((t, new_c))`
			`}`
			`})`
			`.collect();`

fix(dict): Remove nilable See conversation in #613 2022-12-06 11:47:08 -05:00			`let varcon_words = varcon_words();`
			`let allowed_words = allowed_words();`
refactor(varcon): Clarify check's meanings 2021-05-15 20:29:27 -04:00			`let word_variants = proper_word_variants();`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`let rows: Vec<_> = rows`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`.into_iter()`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`.filter(\|(typo, _)\| {`
fix(dict): Remove nilable See conversation in #613 2022-12-06 11:47:08 -05:00			`let is_disallowed = varcon_words.contains(&unicase::UniCase::new(typo));`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`if is_disallowed {`
fix(dict): Remove nilable See conversation in #613 2022-12-06 11:47:08 -05:00			`eprintln!("{:?} is disallowed; in varcon", typo);`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`}`
			`!is_disallowed`
			`})`
fix(dict): Remove nilable See conversation in #613 2022-12-06 11:47:08 -05:00			`.filter(\|(typo, _)\| {`
			`if let Some(reason) = allowed_words.get(typo.as_ref()) {`
			`eprintln!("{:?} is disallowed; {}", typo, reason);`
			`false`
			`} else {`
			`true`
			`}`
			`})`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`.map(\|(typo, corrections)\| {`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`let mut new_corrections = BTreeSet::new();`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`for correction in corrections {`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`let correction = word_variants`
			`.get(correction.as_str())`
			`.and_then(\|words\| find_best_match(&typo, correction.as_str(), words))`
			`.unwrap_or(&correction);`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`new_corrections.insert(correction.to_owned());`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`}`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`(typo, new_corrections)`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`})`
			`.collect();`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`let mut dict = Dict::new();`
			`for (bad, good) in rows {`
			`let current = dict.entry(bad).or_default();`
			`current.extend(good);`
			`}`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00
test(dict): Report more cases to user 2023-06-08 10:23:10 -04:00			`let corrections: HashMap<_, _> = dict`
			`.iter()`
			`.flat_map(\|(bad, good)\| good.iter().map(\|good\| (good.to_owned(), bad.to_owned())))`
			`.collect();`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`let rows: Vec<_> = dict`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`.into_iter()`
test(dict): Report more cases to user 2023-06-08 10:23:10 -04:00			`.filter(\|(typo, _)\| {`
			`if let Some(correction) = corrections.get(typo.as_str()) {`
			`eprintln!("{typo} <-> {correction} cycle detected");`
			`false`
			`} else {`
			`true`
			`}`
			`})`
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`.collect();`
refactor(dict): Allow 0..n corrections in BuiltIn The main use case is taking `ther` -> `there` and adding `the` and `their`. 2021-05-15 20:06:04 -04:00
test: Prevent correcting corrections 2021-07-27 14:15:12 -04:00			`let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);`
test: Ensure words.csv stays sorted 2021-07-27 15:09:51 -04:00			`for (typo, corrections) in rows {`
chore(dict): Automate more cleanup 2023-06-08 09:54:36 -04:00			`let mut row = vec![typo.as_str().to_owned()];`
			`row.extend(corrections);`
refactor(dict): Allow 0..n corrections in BuiltIn The main use case is taking `ther` -> `there` and adding `the` and `their`. 2021-05-15 20:06:04 -04:00			`wtr.write_record(&row).unwrap();`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`}`
			`wtr.flush().unwrap();`
			`}`

feat(dict): Add more corrections 2021-07-27 15:40:34 -04:00			`fn is_word(word: &str) -> bool {`
			`word.chars().all(\|c\| c.is_alphabetic())`
			`}`

refactor(varcon): Clarify check's meanings 2021-05-15 20:29:27 -04:00			`fn varcon_words() -> HashSet<unicase::UniCase<&'static str>> {`
			`// Even include improper ones because we should be letting varcon handle that rather than our`
			`// dictionary`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`varcon::VARCON`
			`.iter()`
			`.flat_map(\|c\| c.entries.iter())`
			`.flat_map(\|e\| e.variants.iter())`
			`.map(\|v\| unicase::UniCase::new(v.word))`
			`.collect()`
			`}`

refactor(varcon): Clarify check's meanings 2021-05-15 20:29:27 -04:00			`fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();`
			`for entry in varcon::VARCON.iter().flat_map(\|c\| c.entries.iter()) {`
			`let variants: HashSet<_> = entry`
			`.variants`
			`.iter()`
			`.filter(\|v\| v.types.iter().any(\|t\| t.tag != Some(varcon::Tag::Improper)))`
			`.map(\|v\| v.word)`
			`.collect();`
			`for variant in variants.iter() {`
			`let set = words.entry(variant).or_insert_with(HashSet::new);`
			`set.extend(variants.iter().filter(\|v\| *v != variant));`
			`}`
			`}`
			`words`
			`}`

			`fn find_best_match<'c>(`
			`typo: &'c str,`
			`correction: &'c str,`
refactor(varcon): Clarify check's meanings 2021-05-15 20:29:27 -04:00			`word_variants: &HashSet<&'static str>,`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`) -> Option<&'c str> {`
refactor(varcon): Clarify check's meanings 2021-05-15 20:29:27 -04:00			`assert!(!word_variants.contains(correction));`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`let current = edit_distance::edit_distance(typo, correction);`
refactor(varcon): Clarify check's meanings 2021-05-15 20:29:27 -04:00			`let mut matches: Vec<_> = word_variants`
feat: Support english dialects The goal is to be as accepting and unobtrusive to new code bases as possible. To this end, we correct typos into the closest english dialect. If someone wants to opt-in, they can have typos correct to a specific english dialect. Fixes #52 Fixes #22 2020-05-27 21:46:41 -04:00			`.iter()`
			`.map(\|r\| (edit_distance::edit_distance(typo, r), *r))`
			`.filter(\|(d, _)\| *d < current)`
			`.collect();`
			`matches.sort_unstable();`
			`matches.into_iter().next().map(\|(_, r)\| r)`
			`}`
fix(dict): Remove nilable See conversation in #613 2022-12-06 11:47:08 -05:00
			`fn allowed_words() -> std::collections::HashMap<String, String> {`
			`let allowed_path = "assets/allowed.csv";`
			`let data = std::fs::read(allowed_path).unwrap();`
			`csv::ReaderBuilder::new()`
			`.has_headers(false)`
			`.flexible(true)`
			`.from_reader(data.as_slice())`
			`.records()`
			`.map(Result::unwrap)`
			`.map(\|r\| {`
			`let mut i = r.iter();`
			`let mut typo = i.next().expect("typo").to_owned();`
			`typo.make_ascii_lowercase();`
			`let reason = i.next().expect("reason").to_owned();`
			`(typo, reason)`
			`})`
			`.collect()`
			`}`