Merge pull request #251 from epage/vars

fix(dict): Correctly connect dict with varcon
This commit is contained in:
Ed Page 2021-05-17 21:51:13 -05:00 committed by GitHub
commit 444d2cca91
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 87 additions and 10 deletions

View file

@ -6,8 +6,8 @@ use structopt::StructOpt;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut wtr = csv::Writer::from_writer(file); let mut wtr = csv::Writer::from_writer(file);
let disallowed_typos = disallowed_typos(); let disallowed_typos = varcon_words();
let related_words = related_words(); let word_variants = proper_word_variants();
let mut reader = csv::ReaderBuilder::new() let mut reader = csv::ReaderBuilder::new()
.has_headers(false) .has_headers(false)
@ -19,7 +19,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
if disallowed_typos.contains(&unicase::UniCase::new(typo)) { if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
continue; continue;
} }
let correction = related_words let correction = word_variants
.get(correction) .get(correction)
.and_then(|words| find_best_match(typo, correction, words)) .and_then(|words| find_best_match(typo, correction, words))
.unwrap_or(correction); .unwrap_or(correction);
@ -28,7 +28,9 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
wtr.flush().unwrap(); wtr.flush().unwrap();
} }
fn disallowed_typos() -> HashSet<unicase::UniCase<&'static str>> { fn varcon_words() -> HashSet<unicase::UniCase<&'static str>> {
// Even include improper ones because we should be letting varcon handle that rather than our
// dictionary
varcon::VARCON varcon::VARCON
.iter() .iter()
.flat_map(|c| c.entries.iter()) .flat_map(|c| c.entries.iter())
@ -37,7 +39,7 @@ fn disallowed_typos() -> HashSet<unicase::UniCase<&'static str>> {
.collect() .collect()
} }
fn related_words() -> HashMap<&'static str, HashSet<&'static str>> { fn proper_word_variants() -> HashMap<&'static str, HashSet<&'static str>> {
let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new(); let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();
for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) { for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) {
let variants: HashSet<_> = entry let variants: HashSet<_> = entry
@ -57,11 +59,11 @@ fn related_words() -> HashMap<&'static str, HashSet<&'static str>> {
fn find_best_match<'c>( fn find_best_match<'c>(
typo: &'c str, typo: &'c str,
correction: &'c str, correction: &'c str,
related_words: &HashSet<&'static str>, word_variants: &HashSet<&'static str>,
) -> Option<&'c str> { ) -> Option<&'c str> {
assert!(!related_words.contains(correction)); assert!(!word_variants.contains(correction));
let current = edit_distance::edit_distance(typo, correction); let current = edit_distance::edit_distance(typo, correction);
let mut matches: Vec<_> = related_words let mut matches: Vec<_> = word_variants
.iter() .iter()
.map(|r| (edit_distance::edit_distance(typo, r), *r)) .map(|r| (edit_distance::edit_distance(typo, r), *r))
.filter(|(d, _)| *d < current) .filter(|(d, _)| *d < current)

View file

@ -35,8 +35,14 @@ impl BuiltIn {
let word = word_token.token(); let word = word_token.token();
let mut corrections = if let Some(correction) = self.correct_with_dict(word) { let mut corrections = if let Some(correction) = self.correct_with_dict(word) {
self.correct_with_vars(word) match self.correct_with_vars(correction) {
.unwrap_or_else(|| Status::Corrections(vec![Cow::Borrowed(correction)])) Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]),
Some(correction @ Status::Corrections(_)) => correction,
Some(Status::Invalid) => {
unreachable!("correct_with_vars should always have valid suggestions")
}
None => Status::Corrections(vec![Cow::Borrowed(correction)]),
}
} else { } else {
self.correct_with_vars(word)? self.correct_with_vars(word)?
}; };
@ -244,6 +250,75 @@ impl<'i, 'w, D: typos::Dictionary> typos::Dictionary for Override<'i, 'w, D> {
mod test { mod test {
use super::*; use super::*;
// With the default locale, the misspelling `finallizes` is expected to be
// corrected to `finalizes`.
#[cfg(feature = "dict")]
#[test]
fn test_dict_correct() {
    let builtin = BuiltIn::new(crate::config::Locale::default());
    let misspelled =
        typos::tokens::Word::new_unchecked("finallizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(misspelled);

    let expected = Some(Status::Corrections(vec!["finalizes".into()]));
    assert_eq!(actual, expected);
}
// Under the generic `En` locale, `finalizes` is expected to be reported as
// valid, with no correction suggested.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_no_locale() {
    let builtin = BuiltIn::new(crate::config::Locale::En);
    let word = typos::tokens::Word::new_unchecked("finalizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    assert_eq!(actual, Some(Status::Valid));
}
// Under the `EnUs` locale, `finalizes` is expected to be reported as valid.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_same_locale() {
    let builtin = BuiltIn::new(crate::config::Locale::EnUs);
    let word = typos::tokens::Word::new_unchecked("finalizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    assert_eq!(actual, Some(Status::Valid));
}
// Under the `EnGb` locale, `finalizes` is expected to be corrected to the
// variant `finalises`.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_different_locale() {
    let builtin = BuiltIn::new(crate::config::Locale::EnGb);
    let word = typos::tokens::Word::new_unchecked("finalizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(word);

    let expected = Some(Status::Corrections(vec!["finalises".into()]));
    assert_eq!(actual, expected);
}
// With both features enabled and the `EnGb` locale, the misspelling
// `finallizes` is expected to end up as the variant `finalises` rather than
// `finalizes`.
#[cfg(all(feature = "dict", feature = "vars"))]
#[test]
fn test_dict_to_varcon() {
    let builtin = BuiltIn::new(crate::config::Locale::EnGb);
    let misspelled =
        typos::tokens::Word::new_unchecked("finallizes", typos::tokens::Case::Lower, 0);

    let actual = builtin.correct_word(misspelled);

    let expected = Some(Status::Corrections(vec!["finalises".into()]));
    assert_eq!(actual, expected);
}
#[test] #[test]
fn test_case_correct() { fn test_case_correct() {
let cases = [ let cases = [