refactor(dict): Allow 0..n corrections in BuiltIn

The main use case is extending the `ther` -> `there` entry so that it can
also suggest `the` and `their`.
This commit is contained in:
Ed Page 2021-05-15 19:06:04 -05:00
parent 444d2cca91
commit fb0dac4297
6 changed files with 33747 additions and 33668 deletions

1
Cargo.lock generated
View file

@ -1526,6 +1526,7 @@ version = "1.2.0"
dependencies = [ dependencies = [
"codegenrs", "codegenrs",
"csv", "csv",
"itertools 0.10.0",
"phf", "phf",
"phf_codegen", "phf_codegen",
"structopt", "structopt",

View file

@ -18,6 +18,7 @@ codecov = { repository = "crate-ci/typos" }
phf = { version = "0.8", features = ["unicase"] } phf = { version = "0.8", features = ["unicase"] }
phf_codegen = "0.8" phf_codegen = "0.8"
csv = "1.1" csv = "1.1"
itertools = "0.10"
unicase = "2.5" unicase = "2.5"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"

View file

@ -18,20 +18,26 @@ fn generate<W: std::io::Write>(file: &mut W) {
writeln!( writeln!(
file, file,
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = " "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
) )
.unwrap(); .unwrap();
let mut builder = phf_codegen::Map::new(); let mut builder = phf_codegen::Map::new();
let records: Vec<_> = csv::ReaderBuilder::new() let records: Vec<_> = csv::ReaderBuilder::new()
.has_headers(false) .has_headers(false)
.flexible(true)
.from_reader(DICT) .from_reader(DICT)
.records() .records()
.map(|r| r.unwrap()) .map(|r| r.unwrap())
.collect(); .collect();
for record in &records { for record in &records {
smallest = std::cmp::min(smallest, record[0].len()); let mut record_fields = record.iter();
largest = std::cmp::max(largest, record[0].len()); let key = record_fields.next().unwrap();
let value = format!(r#""{}""#, &record[1]); smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let value = format!(
"&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
);
builder.entry(unicase::UniCase::new(&record[0]), &value); builder.entry(unicase::UniCase::new(&record[0]), &value);
} }
let codegenned = builder.build(); let codegenned = builder.build();

File diff suppressed because it is too large Load diff

View file

@ -4,26 +4,32 @@ use std::collections::HashSet;
use structopt::StructOpt; use structopt::StructOpt;
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) { fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut wtr = csv::Writer::from_writer(file); let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
let disallowed_typos = varcon_words(); let disallowed_typos = varcon_words();
let word_variants = proper_word_variants(); let word_variants = proper_word_variants();
let mut reader = csv::ReaderBuilder::new() let mut reader = csv::ReaderBuilder::new()
.has_headers(false) .has_headers(false)
.flexible(true)
.from_reader(dict); .from_reader(dict);
for record in reader.records() { for record in reader.records() {
let record = record.unwrap(); let record = record.unwrap();
let typo = &record[0]; let mut record_fields = record.iter();
let correction = &record[1]; let typo = record_fields.next().unwrap();
if disallowed_typos.contains(&unicase::UniCase::new(typo)) { if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
continue; continue;
} }
let mut row = vec![typo];
for correction in record_fields {
let correction = word_variants let correction = word_variants
.get(correction) .get(correction)
.and_then(|words| find_best_match(typo, correction, words)) .and_then(|words| find_best_match(typo, correction, words))
.unwrap_or(correction); .unwrap_or(correction);
wtr.write_record(&[typo, correction]).unwrap(); row.push(correction);
}
wtr.write_record(&row).unwrap();
} }
wtr.flush().unwrap(); wtr.flush().unwrap();
} }

View file

@ -34,14 +34,11 @@ impl BuiltIn {
} }
let word = word_token.token(); let word = word_token.token();
let mut corrections = if let Some(correction) = self.correct_with_dict(word) { let mut corrections = if let Some(corrections) = self.correct_with_dict(word) {
match self.correct_with_vars(correction) { if corrections.is_empty() {
Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]), Status::Invalid
Some(correction @ Status::Corrections(_)) => correction, } else {
Some(Status::Invalid) => { self.chain_with_vars(corrections)?
unreachable!("correct_with_vars should always have valid suggestions")
}
None => Status::Corrections(vec![Cow::Borrowed(correction)]),
} }
} else { } else {
self.correct_with_vars(word)? self.correct_with_vars(word)?
@ -54,7 +51,7 @@ impl BuiltIn {
#[cfg(feature = "dict")] #[cfg(feature = "dict")]
// Not using `Status` to avoid the allocations // Not using `Status` to avoid the allocations
fn correct_with_dict(&self, word: &str) -> Option<&'static str> { fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> {
if typos_dict::WORD_RANGE.contains(&word.len()) { if typos_dict::WORD_RANGE.contains(&word.len()) {
map_lookup(&typos_dict::WORD_DICTIONARY, word) map_lookup(&typos_dict::WORD_DICTIONARY, word)
} else { } else {
@ -63,10 +60,35 @@ impl BuiltIn {
} }
#[cfg(not(feature = "dict"))] #[cfg(not(feature = "dict"))]
fn correct_with_dict(&self, _word: &str) -> Option<&'static str> { fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> {
None None
} }
/// Expands each dictionary correction through `correct_with_vars` and gathers
/// the results into a single `Status::Corrections`.
///
/// A correction for which `correct_with_vars` returns `Status::Valid` or `None`
/// is kept as-is; one that yields `Status::Corrections` is replaced by those
/// corrections. Unless exactly one suggestion results, the list is sorted and
/// de-duplicated.
#[cfg(feature = "vars")]
fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Option<Status<'static>> {
    let mut expanded: Vec<Cow<'static, str>> = Vec::new();
    for &candidate in corrections {
        match self.correct_with_vars(candidate) {
            // The lookup supplied its own suggestions: use them instead of the candidate.
            Some(Status::Corrections(variants)) => expanded.extend(variants),
            // Nothing further to suggest: keep the dictionary correction itself.
            Some(Status::Valid) | None => expanded.push(Cow::Borrowed(candidate)),
            Some(Status::Invalid) => {
                unreachable!("correct_with_vars should always have valid suggestions")
            }
        }
    }
    // A single suggestion needs neither ordering nor de-duplication.
    if expanded.len() != 1 {
        expanded.sort_unstable();
        expanded.dedup();
    }
    debug_assert!(!expanded.is_empty());
    Some(Status::Corrections(expanded))
}
#[cfg(not(feature = "vars"))]
fn chain_with_vars(&self, corrections: &[&str]) -> Option<Status<'static>> {
Status::Corrections(corrections.map(|c| Cow::Borrowed(correction).collect()))
}
#[cfg(feature = "vars")] #[cfg(feature = "vars")]
fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> { fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
if typos_vars::WORD_RANGE.contains(&word.len()) { if typos_vars::WORD_RANGE.contains(&word.len()) {