Merge pull request #252 from epage/dict

fix(dict): Handle cases from Linux
Ed Page, 2021-05-18 13:13:58 -05:00, committed by GitHub
commit e6c595c585
7 changed files with 33748 additions and 33671 deletions
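
The change below gives each dictionary entry a list of candidate corrections instead of a single word, and drops entries (such as hardlinked,hardline) whose "typo" is presumably hit as a legitimate word when checking the Linux sources. A minimal sketch of the new data shape, using a plain HashMap as a stand-in for the generated phf map and entries taken from the CSV hunks below (illustrative only, not part of the diff):

use std::collections::HashMap;

// Stand-in entries mirroring rows changed in the CSV hunks below.
const REFERNCE: &[&str] = &["reference"];         // previously a single correction
const THER: &[&str] = &["there", "their", "the"]; // now several candidates

fn main() {
    // Stand-in for the generated `WORD_DICTIONARY`: typo -> slice of candidate corrections.
    let mut dict: HashMap<&str, &[&str]> = HashMap::new();
    dict.insert("refernce", REFERNCE);
    dict.insert("ther", THER);
    // "hardlinked,hardline" is removed entirely: "hardlinked" is a real word in Linux sources.

    for word in ["ther", "refernce", "hardlinked"] {
        match dict.get(word) {
            Some(corrections) => println!("{} -> {:?}", word, corrections),
            None => println!("{} is no longer treated as a typo", word),
        }
    }
}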

Cargo.lock (generated)

@@ -1526,6 +1526,7 @@ version = "1.2.0"
 dependencies = [
  "codegenrs",
  "csv",
+ "itertools 0.10.0",
  "phf",
  "phf_codegen",
  "structopt",


@@ -13021,7 +13021,6 @@ handelbars,handlebars
 handicaped,handicapped
 handwritng,handwriting
 harasments,harassments
-hardlinked,hardline
 harmoniacs,harmonic
 harmonisch,harmonic
 harrasment,harassment
@@ -23557,7 +23556,7 @@ referens,references
 referere,referee
 referign,refering
 refering,referring
-refernce,references
+refernce,reference
 reffered,referred
 refilles,refills
 refillls,refills
@@ -27976,7 +27975,7 @@ tast,taste
 tath,that
 tehy,they
 tghe,the
-ther,there
+ther,there,their,the
 thge,the
 thna,than
 thne,then

Can't render this file because it is too large.
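
With corrections no longer limited to one per row, the CSV above now has rows of varying width (e.g. ther,there,their,the). A small sketch of how such rows parse with the csv crate's flexible mode, mirroring the reader configuration used in the code changes below (illustrative, not part of the diff):

fn main() {
    let rows = "refernce,reference\nther,there,their,the\n";
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true) // rows may have 2, 3, 4, ... fields
        .from_reader(rows.as_bytes());
    for record in reader.records() {
        let record = record.unwrap();
        let mut fields = record.iter();
        let typo = fields.next().unwrap();
        let corrections: Vec<&str> = fields.collect();
        println!("{} -> {:?}", typo, corrections);
    }
}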


@@ -18,6 +18,7 @@ codecov = { repository = "crate-ci/typos" }
 phf = { version = "0.8", features = ["unicase"] }
 phf_codegen = "0.8"
 csv = "1.1"
+itertools = "0.10"
 unicase = "2.5"
 codegenrs = "1.0"
 structopt = "0.3"
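
The new itertools dependency is used in the codegen change below only for itertools::join, which turns the remaining CSV fields into the generated slice literal. A minimal demonstration of that helper (assumes itertools = "0.10" as added here; not part of the diff):

fn main() {
    let fields = ["there", "their", "the"];
    let value = format!(
        "&[{}]",
        itertools::join(fields.iter().map(|f| format!(r#""{}""#, f)), ", ")
    );
    assert_eq!(value, r#"&["there", "their", "the"]"#);
    println!("{}", value);
}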


@@ -18,20 +18,26 @@ fn generate<W: std::io::Write>(file: &mut W) {
     writeln!(
         file,
-        "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
+        "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
     )
     .unwrap();
 
     let mut builder = phf_codegen::Map::new();
     let records: Vec<_> = csv::ReaderBuilder::new()
         .has_headers(false)
+        .flexible(true)
         .from_reader(DICT)
         .records()
         .map(|r| r.unwrap())
         .collect();
     for record in &records {
-        smallest = std::cmp::min(smallest, record[0].len());
-        largest = std::cmp::max(largest, record[0].len());
-        let value = format!(r#""{}""#, &record[1]);
+        let mut record_fields = record.iter();
+        let key = record_fields.next().unwrap();
+        smallest = std::cmp::min(smallest, key.len());
+        largest = std::cmp::max(largest, key.len());
+        let value = format!(
+            "&[{}]",
+            itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
+        );
         builder.entry(unicase::UniCase::new(&record[0]), &value);
     }
     let codegenned = builder.build();
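
For a single hand-written record, the builder above would emit an entry like the one sketched here. This standalone sketch drives phf_codegen directly and assumes the dependency setup from the Cargo.toml hunk above (phf with the "unicase" feature, phf_codegen 0.8, unicase 2.5); it is illustrative, not the project's build script:

fn main() {
    let mut builder = phf_codegen::Map::new();
    // The value string is the Rust expression the generated file will contain.
    builder.entry(unicase::UniCase::new("ther"), r#"&["there", "their", "the"]"#);
    println!(
        "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = {};",
        builder.build()
    );
}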

File diff suppressed because it is too large.


@@ -4,26 +4,32 @@ use std::collections::HashSet;
 use structopt::StructOpt;
 
 fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
-    let mut wtr = csv::Writer::from_writer(file);
+    let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
     let disallowed_typos = varcon_words();
     let word_variants = proper_word_variants();
     let mut reader = csv::ReaderBuilder::new()
         .has_headers(false)
+        .flexible(true)
         .from_reader(dict);
     for record in reader.records() {
         let record = record.unwrap();
-        let typo = &record[0];
-        let correction = &record[1];
+        let mut record_fields = record.iter();
+        let typo = record_fields.next().unwrap();
         if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
             continue;
         }
-        let correction = word_variants
-            .get(correction)
-            .and_then(|words| find_best_match(typo, correction, words))
-            .unwrap_or(correction);
-        wtr.write_record(&[typo, correction]).unwrap();
+        let mut row = vec![typo];
+        for correction in record_fields {
+            let correction = word_variants
+                .get(correction)
+                .and_then(|words| find_best_match(typo, correction, words))
+                .unwrap_or(correction);
+            row.push(correction);
+        }
+        wtr.write_record(&row).unwrap();
     }
     wtr.flush().unwrap();
 }
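
The writer switches to WriterBuilder with flexible(true) because the rows it emits now differ in field count; the default csv writer rejects any record whose length differs from the first record written. A short sketch of that behaviour (illustrative, not part of the diff):

fn main() {
    let mut wtr = csv::WriterBuilder::new()
        .flexible(true) // without this, the second write_record below would error
        .from_writer(vec![]);
    wtr.write_record(&["refernce", "reference"]).unwrap();
    wtr.write_record(&["ther", "there", "their", "the"]).unwrap();
    wtr.flush().unwrap();
    let out = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
    print!("{}", out);
}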


@@ -34,14 +34,11 @@ impl BuiltIn {
         }
 
         let word = word_token.token();
-        let mut corrections = if let Some(correction) = self.correct_with_dict(word) {
-            match self.correct_with_vars(correction) {
-                Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]),
-                Some(correction @ Status::Corrections(_)) => correction,
-                Some(Status::Invalid) => {
-                    unreachable!("correct_with_vars should always have valid suggestions")
-                }
-                None => Status::Corrections(vec![Cow::Borrowed(correction)]),
+        let mut corrections = if let Some(corrections) = self.correct_with_dict(word) {
+            if corrections.is_empty() {
+                Status::Invalid
+            } else {
+                self.chain_with_vars(corrections)
             }
         } else {
             self.correct_with_vars(word)?
@@ -54,7 +51,7 @@ impl BuiltIn {
 
     #[cfg(feature = "dict")]
     // Not using `Status` to avoid the allocations
-    fn correct_with_dict(&self, word: &str) -> Option<&'static str> {
+    fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> {
         if typos_dict::WORD_RANGE.contains(&word.len()) {
             map_lookup(&typos_dict::WORD_DICTIONARY, word)
         } else {
@@ -63,10 +60,35 @@
     }
 
     #[cfg(not(feature = "dict"))]
-    fn correct_with_dict(&self, _word: &str) -> Option<&'static str> {
+    fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> {
         None
     }
 
+    #[cfg(feature = "vars")]
+    fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Status<'static> {
+        let mut chained: Vec<_> = corrections
+            .iter()
+            .flat_map(|c| match self.correct_with_vars(c) {
+                Some(Status::Valid) | None => vec![Cow::Borrowed(*c)],
+                Some(Status::Corrections(vars)) => vars,
+                Some(Status::Invalid) => {
+                    unreachable!("correct_with_vars should always have valid suggestions")
+                }
+            })
+            .collect();
+        if chained.len() != 1 {
+            chained.sort_unstable();
+            chained.dedup();
+        }
+        debug_assert!(!chained.is_empty());
+        Status::Corrections(chained)
+    }
+
+    #[cfg(not(feature = "vars"))]
+    fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Status<'static> {
+        Status::Corrections(corrections.iter().map(|c| Cow::Borrowed(*c)).collect())
+    }
+
     #[cfg(feature = "vars")]
     fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
         if typos_vars::WORD_RANGE.contains(&word.len()) {
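
The new chain_with_vars runs every dictionary candidate through the variant lookup and deduplicates the combined list. A standalone mimic of that flow with a stubbed, hypothetical correct_with_vars (the real method is feature-gated and returns Status/Cow values; this is illustrative only):

// Pretend variant lookup: only "there" is known to the variants table in this stub.
fn correct_with_vars(word: &str) -> Option<Vec<&'static str>> {
    match word {
        "there" => Some(vec!["there"]),
        _ => None,
    }
}

fn chain_with_vars(corrections: &[&'static str]) -> Vec<&'static str> {
    let mut chained: Vec<_> = corrections
        .iter()
        .flat_map(|c| correct_with_vars(c).unwrap_or_else(|| vec![*c]))
        .collect();
    if chained.len() != 1 {
        chained.sort_unstable();
        chained.dedup();
    }
    chained
}

fn main() {
    // With the new dictionary, "ther" maps to ["there", "their", "the"].
    println!("{:?}", chain_with_vars(&["there", "their", "the"]));
}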