mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-28 12:01:06 -05:00
refactor(dict): Allow 0..n corrections in BuiltIn
The main use case is taking `ther` -> `there` and adding `the` and `their`.
This commit is contained in:
parent
444d2cca91
commit
fb0dac4297
6 changed files with 33747 additions and 33668 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1526,6 +1526,7 @@ version = "1.2.0"
|
|||
dependencies = [
|
||||
"codegenrs",
|
||||
"csv",
|
||||
"itertools 0.10.0",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"structopt",
|
||||
|
|
|
@ -18,6 +18,7 @@ codecov = { repository = "crate-ci/typos" }
|
|||
phf = { version = "0.8", features = ["unicase"] }
|
||||
phf_codegen = "0.8"
|
||||
csv = "1.1"
|
||||
itertools = "0.10"
|
||||
unicase = "2.5"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
|
|
|
@ -18,20 +18,26 @@ fn generate<W: std::io::Write>(file: &mut W) {
|
|||
|
||||
writeln!(
|
||||
file,
|
||||
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
|
||||
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
|
||||
)
|
||||
.unwrap();
|
||||
let mut builder = phf_codegen::Map::new();
|
||||
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||
.has_headers(false)
|
||||
.flexible(true)
|
||||
.from_reader(DICT)
|
||||
.records()
|
||||
.map(|r| r.unwrap())
|
||||
.collect();
|
||||
for record in &records {
|
||||
smallest = std::cmp::min(smallest, record[0].len());
|
||||
largest = std::cmp::max(largest, record[0].len());
|
||||
let value = format!(r#""{}""#, &record[1]);
|
||||
let mut record_fields = record.iter();
|
||||
let key = record_fields.next().unwrap();
|
||||
smallest = std::cmp::min(smallest, key.len());
|
||||
largest = std::cmp::max(largest, key.len());
|
||||
let value = format!(
|
||||
"&[{}]",
|
||||
itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
|
||||
);
|
||||
builder.entry(unicase::UniCase::new(&record[0]), &value);
|
||||
}
|
||||
let codegenned = builder.build();
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -4,26 +4,32 @@ use std::collections::HashSet;
|
|||
use structopt::StructOpt;
|
||||
|
||||
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
||||
let mut wtr = csv::Writer::from_writer(file);
|
||||
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
|
||||
|
||||
let disallowed_typos = varcon_words();
|
||||
let word_variants = proper_word_variants();
|
||||
|
||||
let mut reader = csv::ReaderBuilder::new()
|
||||
.has_headers(false)
|
||||
.flexible(true)
|
||||
.from_reader(dict);
|
||||
for record in reader.records() {
|
||||
let record = record.unwrap();
|
||||
let typo = &record[0];
|
||||
let correction = &record[1];
|
||||
let mut record_fields = record.iter();
|
||||
let typo = record_fields.next().unwrap();
|
||||
if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut row = vec![typo];
|
||||
for correction in record_fields {
|
||||
let correction = word_variants
|
||||
.get(correction)
|
||||
.and_then(|words| find_best_match(typo, correction, words))
|
||||
.unwrap_or(correction);
|
||||
wtr.write_record(&[typo, correction]).unwrap();
|
||||
row.push(correction);
|
||||
}
|
||||
wtr.write_record(&row).unwrap();
|
||||
}
|
||||
wtr.flush().unwrap();
|
||||
}
|
||||
|
|
42
src/dict.rs
42
src/dict.rs
|
@ -34,14 +34,11 @@ impl BuiltIn {
|
|||
}
|
||||
|
||||
let word = word_token.token();
|
||||
let mut corrections = if let Some(correction) = self.correct_with_dict(word) {
|
||||
match self.correct_with_vars(correction) {
|
||||
Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]),
|
||||
Some(correction @ Status::Corrections(_)) => correction,
|
||||
Some(Status::Invalid) => {
|
||||
unreachable!("correct_with_vars should always have valid suggestions")
|
||||
}
|
||||
None => Status::Corrections(vec![Cow::Borrowed(correction)]),
|
||||
let mut corrections = if let Some(corrections) = self.correct_with_dict(word) {
|
||||
if corrections.is_empty() {
|
||||
Status::Invalid
|
||||
} else {
|
||||
self.chain_with_vars(corrections)?
|
||||
}
|
||||
} else {
|
||||
self.correct_with_vars(word)?
|
||||
|
@ -54,7 +51,7 @@ impl BuiltIn {
|
|||
|
||||
#[cfg(feature = "dict")]
|
||||
// Not using `Status` to avoid the allocations
|
||||
fn correct_with_dict(&self, word: &str) -> Option<&'static str> {
|
||||
fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> {
|
||||
if typos_dict::WORD_RANGE.contains(&word.len()) {
|
||||
map_lookup(&typos_dict::WORD_DICTIONARY, word)
|
||||
} else {
|
||||
|
@ -63,10 +60,35 @@ impl BuiltIn {
|
|||
}
|
||||
|
||||
#[cfg(not(feature = "dict"))]
|
||||
fn correct_with_dict(&self, _word: &str) -> Option<&'static str> {
|
||||
fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Expands each dictionary correction through the variant lookup.
///
/// Every entry of `corrections` is passed to `correct_with_vars`:
/// - if that returns its own list of corrections, those replace the entry;
/// - if it returns `Status::Valid` or `None`, the entry is kept unchanged.
///
/// When the combined list does not hold exactly one item it is sorted and
/// de-duplicated. This function always returns `Some(Status::Corrections(..))`.
#[cfg(feature = "vars")]
fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Option<Status<'static>> {
    let mut expanded = Vec::new();
    for &candidate in corrections {
        match self.correct_with_vars(candidate) {
            // The variant lookup supplied replacements: use them instead of the candidate.
            Some(Status::Corrections(variants)) => expanded.extend(variants),
            Some(Status::Invalid) => {
                unreachable!("correct_with_vars should always have valid suggestions")
            }
            // No replacement offered: keep the dictionary correction itself.
            Some(Status::Valid) | None => expanded.push(Cow::Borrowed(candidate)),
        }
    }

    // A single entry is already sorted and unique, so skip the extra work.
    if expanded.len() != 1 {
        expanded.sort_unstable();
        expanded.dedup();
    }
    debug_assert!(!expanded.is_empty());

    Some(Status::Corrections(expanded))
}
|
||||
|
||||
#[cfg(not(feature = "vars"))]
|
||||
fn chain_with_vars(&self, corrections: &[&str]) -> Option<Status<'static>> {
|
||||
Status::Corrections(corrections.map(|c| Cow::Borrowed(correction).collect()))
|
||||
}
|
||||
|
||||
#[cfg(feature = "vars")]
|
||||
fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
|
||||
if typos_vars::WORD_RANGE.contains(&word.len()) {
|
||||
|
|
Loading…
Reference in a new issue