mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-28 20:11:05 -05:00
refactor(dict): Allow 0..n corrections in BuiltIn
The main use case is taking `ther` -> `there` and adding `the` and `their`.
This commit is contained in:
parent
444d2cca91
commit
fb0dac4297
6 changed files with 33747 additions and 33668 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1526,6 +1526,7 @@ version = "1.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"codegenrs",
|
"codegenrs",
|
||||||
"csv",
|
"csv",
|
||||||
|
"itertools 0.10.0",
|
||||||
"phf",
|
"phf",
|
||||||
"phf_codegen",
|
"phf_codegen",
|
||||||
"structopt",
|
"structopt",
|
||||||
|
|
|
@ -18,6 +18,7 @@ codecov = { repository = "crate-ci/typos" }
|
||||||
phf = { version = "0.8", features = ["unicase"] }
|
phf = { version = "0.8", features = ["unicase"] }
|
||||||
phf_codegen = "0.8"
|
phf_codegen = "0.8"
|
||||||
csv = "1.1"
|
csv = "1.1"
|
||||||
|
itertools = "0.10"
|
||||||
unicase = "2.5"
|
unicase = "2.5"
|
||||||
codegenrs = "1.0"
|
codegenrs = "1.0"
|
||||||
structopt = "0.3"
|
structopt = "0.3"
|
||||||
|
|
|
@ -18,20 +18,26 @@ fn generate<W: std::io::Write>(file: &mut W) {
|
||||||
|
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
|
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut builder = phf_codegen::Map::new();
|
let mut builder = phf_codegen::Map::new();
|
||||||
let records: Vec<_> = csv::ReaderBuilder::new()
|
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||||
.has_headers(false)
|
.has_headers(false)
|
||||||
|
.flexible(true)
|
||||||
.from_reader(DICT)
|
.from_reader(DICT)
|
||||||
.records()
|
.records()
|
||||||
.map(|r| r.unwrap())
|
.map(|r| r.unwrap())
|
||||||
.collect();
|
.collect();
|
||||||
for record in &records {
|
for record in &records {
|
||||||
smallest = std::cmp::min(smallest, record[0].len());
|
let mut record_fields = record.iter();
|
||||||
largest = std::cmp::max(largest, record[0].len());
|
let key = record_fields.next().unwrap();
|
||||||
let value = format!(r#""{}""#, &record[1]);
|
smallest = std::cmp::min(smallest, key.len());
|
||||||
|
largest = std::cmp::max(largest, key.len());
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
|
||||||
|
);
|
||||||
builder.entry(unicase::UniCase::new(&record[0]), &value);
|
builder.entry(unicase::UniCase::new(&record[0]), &value);
|
||||||
}
|
}
|
||||||
let codegenned = builder.build();
|
let codegenned = builder.build();
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -4,26 +4,32 @@ use std::collections::HashSet;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
|
||||||
let mut wtr = csv::Writer::from_writer(file);
|
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
|
||||||
|
|
||||||
let disallowed_typos = varcon_words();
|
let disallowed_typos = varcon_words();
|
||||||
let word_variants = proper_word_variants();
|
let word_variants = proper_word_variants();
|
||||||
|
|
||||||
let mut reader = csv::ReaderBuilder::new()
|
let mut reader = csv::ReaderBuilder::new()
|
||||||
.has_headers(false)
|
.has_headers(false)
|
||||||
|
.flexible(true)
|
||||||
.from_reader(dict);
|
.from_reader(dict);
|
||||||
for record in reader.records() {
|
for record in reader.records() {
|
||||||
let record = record.unwrap();
|
let record = record.unwrap();
|
||||||
let typo = &record[0];
|
let mut record_fields = record.iter();
|
||||||
let correction = &record[1];
|
let typo = record_fields.next().unwrap();
|
||||||
if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
|
if disallowed_typos.contains(&unicase::UniCase::new(typo)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut row = vec![typo];
|
||||||
|
for correction in record_fields {
|
||||||
let correction = word_variants
|
let correction = word_variants
|
||||||
.get(correction)
|
.get(correction)
|
||||||
.and_then(|words| find_best_match(typo, correction, words))
|
.and_then(|words| find_best_match(typo, correction, words))
|
||||||
.unwrap_or(correction);
|
.unwrap_or(correction);
|
||||||
wtr.write_record(&[typo, correction]).unwrap();
|
row.push(correction);
|
||||||
|
}
|
||||||
|
wtr.write_record(&row).unwrap();
|
||||||
}
|
}
|
||||||
wtr.flush().unwrap();
|
wtr.flush().unwrap();
|
||||||
}
|
}
|
||||||
|
|
42
src/dict.rs
42
src/dict.rs
|
@ -34,14 +34,11 @@ impl BuiltIn {
|
||||||
}
|
}
|
||||||
|
|
||||||
let word = word_token.token();
|
let word = word_token.token();
|
||||||
let mut corrections = if let Some(correction) = self.correct_with_dict(word) {
|
let mut corrections = if let Some(corrections) = self.correct_with_dict(word) {
|
||||||
match self.correct_with_vars(correction) {
|
if corrections.is_empty() {
|
||||||
Some(Status::Valid) => Status::Corrections(vec![Cow::Borrowed(correction)]),
|
Status::Invalid
|
||||||
Some(correction @ Status::Corrections(_)) => correction,
|
} else {
|
||||||
Some(Status::Invalid) => {
|
self.chain_with_vars(corrections)?
|
||||||
unreachable!("correct_with_vars should always have valid suggestions")
|
|
||||||
}
|
|
||||||
None => Status::Corrections(vec![Cow::Borrowed(correction)]),
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
self.correct_with_vars(word)?
|
self.correct_with_vars(word)?
|
||||||
|
@ -54,7 +51,7 @@ impl BuiltIn {
|
||||||
|
|
||||||
#[cfg(feature = "dict")]
|
#[cfg(feature = "dict")]
|
||||||
// Not using `Status` to avoid the allocations
|
// Not using `Status` to avoid the allocations
|
||||||
fn correct_with_dict(&self, word: &str) -> Option<&'static str> {
|
fn correct_with_dict(&self, word: &str) -> Option<&'static [&'static str]> {
|
||||||
if typos_dict::WORD_RANGE.contains(&word.len()) {
|
if typos_dict::WORD_RANGE.contains(&word.len()) {
|
||||||
map_lookup(&typos_dict::WORD_DICTIONARY, word)
|
map_lookup(&typos_dict::WORD_DICTIONARY, word)
|
||||||
} else {
|
} else {
|
||||||
|
@ -63,10 +60,35 @@ impl BuiltIn {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(feature = "dict"))]
|
#[cfg(not(feature = "dict"))]
|
||||||
fn correct_with_dict(&self, _word: &str) -> Option<&'static str> {
|
fn correct_with_dict(&self, _word: &str) -> Option<&'static [&'static str]> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "vars")]
/// Runs every dictionary correction through the variant lookup and gathers
/// the resulting suggestions into a single `Status::Corrections`.
///
/// A correction that the variant lookup reports as valid (or has no entry
/// for) is kept as-is; a correction that has variant suggestions is replaced
/// by those suggestions.
fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Option<Status<'static>> {
    let mut suggestions = Vec::new();
    for &candidate in corrections {
        match self.correct_with_vars(candidate) {
            // Nothing to refine: keep the dictionary's own correction.
            Some(Status::Valid) | None => suggestions.push(Cow::Borrowed(candidate)),
            // The variant lookup supplies its own replacements for this correction.
            Some(Status::Corrections(variants)) => suggestions.extend(variants),
            Some(Status::Invalid) => {
                unreachable!("correct_with_vars should always have valid suggestions")
            }
        }
    }

    // A single suggestion cannot contain duplicates, so skip the sort/dedup pass.
    if suggestions.len() != 1 {
        suggestions.sort_unstable();
        suggestions.dedup();
    }
    debug_assert!(!suggestions.is_empty());

    Some(Status::Corrections(suggestions))
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "vars"))]
|
||||||
|
fn chain_with_vars(&self, corrections: &[&str]) -> Option<Status<'static>> {
|
||||||
|
Status::Corrections(corrections.map(|c| Cow::Borrowed(correction).collect()))
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(feature = "vars")]
|
#[cfg(feature = "vars")]
|
||||||
fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
|
fn correct_with_vars(&self, word: &str) -> Option<Status<'static>> {
|
||||||
if typos_vars::WORD_RANGE.contains(&word.len()) {
|
if typos_vars::WORD_RANGE.contains(&word.len()) {
|
||||||
|
|
Loading…
Reference in a new issue