Merge pull request #740 from epage/compat

feat(dict): Pull in codespell items
This commit is contained in:
Ed Page 2023-06-08 10:01:52 -05:00 committed by GitHub
commit 7384c2cd19
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 93683 additions and 11166 deletions

1
Cargo.lock generated
View file

@@ -334,6 +334,7 @@ dependencies = [
"dictgen", "dictgen",
"itertools", "itertools",
"snapbox", "snapbox",
"typos",
"unicase", "unicase",
] ]

View file

@@ -25,3 +25,4 @@ itertools = "0.10"
codegenrs = "2.0" codegenrs = "2.0"
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen"] } dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen"] }
snapbox = { version = "0.4.11", features = ["path"] } snapbox = { version = "0.4.11", features = ["path"] }
typos = { path = "../typos" }

File diff suppressed because it is too large Load diff

View file

@@ -10,6 +10,36 @@ fn codegen() {
snapbox::assert_eq_path("./src/dict_codegen.rs", &content); snapbox::assert_eq_path("./src/dict_codegen.rs", &content);
} }
#[test]
fn compat() {
    use std::fmt::Write as _;

    // Build a codespell-compatible CSV: one line per typo,
    // `typo,correction1,correction2,...`, all lowercased.
    let mut csv = String::new();
    for (typo, corrections) in parse_dict(DICT) {
        // Keep only entries where the typo and every correction are
        // single words by this crate's tokenizer rules.
        if !is_word(typo) || !corrections.iter().copied().all(is_word) {
            continue;
        }
        write!(csv, "{}", typo.to_lowercase()).unwrap();
        for correction in corrections {
            write!(csv, ",{}", correction.to_lowercase()).unwrap();
        }
        writeln!(csv).unwrap();
    }
    snapbox::assert_eq_path("./assets/compatible.csv", &csv);
}
/// Returns `true` when `word` tokenizes to exactly one token and contains
/// no underscore.
// NOTE(review): underscores are rejected explicitly — presumably the
// tokenizer would otherwise accept them as part of a single token; confirm
// against `typos::tokens::Tokenizer` behavior.
fn is_word(word: &str) -> bool {
    if word.contains('_') {
        return false;
    }
    let tokenizer = typos::tokens::Tokenizer::new();
    tokenizer
        .parse_str(word)
        .flat_map(|token| token.split())
        .count()
        == 1
}
fn generate<W: std::io::Write>(file: &mut W) { fn generate<W: std::io::Write>(file: &mut W) {
writeln!( writeln!(
file, file,

View file

@@ -4,3 +4,4 @@ hardlinked,filesystem term
referer,http header field referer,http header field
deques,noun deques,noun
dequeues,verb dequeues,verb
ons,so `add-ons` works

1 nilable used in ruby community
4 referer http header field
5 deques noun
6 dequeues verb
7 ons so `add-ons` works

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@@ -1,9 +1,10 @@
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet; use std::collections::HashSet;
use unicase::UniCase; use unicase::UniCase;
type Dict = BTreeMap<UniCase<String>, Vec<String>>; type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;
#[test] #[test]
fn verify() { fn verify() {
@@ -30,7 +31,9 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let mut typo = i.next().expect("typo").to_owned(); let mut typo = i.next().expect("typo").to_owned();
typo.make_ascii_lowercase(); typo.make_ascii_lowercase();
let typo = UniCase::new(typo); let typo = UniCase::new(typo);
rows.entry(typo).or_insert_with(Vec::new).extend(i.map(|c| { rows.entry(typo)
.or_insert_with(BTreeSet::new)
.extend(i.map(|c| {
let mut c = c.to_owned(); let mut c = c.to_owned();
c.make_ascii_lowercase(); c.make_ascii_lowercase();
c c
@@ -41,7 +44,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
.into_iter() .into_iter()
.filter(|(t, _)| is_word(t)) .filter(|(t, _)| is_word(t))
.filter_map(|(t, c)| { .filter_map(|(t, c)| {
let new_c: Vec<_> = c.into_iter().filter(|c| is_word(c)).collect(); let new_c: BTreeSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
if new_c.is_empty() { if new_c.is_empty() {
None None
} else { } else {
@@ -53,7 +56,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
let varcon_words = varcon_words(); let varcon_words = varcon_words();
let allowed_words = allowed_words(); let allowed_words = allowed_words();
let word_variants = proper_word_variants(); let word_variants = proper_word_variants();
let rows: Dict = rows let rows: Vec<_> = rows
.into_iter() .into_iter()
.filter(|(typo, _)| { .filter(|(typo, _)| {
let is_disallowed = varcon_words.contains(&unicase::UniCase::new(typo)); let is_disallowed = varcon_words.contains(&unicase::UniCase::new(typo));
@@ -71,29 +74,43 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
} }
}) })
.map(|(typo, corrections)| { .map(|(typo, corrections)| {
let mut new_corrections = vec![]; let mut new_corrections = BTreeSet::new();
for correction in corrections { for correction in corrections {
let correction = word_variants let correction = word_variants
.get(correction.as_str()) .get(correction.as_str())
.and_then(|words| find_best_match(&typo, correction.as_str(), words)) .and_then(|words| find_best_match(&typo, correction.as_str(), words))
.unwrap_or(&correction); .unwrap_or(&correction);
new_corrections.push(correction.to_owned()); new_corrections.insert(correction.to_owned());
} }
(typo, new_corrections) (typo, new_corrections)
}) })
.collect(); .collect();
let mut dict = Dict::new();
for (bad, good) in rows {
let current = dict.entry(bad).or_default();
current.extend(good);
}
let corrections: std::collections::HashSet<_> = let corrections: HashMap<_, _> = dict
rows.values().flatten().map(ToOwned::to_owned).collect(); .iter()
let rows: Vec<_> = rows .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
.collect();
let rows: Vec<_> = dict
.into_iter() .into_iter()
.filter(|(typo, _)| !corrections.contains(typo.as_str())) .filter(|(typo, _)| {
if let Some(correction) = corrections.get(typo.as_str()) {
eprintln!("{typo} <-> {correction} cycle detected");
false
} else {
true
}
})
.collect(); .collect();
let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file); let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
for (typo, corrections) in rows { for (typo, corrections) in rows {
let mut row = corrections; let mut row = vec![typo.as_str().to_owned()];
row.insert(0, typo.as_str().to_owned()); row.extend(corrections);
wtr.write_record(&row).unwrap(); wtr.write_record(&row).unwrap();
} }
wtr.flush().unwrap(); wtr.flush().unwrap();