Mirror of https://github.com/crate-ci/typos.git, synced 2024-12-23 08:02:15 -05:00

Merge pull request #740 from epage/compat

feat(dict): Pull in codespell items

Commit 7384c2cd19
8 changed files with 93683 additions and 11166 deletions

Cargo.lock (generated, 1 change)

@@ -334,6 +334,7 @@ dependencies = [
  "dictgen",
  "itertools",
  "snapbox",
+ "typos",
  "unicase",
 ]

@@ -25,3 +25,4 @@ itertools = "0.10"
 codegenrs = "2.0"
 dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen"] }
 snapbox = { version = "0.4.11", features = ["path"] }
+typos = { path = "../typos" }

crates/codespell-dict/assets/compatible.csv (new file, 40952 lines)

File diff suppressed because it is too large.

@@ -10,6 +10,36 @@ fn codegen() {
     snapbox::assert_eq_path("./src/dict_codegen.rs", &content);
 }
 
+#[test]
+fn compat() {
+    use std::fmt::Write as _;
+
+    let mut content = String::new();
+    for (bad, good) in parse_dict(DICT) {
+        if !is_word(bad) {
+            continue;
+        }
+        if !good.iter().copied().all(is_word) {
+            continue;
+        }
+        let bad = bad.to_lowercase();
+        write!(content, "{bad}").unwrap();
+        for good in good {
+            let good = good.to_lowercase();
+            write!(content, ",{good}").unwrap();
+        }
+        writeln!(content).unwrap();
+    }
+
+    snapbox::assert_eq_path("./assets/compatible.csv", &content);
+}
+
+fn is_word(word: &str) -> bool {
+    let tokenizer = typos::tokens::Tokenizer::new();
+
+    tokenizer.parse_str(word).flat_map(|t| t.split()).count() == 1 && !word.contains('_')
+}
+
 fn generate<W: std::io::Write>(file: &mut W) {
     writeln!(
         file,
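
For context, a minimal standalone sketch of what the new is_word gate does when building compatible.csv; the sample inputs and their expected classifications are illustrative assumptions, not output taken from this PR (assumes the typos crate as a dependency):

// Hypothetical demo of the `is_word` filter above: only entries that
// tokenize to exactly one word (and contain no underscore) are emitted.
fn main() {
    let tokenizer = typos::tokens::Tokenizer::new();
    let is_word = |word: &str| {
        tokenizer.parse_str(word).flat_map(|t| t.split()).count() == 1 && !word.contains('_')
    };

    assert!(is_word("recieve")); // a single word: eligible for the compat dict
    assert!(!is_word("too see")); // tokenizes to two words: skipped
    assert!(!is_word("foo_bar")); // contains an underscore: skipped
}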

crates/typos-dict/assets/allowed.csv (vendored, 1 change)

@@ -4,3 +4,4 @@ hardlinked,filesystem term
 referer,http header field
 deques,noun
 dequeues,verb
+ons,so `add-ons` works

crates/typos-dict/assets/words.csv (vendored, 12566 changes)

File diff suppressed because it is too large.

@@ -1,9 +1,10 @@
 use std::collections::BTreeMap;
+use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use unicase::UniCase;
 
-type Dict = BTreeMap<UniCase<String>, Vec<String>>;
+type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>;
 
 #[test]
 fn verify() {
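
Moving the corrections from Vec<String> to BTreeSet<String> both deduplicates entries merged from multiple sources and fixes the iteration order. A self-contained sketch (with made-up words) of why that matters for generated output:

use std::collections::BTreeSet;

fn main() {
    // Merging two dictionaries can produce repeated, unordered corrections.
    let merged = vec!["colour", "color", "colour"];

    // A BTreeSet drops the duplicate and iterates in sorted order, so
    // regenerating the CSV yields byte-identical output every run.
    let set: BTreeSet<&str> = merged.into_iter().collect();
    assert_eq!(set.into_iter().collect::<Vec<_>>(), vec!["color", "colour"]);
}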
@@ -30,18 +31,20 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
         let mut typo = i.next().expect("typo").to_owned();
         typo.make_ascii_lowercase();
         let typo = UniCase::new(typo);
-        rows.entry(typo).or_insert_with(Vec::new).extend(i.map(|c| {
-            let mut c = c.to_owned();
-            c.make_ascii_lowercase();
-            c
-        }));
+        rows.entry(typo)
+            .or_insert_with(BTreeSet::new)
+            .extend(i.map(|c| {
+                let mut c = c.to_owned();
+                c.make_ascii_lowercase();
+                c
+            }));
     });
 
     let rows: Dict = rows
         .into_iter()
         .filter(|(t, _)| is_word(t))
         .filter_map(|(t, c)| {
-            let new_c: Vec<_> = c.into_iter().filter(|c| is_word(c)).collect();
+            let new_c: BTreeSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
             if new_c.is_empty() {
                 None
             } else {
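
The entry(...).or_insert_with(BTreeSet::new).extend(...) chain accumulates every occurrence of a typo into a single set. A minimal sketch of the pattern with hypothetical entries (assumes the unicase crate):

use std::collections::{BTreeMap, BTreeSet};
use unicase::UniCase;

fn main() {
    let mut rows: BTreeMap<UniCase<String>, BTreeSet<String>> = BTreeMap::new();
    for (typo, correction) in [("teh", "the"), ("Teh", "the"), ("teh", "then")] {
        rows.entry(UniCase::new(typo.to_owned()))
            .or_insert_with(BTreeSet::new)
            .insert(correction.to_owned());
    }

    // "teh" and "Teh" collapse into one case-insensitive key whose set
    // holds {"the", "then"}, with the duplicate "the" absorbed.
    assert_eq!(rows.len(), 1);
    assert_eq!(rows.values().next().unwrap().len(), 2);
}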
@@ -53,7 +56,7 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
     let varcon_words = varcon_words();
     let allowed_words = allowed_words();
     let word_variants = proper_word_variants();
-    let rows: Dict = rows
+    let rows: Vec<_> = rows
         .into_iter()
         .filter(|(typo, _)| {
             let is_disallowed = varcon_words.contains(&unicase::UniCase::new(typo));
@@ -71,29 +74,43 @@ fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
             }
         })
         .map(|(typo, corrections)| {
-            let mut new_corrections = vec![];
+            let mut new_corrections = BTreeSet::new();
             for correction in corrections {
                 let correction = word_variants
                     .get(correction.as_str())
                     .and_then(|words| find_best_match(&typo, correction.as_str(), words))
                     .unwrap_or(&correction);
-                new_corrections.push(correction.to_owned());
+                new_corrections.insert(correction.to_owned());
             }
             (typo, new_corrections)
         })
         .collect();
+    let mut dict = Dict::new();
+    for (bad, good) in rows {
+        let current = dict.entry(bad).or_default();
+        current.extend(good);
+    }
 
-    let corrections: std::collections::HashSet<_> =
-        rows.values().flatten().map(ToOwned::to_owned).collect();
-    let rows: Vec<_> = rows
+    let corrections: HashMap<_, _> = dict
+        .iter()
+        .flat_map(|(bad, good)| good.iter().map(|good| (good.to_owned(), bad.to_owned())))
+        .collect();
+    let rows: Vec<_> = dict
         .into_iter()
-        .filter(|(typo, _)| !corrections.contains(typo.as_str()))
+        .filter(|(typo, _)| {
+            if let Some(correction) = corrections.get(typo.as_str()) {
+                eprintln!("{typo} <-> {correction} cycle detected");
+                false
+            } else {
+                true
+            }
+        })
         .collect();
 
     let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(file);
     for (typo, corrections) in rows {
-        let mut row = corrections;
-        row.insert(0, typo.as_str().to_owned());
+        let mut row = vec![typo.as_str().to_owned()];
+        row.extend(corrections);
         wtr.write_record(&row).unwrap();
     }
     wtr.flush().unwrap();
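
The rewritten filter turns a silent drop into a diagnostic: the dictionary is inverted into a correction -> typo map, and any typo that is itself some other entry's correction gets reported as a cycle and removed, since such a pair would rewrite words back and forth. A standalone sketch of the same idea over hypothetical data:

use std::collections::{BTreeMap, BTreeSet, HashMap};

fn main() {
    let mut dict: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    dict.entry("adn".into()).or_default().insert("and".into());
    dict.entry("and".into()).or_default().insert("adn".into()); // forms a cycle

    // Invert: each correction points back at the typo that maps to it.
    let corrections: HashMap<String, String> = dict
        .iter()
        .flat_map(|(bad, good)| good.iter().map(move |g| (g.clone(), bad.clone())))
        .collect();

    // Drop any typo that is itself listed as a correction elsewhere.
    let kept: Vec<_> = dict
        .into_iter()
        .filter(|(typo, _)| {
            if let Some(other) = corrections.get(typo) {
                eprintln!("{typo} <-> {other} cycle detected");
                false
            } else {
                true
            }
        })
        .collect();

    // Both toy entries participate in the cycle, so both are removed.
    assert!(kept.is_empty());
}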