feat(dict): Preserve correction order

We want to be able to recommend more likely corrections first,
e.g. for "poped" we want to recommend "popped" before "pooped".
This commit is contained in:
Martin Fischer 2023-06-26 21:33:59 +02:00
parent 357aa55c6c
commit 8d026ac23e
3 changed files with 40 additions and 8 deletions

29
Cargo.lock generated
View file

@ -233,7 +233,7 @@ dependencies = [
"bitflags", "bitflags",
"clap_derive 3.2.18", "clap_derive 3.2.18",
"clap_lex 0.2.4", "clap_lex 0.2.4",
"indexmap", "indexmap 1.9.2",
"once_cell", "once_cell",
"strsim 0.10.0", "strsim 0.10.0",
"termcolor", "termcolor",
@ -615,6 +615,12 @@ dependencies = [
"termcolor", "termcolor",
] ]
[[package]]
name = "equivalent"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88bffebc5d80432c9b140ee17875ff173a8ab62faad5b257da912bd2f6c1c0a1"
[[package]] [[package]]
name = "errno" name = "errno"
version = "0.2.8" version = "0.2.8"
@ -733,6 +739,12 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.4.1" version = "0.4.1"
@ -826,7 +838,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"hashbrown", "hashbrown 0.12.3",
]
[[package]]
name = "indexmap"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d"
dependencies = [
"equivalent",
"hashbrown 0.14.0",
] ]
[[package]] [[package]]
@ -1584,7 +1606,7 @@ version = "0.19.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
dependencies = [ dependencies = [
"indexmap", "indexmap 1.9.2",
"serde", "serde",
"serde_spanned", "serde_spanned",
"toml_datetime", "toml_datetime",
@ -1685,6 +1707,7 @@ dependencies = [
"csv", "csv",
"dictgen", "dictgen",
"edit-distance", "edit-distance",
"indexmap 2.0.0",
"itertools", "itertools",
"snapbox", "snapbox",
"unicase", "unicase",

View file

@ -24,3 +24,4 @@ codegenrs = "2.0"
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen"] } dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen"] }
varcon = { version = "^0.6", path = "../varcon" } varcon = { version = "^0.6", path = "../varcon" }
snapbox = { version = "0.4.11", features = ["path"] } snapbox = { version = "0.4.11", features = ["path"] }
indexmap = "2.0.0"

View file

@ -1,10 +1,10 @@
use indexmap::IndexSet;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet; use std::collections::HashSet;
use unicase::UniCase; use unicase::UniCase;
type Dict = BTreeMap<UniCase<String>, BTreeSet<String>>; type Dict = BTreeMap<UniCase<String>, IndexSet<String>>;
#[test] #[test]
fn verify() { fn verify() {
@ -62,7 +62,7 @@ fn dict_from_iter<S: Into<String>>(
// duplicate entries are merged // duplicate entries are merged
dict.entry(typo) dict.entry(typo)
.or_insert_with(BTreeSet::new) .or_default()
.extend(corrections.into_iter().map(|c| { .extend(corrections.into_iter().map(|c| {
let mut c = c.into(); let mut c = c.into();
c.make_ascii_lowercase(); c.make_ascii_lowercase();
@ -82,7 +82,7 @@ fn process<S: Into<String>>(
.into_iter() .into_iter()
.filter(|(t, _)| is_word(t)) .filter(|(t, _)| is_word(t))
.filter_map(|(t, c)| { .filter_map(|(t, c)| {
let new_c: BTreeSet<_> = c.into_iter().filter(|c| is_word(c)).collect(); let new_c: IndexSet<_> = c.into_iter().filter(|c| is_word(c)).collect();
if new_c.is_empty() { if new_c.is_empty() {
None None
} else { } else {
@ -112,7 +112,7 @@ fn process<S: Into<String>>(
} }
}) })
.map(|(typo, corrections)| { .map(|(typo, corrections)| {
let mut new_corrections = BTreeSet::new(); let mut new_corrections = IndexSet::new();
for correction in corrections { for correction in corrections {
let correction = word_variants let correction = word_variants
.get(correction.as_str()) .get(correction.as_str())
@ -145,6 +145,14 @@ fn process<S: Into<String>>(
.collect() .collect()
} }
#[test]
fn test_preserve_correction_order() {
    // The corrections are deliberately supplied in non-alphabetical order
    // ("xyz" before "abc"): a container that sorts its entries would hand
    // them back as "abc", "xyz" and fail the assertions below.
    let key = UniCase::new(String::from("foo"));
    let dict = process([("foo", ["xyz", "abc"])]);

    // Walk the stored corrections for "foo" in iteration order and check
    // that they come back in the order they were given.
    let stored = dict.get(&key).unwrap();
    let mut remaining = stored.iter();
    assert_eq!(remaining.next().unwrap(), "xyz");
    assert_eq!(remaining.next().unwrap(), "abc");
}
#[test] #[test]
fn test_merge_duplicates() { fn test_merge_duplicates() {
assert_eq!( assert_eq!(