feat(dict): Perform case-insensitive comparisons

This commit is contained in:
Ed Page 2019-04-17 07:30:17 -06:00
parent 719cc7d43b
commit af66072272
4 changed files with 55 additions and 14 deletions

17
Cargo.lock generated
View file

@ -162,6 +162,7 @@ dependencies = [
"serde_derive 1.0.85 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.85 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.36 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.36 (registry+https://github.com/rust-lang/crates.io-index)",
"structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", "structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)",
"unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -434,6 +435,7 @@ version = "0.7.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -786,6 +788,14 @@ name = "ucd-util"
version = "0.1.3" version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicase"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "unicode-segmentation" name = "unicode-segmentation"
version = "1.2.1" version = "1.2.1"
@ -819,6 +829,11 @@ name = "vec_map"
version = "0.8.1" version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "version_check"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "void" name = "void"
version = "1.0.2" version = "1.0.2"
@ -955,12 +970,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
"checksum treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41" "checksum treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41"
"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86"
"checksum unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7f4765f83163b74f957c797ad9253caf97f103fb064d3999aea9568d09fc8a33"
"checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" "checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1"
"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56"
"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737"
"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a"
"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd"
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1" "checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1"
"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0"

View file

@ -20,7 +20,7 @@ failure = "0.1"
structopt = "0.2" structopt = "0.2"
clap = "2" clap = "2"
ignore = "0.4" ignore = "0.4"
phf = "0.7" phf = { version = "0.7", features = ["unicase"] }
regex = "1.0" regex = "1.0"
lazy_static = "1.2.0" lazy_static = "1.2.0"
grep-searcher = "0.1" grep-searcher = "0.1"
@ -28,6 +28,7 @@ serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
serde_json = "1.0" serde_json = "1.0"
itertools = "0.8" itertools = "0.8"
unicase = "1.1"
[dev-dependencies] [dev-dependencies]
assert_fs = "0.10" assert_fs = "0.10"
@ -35,3 +36,4 @@ assert_fs = "0.10"
[build-dependencies] [build-dependencies]
phf_codegen = "0.7" phf_codegen = "0.7"
csv = "1.0" csv = "1.0"
unicase = "1.1"

View file

@ -10,15 +10,21 @@ fn main() {
let mut file = BufWriter::new(File::create(&path).unwrap()); let mut file = BufWriter::new(File::create(&path).unwrap());
println!("rerun-if-changed=./assets/words.csv"); println!("rerun-if-changed=./assets/words.csv");
write!(&mut file, "static DICTIONARY: phf::Map<&'static str, &'static str> = ").unwrap(); write!(&mut file, "use unicase::UniCase;").unwrap();
write!(
&mut file,
"static DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
)
.unwrap();
let mut builder = phf_codegen::Map::new(); let mut builder = phf_codegen::Map::new();
let records: Vec<_> = csv::Reader::from_reader(CORPUS).records().map(|r| r.unwrap()).collect(); let records: Vec<_> = csv::Reader::from_reader(CORPUS)
.records()
.map(|r| r.unwrap())
.collect();
for record in &records { for record in &records {
let value = format!(r#""{}""#, &record[1]); let value = format!(r#""{}""#, &record[1]);
builder.entry(&record[0], &value); builder.entry(unicase::UniCase(&record[0]), &value);
} }
builder builder.build(&mut file).unwrap();
.build(&mut file)
.unwrap();
write!(&mut file, ";\n").unwrap(); write!(&mut file, ";\n").unwrap();
} }

View file

@ -1,18 +1,34 @@
include!(concat!(env!("OUT_DIR"), "/codegen.rs")); include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
pub struct Dictionary { pub struct Dictionary {}
}
impl Dictionary { impl Dictionary {
pub fn new() -> Self { pub fn new() -> Self {
Dictionary { } Dictionary {}
} }
pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> { pub fn correct_str<'s, 'w>(&'s self, word: &'w str) -> Option<&'s str> {
DICTIONARY.get(word).map(|s| *s) map_lookup(&DICTIONARY, word)
} }
pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s str> { pub fn correct_bytes<'s, 'w>(&'s self, word: &'w [u8]) -> Option<&'s str> {
std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| *s) std::str::from_utf8(word)
.ok()
.and_then(|word| self.correct_str(word))
}
}
/// Looks up `key` in `map`, wrapping it in `UniCase` so the comparison is
/// case-insensitive, and returns the stored value if there is a match.
///
/// `key` may have any lifetime; only the map and the returned value are
/// `'static`.
fn map_lookup(
map: &'static phf::Map<UniCase<&'static str>, &'static str>,
key: &str,
) -> Option<&'static str> {
// SAFETY: The transmute only extends the lifetime of `key` to `'static` so
// that it can be wrapped in the map's key type, `UniCase<&'static str>`.
// This should be safe as `get` will not store the reference with the
// expanded lifetime, and the extended `key` is not used after the lookup.
//
// The workaround is needed because `Borrow` is overly strict: there can't be
// an impl of `Borrow<&'a str>` for `&'static str`.
//
// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
unsafe {
let key = ::std::mem::transmute::<_, &'static str>(key);
// Dereference so the stored `&'static str` is returned by value.
map.get(&UniCase(key)).map(|s| *s)
} }
} }