Merge pull request #302 from epage/trie

refactor(dict): Change typos-dict to trie
This commit is contained in:
Ed Page 2021-07-01 10:59:59 -05:00 committed by GitHub
commit 97015b3a95
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 91836 additions and 33843 deletions

113
Cargo.lock generated
View file

@ -23,7 +23,7 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98"
dependencies = [ dependencies = [
"getrandom 0.2.3", "getrandom",
"once_cell", "once_cell",
"version_check", "version_check",
] ]
@ -578,17 +578,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
[[package]]
name = "getrandom"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.3" version = "0.2.3"
@ -597,7 +586,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"libc", "libc",
"wasi 0.10.2+wasi-snapshot-preview1", "wasi",
] ]
[[package]] [[package]]
@ -913,26 +902,6 @@ dependencies = [
"phf_shared", "phf_shared",
] ]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand 0.7.3",
]
[[package]] [[package]]
name = "phf_shared" name = "phf_shared"
version = "0.8.0" version = "0.8.0"
@ -1066,20 +1035,6 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.16",
"libc",
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
"rand_pcg",
]
[[package]] [[package]]
name = "rand" name = "rand"
version = "0.8.3" version = "0.8.3"
@ -1087,19 +1042,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
dependencies = [ dependencies = [
"libc", "libc",
"rand_chacha 0.3.0", "rand_chacha",
"rand_core 0.6.2", "rand_core",
"rand_hc 0.3.0", "rand_hc",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core 0.5.1",
] ]
[[package]] [[package]]
@ -1109,16 +1054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d"
dependencies = [ dependencies = [
"ppv-lite86", "ppv-lite86",
"rand_core 0.6.2", "rand_core",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.16",
] ]
[[package]] [[package]]
@ -1127,16 +1063,7 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
dependencies = [ dependencies = [
"getrandom 0.2.3", "getrandom",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core 0.5.1",
] ]
[[package]] [[package]]
@ -1145,16 +1072,7 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73"
dependencies = [ dependencies = [
"rand_core 0.6.2", "rand_core",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
] ]
[[package]] [[package]]
@ -1400,7 +1318,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"libc", "libc",
"rand 0.8.3", "rand",
"redox_syscall", "redox_syscall",
"remove_dir_all", "remove_dir_all",
"winapi", "winapi",
@ -1548,8 +1466,8 @@ dependencies = [
name = "typos-dict" name = "typos-dict"
version = "0.5.0" version = "0.5.0"
dependencies = [ dependencies = [
"dictgen",
"log", "log",
"phf",
"unicase", "unicase",
] ]
@ -1559,9 +1477,8 @@ version = "1.3.0"
dependencies = [ dependencies = [
"codegenrs", "codegenrs",
"csv", "csv",
"dictgen",
"itertools 0.10.0", "itertools 0.10.0",
"phf",
"phf_codegen",
"structopt", "structopt",
"unicase", "unicase",
] ]
@ -1646,7 +1563,7 @@ version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
dependencies = [ dependencies = [
"getrandom 0.2.3", "getrandom",
] ]
[[package]] [[package]]
@ -1705,12 +1622,6 @@ dependencies = [
"winapi-util", "winapi-util",
] ]
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]] [[package]]
name = "wasi" name = "wasi"
version = "0.10.2+wasi-snapshot-preview1" version = "0.10.2+wasi-snapshot-preview1"

View file

@ -15,6 +15,6 @@ azure-devops = { project = "crate-ci", pipeline = "typos" }
codecov = { repository = "crate-ci/typos" } codecov = { repository = "crate-ci/typos" }
[dependencies] [dependencies]
phf = { version = "0.8", features = ["unicase"] }
unicase = "2.5" unicase = "2.5"
log = "0.4" log = "0.4"
dictgen = { version = "0.1", path = "../dictgen" }

View file

@ -18,10 +18,9 @@ codecov = { repository = "crate-ci/typos" }
disable-release = true disable-release = true
[dependencies] [dependencies]
phf = { version = "0.8", features = ["unicase"] }
phf_codegen = "0.8"
csv = "1.1" csv = "1.1"
itertools = "0.10" itertools = "0.10"
unicase = "2.5" unicase = "2.5"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" }

View file

@ -11,17 +11,7 @@ fn generate<W: std::io::Write>(file: &mut W) {
.unwrap(); .unwrap();
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap(); writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
writeln!(file).unwrap(); writeln!(file).unwrap();
writeln!(file, "use unicase::UniCase;").unwrap();
let mut smallest = usize::MAX;
let mut largest = usize::MIN;
writeln!(
file,
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
)
.unwrap();
let mut builder = phf_codegen::Map::new();
let records: Vec<_> = csv::ReaderBuilder::new() let records: Vec<_> = csv::ReaderBuilder::new()
.has_headers(false) .has_headers(false)
.flexible(true) .flexible(true)
@ -29,25 +19,20 @@ fn generate<W: std::io::Write>(file: &mut W) {
.records() .records()
.map(|r| r.unwrap()) .map(|r| r.unwrap())
.collect(); .collect();
for record in &records { dictgen::generate_trie(
file,
"WORD",
"&'static [&'static str]",
records.iter().map(|record| {
let mut record_fields = record.iter(); let mut record_fields = record.iter();
let key = record_fields.next().unwrap(); let key = record_fields.next().unwrap();
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let value = format!( let value = format!(
"&[{}]", "&[{}]",
itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ") itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
); );
builder.entry(unicase::UniCase::new(&record[0]), &value); (key, value)
} }),
let codegenned = builder.build(); 64,
writeln!(file, "{}", codegenned).unwrap();
writeln!(file, ";").unwrap();
writeln!(file).unwrap();
writeln!(
file,
"pub const WORD_RANGE: std::ops::RangeInclusive<usize> = {}..={};",
smallest, largest
) )
.unwrap(); .unwrap();
} }

File diff suppressed because it is too large Load diff

View file

@ -55,11 +55,7 @@ impl BuiltIn {
impl BuiltIn { impl BuiltIn {
// Not using `Status` to avoid the allocations // Not using `Status` to avoid the allocations
fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> { fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
if typos_dict::WORD_RANGE.contains(&word.len()) { typos_dict::WORD_TRIE.find(&word).copied()
map_lookup(&typos_dict::WORD_DICTIONARY, word)
} else {
None
}
} }
} }
@ -175,22 +171,6 @@ impl typos::Dictionary for BuiltIn {
} }
} }
/// Looks up `key` in a static `phf::Map` keyed by `UniCase<&'static str>`.
///
/// Returns a clone of the stored value when `map.get` finds an entry for
/// `key`, and `None` otherwise. The key may borrow a string with any
/// lifetime; it is only used for the duration of the lookup.
fn map_lookup<V: Clone>(
map: &'static phf::Map<UniCase<&'static str>, V>,
key: unicase::UniCase<&str>,
) -> Option<V> {
// SAFETY: the transmute changes only the lifetime parameter of the key
// (`UniCase<&str>` -> `UniCase<&'static str>`); the type is otherwise the
// same. This should be sound as long as `get` does not store the reference
// with the expanded lifetime, and the extended key never leaves this block.
//
// The workaround is needed because `Borrow` is overly strict: there can't
// be an impl that lets a map keyed by `&'static str` be queried with a
// shorter-lived `&'a str`.
//
// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
unsafe {
let key = ::std::mem::transmute::<_, unicase::UniCase<&'static str>>(key);
map.get(&key).cloned()
}
}
fn case_correct(correction: &mut Cow<'_, str>, case: Case) { fn case_correct(correction: &mut Cow<'_, str>, case: Case) {
match case { match case {
Case::Lower | Case::None => (), Case::Lower | Case::None => (),