mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-22 00:51:11 -05:00
refactor(dict): Change typos-dict to trie
This is +/- 15%, depending on the benchmark.
This commit is contained in:
parent
fa1119aa47
commit
c8d1058a71
6 changed files with 91836 additions and 33843 deletions
113
Cargo.lock
generated
113
Cargo.lock
generated
|
@ -23,7 +23,7 @@ version = "0.7.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98"
|
checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.2.3",
|
"getrandom",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"version_check",
|
"version_check",
|
||||||
]
|
]
|
||||||
|
@ -578,17 +578,6 @@ version = "1.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
|
checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "getrandom"
|
|
||||||
version = "0.1.16"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
|
|
||||||
dependencies = [
|
|
||||||
"cfg-if",
|
|
||||||
"libc",
|
|
||||||
"wasi 0.9.0+wasi-snapshot-preview1",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.2.3"
|
version = "0.2.3"
|
||||||
|
@ -597,7 +586,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"libc",
|
"libc",
|
||||||
"wasi 0.10.2+wasi-snapshot-preview1",
|
"wasi",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -913,26 +902,6 @@ dependencies = [
|
||||||
"phf_shared",
|
"phf_shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "phf_codegen"
|
|
||||||
version = "0.8.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
|
|
||||||
dependencies = [
|
|
||||||
"phf_generator",
|
|
||||||
"phf_shared",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "phf_generator"
|
|
||||||
version = "0.8.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
|
|
||||||
dependencies = [
|
|
||||||
"phf_shared",
|
|
||||||
"rand 0.7.3",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf_shared"
|
name = "phf_shared"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
|
@ -1066,20 +1035,6 @@ version = "0.5.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
|
checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rand"
|
|
||||||
version = "0.7.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
|
||||||
dependencies = [
|
|
||||||
"getrandom 0.1.16",
|
|
||||||
"libc",
|
|
||||||
"rand_chacha 0.2.2",
|
|
||||||
"rand_core 0.5.1",
|
|
||||||
"rand_hc 0.2.0",
|
|
||||||
"rand_pcg",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rand"
|
name = "rand"
|
||||||
version = "0.8.3"
|
version = "0.8.3"
|
||||||
|
@ -1087,19 +1042,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
|
checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"rand_chacha 0.3.0",
|
"rand_chacha",
|
||||||
"rand_core 0.6.2",
|
"rand_core",
|
||||||
"rand_hc 0.3.0",
|
"rand_hc",
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rand_chacha"
|
|
||||||
version = "0.2.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
|
||||||
dependencies = [
|
|
||||||
"ppv-lite86",
|
|
||||||
"rand_core 0.5.1",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1109,16 +1054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d"
|
checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ppv-lite86",
|
"ppv-lite86",
|
||||||
"rand_core 0.6.2",
|
"rand_core",
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rand_core"
|
|
||||||
version = "0.5.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
|
||||||
dependencies = [
|
|
||||||
"getrandom 0.1.16",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1127,16 +1063,7 @@ version = "0.6.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
|
checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.2.3",
|
"getrandom",
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rand_hc"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
|
||||||
dependencies = [
|
|
||||||
"rand_core 0.5.1",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1145,16 +1072,7 @@ version = "0.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73"
|
checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"rand_core 0.6.2",
|
"rand_core",
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rand_pcg"
|
|
||||||
version = "0.2.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
|
|
||||||
dependencies = [
|
|
||||||
"rand_core 0.5.1",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1400,7 +1318,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"libc",
|
"libc",
|
||||||
"rand 0.8.3",
|
"rand",
|
||||||
"redox_syscall",
|
"redox_syscall",
|
||||||
"remove_dir_all",
|
"remove_dir_all",
|
||||||
"winapi",
|
"winapi",
|
||||||
|
@ -1548,8 +1466,8 @@ dependencies = [
|
||||||
name = "typos-dict"
|
name = "typos-dict"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"dictgen",
|
||||||
"log",
|
"log",
|
||||||
"phf",
|
|
||||||
"unicase",
|
"unicase",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1559,9 +1477,8 @@ version = "1.3.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"codegenrs",
|
"codegenrs",
|
||||||
"csv",
|
"csv",
|
||||||
|
"dictgen",
|
||||||
"itertools 0.10.0",
|
"itertools 0.10.0",
|
||||||
"phf",
|
|
||||||
"phf_codegen",
|
|
||||||
"structopt",
|
"structopt",
|
||||||
"unicase",
|
"unicase",
|
||||||
]
|
]
|
||||||
|
@ -1646,7 +1563,7 @@ version = "0.8.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
|
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.2.3",
|
"getrandom",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1705,12 +1622,6 @@ dependencies = [
|
||||||
"winapi-util",
|
"winapi-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "wasi"
|
|
||||||
version = "0.9.0+wasi-snapshot-preview1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasi"
|
name = "wasi"
|
||||||
version = "0.10.2+wasi-snapshot-preview1"
|
version = "0.10.2+wasi-snapshot-preview1"
|
||||||
|
|
|
@ -15,6 +15,6 @@ azure-devops = { project = "crate-ci", pipeline = "typos" }
|
||||||
codecov = { repository = "crate-ci/typos" }
|
codecov = { repository = "crate-ci/typos" }
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
phf = { version = "0.8", features = ["unicase"] }
|
|
||||||
unicase = "2.5"
|
unicase = "2.5"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
|
dictgen = { version = "0.1", path = "../dictgen" }
|
||||||
|
|
|
@ -18,10 +18,9 @@ codecov = { repository = "crate-ci/typos" }
|
||||||
disable-release = true
|
disable-release = true
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
phf = { version = "0.8", features = ["unicase"] }
|
|
||||||
phf_codegen = "0.8"
|
|
||||||
csv = "1.1"
|
csv = "1.1"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
unicase = "2.5"
|
unicase = "2.5"
|
||||||
codegenrs = "1.0"
|
codegenrs = "1.0"
|
||||||
structopt = "0.3"
|
structopt = "0.3"
|
||||||
|
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||||
|
|
|
@ -11,17 +11,7 @@ fn generate<W: std::io::Write>(file: &mut W) {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
|
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
|
||||||
writeln!(file).unwrap();
|
writeln!(file).unwrap();
|
||||||
writeln!(file, "use unicase::UniCase;").unwrap();
|
|
||||||
|
|
||||||
let mut smallest = usize::MAX;
|
|
||||||
let mut largest = usize::MIN;
|
|
||||||
|
|
||||||
writeln!(
|
|
||||||
file,
|
|
||||||
"pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [&'static str]> = "
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let mut builder = phf_codegen::Map::new();
|
|
||||||
let records: Vec<_> = csv::ReaderBuilder::new()
|
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||||
.has_headers(false)
|
.has_headers(false)
|
||||||
.flexible(true)
|
.flexible(true)
|
||||||
|
@ -29,25 +19,20 @@ fn generate<W: std::io::Write>(file: &mut W) {
|
||||||
.records()
|
.records()
|
||||||
.map(|r| r.unwrap())
|
.map(|r| r.unwrap())
|
||||||
.collect();
|
.collect();
|
||||||
for record in &records {
|
dictgen::generate_trie(
|
||||||
let mut record_fields = record.iter();
|
|
||||||
let key = record_fields.next().unwrap();
|
|
||||||
smallest = std::cmp::min(smallest, key.len());
|
|
||||||
largest = std::cmp::max(largest, key.len());
|
|
||||||
let value = format!(
|
|
||||||
"&[{}]",
|
|
||||||
itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
|
|
||||||
);
|
|
||||||
builder.entry(unicase::UniCase::new(&record[0]), &value);
|
|
||||||
}
|
|
||||||
let codegenned = builder.build();
|
|
||||||
writeln!(file, "{}", codegenned).unwrap();
|
|
||||||
writeln!(file, ";").unwrap();
|
|
||||||
writeln!(file).unwrap();
|
|
||||||
writeln!(
|
|
||||||
file,
|
file,
|
||||||
"pub const WORD_RANGE: std::ops::RangeInclusive<usize> = {}..={};",
|
"WORD",
|
||||||
smallest, largest
|
"&'static [&'static str]",
|
||||||
|
records.iter().map(|record| {
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{}""#, field)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
64,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
22
src/dict.rs
22
src/dict.rs
|
@ -55,11 +55,7 @@ impl BuiltIn {
|
||||||
impl BuiltIn {
|
impl BuiltIn {
|
||||||
// Not using `Status` to avoid the allocations
|
// Not using `Status` to avoid the allocations
|
||||||
fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
|
fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
|
||||||
if typos_dict::WORD_RANGE.contains(&word.len()) {
|
typos_dict::WORD_TRIE.find(&word).copied()
|
||||||
map_lookup(&typos_dict::WORD_DICTIONARY, word)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -175,22 +171,6 @@ impl typos::Dictionary for BuiltIn {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn map_lookup<V: Clone>(
|
|
||||||
map: &'static phf::Map<UniCase<&'static str>, V>,
|
|
||||||
key: unicase::UniCase<&str>,
|
|
||||||
) -> Option<V> {
|
|
||||||
// This transmute should be safe as `get` will not store the reference with
|
|
||||||
// the expanded lifetime. This is due to `Borrow` being overly strict and
|
|
||||||
// can't have an impl for `&'static str` to `Borrow<&'a str>`.
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
|
|
||||||
unsafe {
|
|
||||||
let key = ::std::mem::transmute::<_, unicase::UniCase<&'static str>>(key);
|
|
||||||
map.get(&key).cloned()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn case_correct(correction: &mut Cow<'_, str>, case: Case) {
|
fn case_correct(correction: &mut Cow<'_, str>, case: Case) {
|
||||||
match case {
|
match case {
|
||||||
Case::Lower | Case::None => (),
|
Case::Lower | Case::None => (),
|
||||||
|
|
Loading…
Reference in a new issue