From 150c5bfdc18c43c38243f8673336642b03c91d68 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 10 Nov 2020 20:23:38 -0600 Subject: [PATCH 1/4] perf: Hash faster for custom dicts If we have to hash for the custom dict, we might as well be fast about it. We do not need a cryptographically secure algorithm since the content is fixed for the user. Master: ``` real 0m26.675s user 0m33.683s sys 0m4.535s ``` With ahash: ``` real 0m23.993s user 0m30.800s sys 0m4.440s ``` --- Cargo.lock | 26 ++++++++++++++++++++++++-- Cargo.toml | 1 + src/dict.rs | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 755731e..27f7a71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,15 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "ahash" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb6ec8807cd25b59e6b8100815afc73f54e294f1a425a2e555971969889a8f8" +dependencies = [ + "getrandom 0.2.0", + "lazy_static", +] + [[package]] name = "aho-corasick" version = "0.7.15" @@ -377,6 +387,17 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8025cf36f917e6a52cce185b7c7177689b838b7ec138364e50cc2277a56cf4" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "globset" version = "0.4.6" @@ -719,7 +740,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ - "getrandom", + "getrandom 0.1.15", "libc", "rand_chacha", "rand_core", @@ -743,7 +764,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ - "getrandom", + "getrandom 0.1.15", ] [[package]] @@ -1051,6 
+1072,7 @@ dependencies = [ name = "typos-cli" version = "0.1.4" dependencies = [ + "ahash", "anyhow", "assert_fs", "bstr", diff --git a/Cargo.toml b/Cargo.toml index 168dffb..0ffd1af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ toml = "0.5" log = "0.4" env_logger = "0.8" bstr = "0.2" +ahash = "0.5.8" [dev-dependencies] assert_fs = "1.0" diff --git a/src/dict.rs b/src/dict.rs index f2ced5f..f3178e5 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -144,8 +144,8 @@ fn case_correct(correction: &mut Cow<'_, str>, case: Case) { } pub struct Override<'i, 'w, D> { - identifiers: HashMap<&'i str, Status<'i>>, - words: HashMap, Status<'w>>, + identifiers: HashMap<&'i str, Status<'i>, ahash::RandomState>, + words: HashMap, Status<'w>, ahash::RandomState>, inner: D, } From 18e31fa578407abc1c42a47dfac2572cccfdf82e Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 10 Nov 2020 20:19:12 -0600 Subject: [PATCH 2/4] perf: Avoid hashing without custom dict `HashMap::get` (at least hashbrown) hashes before getting and doesn't check if dict is empty. For the custom dict, a common use case will have the dict be empty. Master: ``` real 0m26.675s user 0m33.683s sys 0m4.535s ``` Bypassing `HashMap::get` ``` real 0m16.415s user 0m14.519s sys 0m4.118s ``` On a moderately sized repo. 
--- src/dict.rs | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/dict.rs b/src/dict.rs index f3178e5..443a616 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -168,7 +168,7 @@ impl<'i, 'w, D: typos::Dictionary> Override<'i, 'w, D> { .collect(); } - pub fn interpret<'z, I: Iterator>( + fn interpret<'z, I: Iterator>( cases: I, ) -> impl Iterator)> { cases.map(|(typo, correction)| { @@ -186,19 +186,29 @@ impl<'i, 'w, D: typos::Dictionary> Override<'i, 'w, D> { impl<'i, 'w, D: typos::Dictionary> typos::Dictionary for Override<'i, 'w, D> { fn correct_ident<'s, 't>(&'s self, ident: typos::tokens::Identifier<'t>) -> Option> { - self.identifiers - .get(ident.token()) - .map(|c| c.borrow()) - .or_else(|| self.inner.correct_ident(ident)) + // Skip hashing if we can + if !self.identifiers.is_empty() { + self.identifiers + .get(ident.token()) + .map(|c| c.borrow()) + .or_else(|| self.inner.correct_ident(ident)) + } else { + None + } } fn correct_word<'s, 't>(&'s self, word: typos::tokens::Word<'t>) -> Option> { - let w = UniCase::new(word.token()); - // HACK: couldn't figure out the lifetime issue with replacing `cloned` with `borrow` - self.words - .get(&w) - .cloned() - .or_else(|| self.inner.correct_word(word)) + // Skip hashing if we can + if !self.words.is_empty() { + let w = UniCase::new(word.token()); + // HACK: couldn't figure out the lifetime issue with replacing `cloned` with `borrow` + self.words + .get(&w) + .cloned() + .or_else(|| self.inner.correct_word(word)) + } else { + None + } } } From beaa0f4091a71d09e5828b1e9091e297d28334c5 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 10 Nov 2020 20:45:57 -0600 Subject: [PATCH 3/4] perf(dict): Avoid hashing unknown words Bypass hashing when we know (through str::len) that a word won't be in the dict. 
Master: ``` real 0m26.675s user 0m33.683s sys 0m4.535s ``` With this change: ``` real 0m24.060s user 0m31.559s sys 0m4.258s ``` --- crates/typos-dict/codegen/src/main.rs | 8 ++++++++ crates/typos-dict/src/dict_codegen.rs | 3 +++ src/dict.rs | 8 +++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/crates/typos-dict/codegen/src/main.rs b/crates/typos-dict/codegen/src/main.rs index f40e7c4..af0b5b2 100644 --- a/crates/typos-dict/codegen/src/main.rs +++ b/crates/typos-dict/codegen/src/main.rs @@ -13,6 +13,9 @@ fn generate(file: &mut W) { writeln!(file).unwrap(); writeln!(file, "use unicase::UniCase;").unwrap(); + let mut smallest = usize::MAX; + let mut largest = usize::MIN; + writeln!( file, "pub static WORD_DICTIONARY: phf::Map, &'static str> = " @@ -26,12 +29,17 @@ fn generate(file: &mut W) { .map(|r| r.unwrap()) .collect(); for record in &records { + smallest = std::cmp::min(smallest, record[0].len()); + largest = std::cmp::max(largest, record[0].len()); let value = format!(r#""{}""#, &record[1]); builder.entry(unicase::UniCase::new(&record[0]), &value); } let codegenned = builder.build(); writeln!(file, "{}", codegenned).unwrap(); writeln!(file, ";").unwrap(); + writeln!(file).unwrap(); + writeln!(file, "pub const WORD_MIN: usize = {};", smallest).unwrap(); + writeln!(file, "pub const WORD_MAX: usize = {};", largest).unwrap(); } #[derive(Debug, StructOpt)] diff --git a/crates/typos-dict/src/dict_codegen.rs b/crates/typos-dict/src/dict_codegen.rs index 0fee245..13008b5 100644 --- a/crates/typos-dict/src/dict_codegen.rs +++ b/crates/typos-dict/src/dict_codegen.rs @@ -33648,3 +33648,6 @@ pub static WORD_DICTIONARY: phf::Map, &'static st (UniCase::ascii("presumpton"), "presumption"), ]), }; + +pub const WORD_MIN: usize = 3; +pub const WORD_MAX: usize = 19; diff --git a/src/dict.rs b/src/dict.rs index 443a616..504bec1 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -44,7 +44,13 @@ impl BuiltIn { // Not using `Status` to avoid the allocations fn 
correct_with_dict(&self, word: &str) -> Option<&'static str> { - map_lookup(&typos_dict::WORD_DICTIONARY, word) + const WORD_RANGE: std::ops::RangeInclusive = + typos_dict::WORD_MIN..=typos_dict::WORD_MAX; + if WORD_RANGE.contains(&word.len()) { + map_lookup(&typos_dict::WORD_DICTIONARY, word) + } else { + None + } } fn correct_with_vars(&self, word: &str) -> Option> { From 6bdbd821e38cbe279615b602d8c8acee973e1293 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 10 Nov 2020 20:50:10 -0600 Subject: [PATCH 4/4] perf(dict): Avoid hashing unknown words Bypass hashing when we know (through str::len) that a word won't be in the dict. Master: ``` real 0m26.675s user 0m33.683s sys 0m4.535s ``` With this change ``` real 0m24.432s user 0m32.492s sys 0m4.190s ``` --- crates/typos-vars/codegen/src/main.rs | 9 +++++++++ crates/typos-vars/src/vars_codegen.rs | 3 +++ src/dict.rs | 9 ++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/crates/typos-vars/codegen/src/main.rs b/crates/typos-vars/codegen/src/main.rs index c718ee8..981a85c 100644 --- a/crates/typos-vars/codegen/src/main.rs +++ b/crates/typos-vars/codegen/src/main.rs @@ -76,6 +76,9 @@ fn generate_variations(file: &mut W) { writeln!(file, "}}").unwrap(); writeln!(file).unwrap(); + let mut smallest = usize::MAX; + let mut largest = usize::MIN; + writeln!( file, "pub static VARS_DICTIONARY: phf::Map, &'static [(u8, &VariantsMap)]> = " @@ -92,11 +95,17 @@ fn generate_variations(file: &mut W) { referenced_symbols.extend(data.iter().map(|(s, _)| s)); let value = generate_link(&data); builder.entry(unicase::UniCase::new(word), &value); + smallest = std::cmp::min(smallest, word.len()); + largest = std::cmp::max(largest, word.len()); } let codegenned = builder.build(); writeln!(file, "{}", codegenned).unwrap(); writeln!(file, ";").unwrap(); + writeln!(file).unwrap(); + writeln!(file, "pub const WORD_MIN: usize = {};", smallest).unwrap(); + writeln!(file, "pub const WORD_MAX: usize = {};", largest).unwrap(); + 
for (symbol, entry) in entries.iter() { if !referenced_symbols.contains(symbol.as_str()) { continue; diff --git a/crates/typos-vars/src/vars_codegen.rs b/crates/typos-vars/src/vars_codegen.rs index 03ac102..e511b01 100644 --- a/crates/typos-vars/src/vars_codegen.rs +++ b/crates/typos-vars/src/vars_codegen.rs @@ -113081,6 +113081,9 @@ pub static VARS_DICTIONARY: phf::Map< ), ]), }; + +pub const WORD_MIN: usize = 2; +pub const WORD_MAX: usize = 24; pub(crate) static ENTRY_ABETTORS_7043394254318611656: VariantsMap = [&["abettors"], &["abetters"], &["abettors"], &["abetters"]]; diff --git a/src/dict.rs b/src/dict.rs index 504bec1..d0e0aa9 100644 --- a/src/dict.rs +++ b/src/dict.rs @@ -54,7 +54,14 @@ impl BuiltIn { } fn correct_with_vars(&self, word: &str) -> Option> { - map_lookup(&typos_vars::VARS_DICTIONARY, word).map(|variants| self.select_variant(variants)) + const WORD_RANGE: std::ops::RangeInclusive = + typos_vars::WORD_MIN..=typos_vars::WORD_MAX; + if WORD_RANGE.contains(&word.len()) { + map_lookup(&typos_vars::VARS_DICTIONARY, word) + .map(|variants| self.select_variant(variants)) + } else { + None + } } fn select_variant(