typos/src/dict.rs
Ed Page bbbf985777 perf(dict): Switch varcon to a burst-trie
This cuts varcon lookup times in half, but I still suspect it is slower than
phf.  As with bsearch, and unlike phf, the cost is consistent between hits
and misses.

At least this doesn't have the compile hit of PHF + unicase.  Maybe I
should experiment with integrating a non-const-fn variant of unicase
with PHF and give up on all of this extra complexity.
2021-06-30 21:03:57 -05:00

383 lines
12 KiB
Rust

use std::borrow::Cow;
use std::collections::HashMap;
use unicase::UniCase;
use typos::tokens::Case;
use typos::Status;
/// Dictionary backed by the data compiled into the binary: the misspelling
/// dictionary (behind the `dict` feature) and the varcon spelling variants
/// (behind the `vars` feature).
#[derive(Default)]
pub struct BuiltIn {
// Variant category to validate words against. When this is `None` the
// varcon lookups are skipped entirely (see `is_vars_enabled`).
locale: Option<varcon_core::Category>,
}
impl BuiltIn {
    /// Creates a dictionary that validates spelling variants against the
    /// category derived from `locale`.
    pub const fn new(locale: crate::config::Locale) -> Self {
        let locale = locale.category();
        Self { locale }
    }

    /// Identifiers are never corrected by the built-in data; always `None`.
    pub fn correct_ident<'s, 'w>(
        &'s self,
        _ident: typos::tokens::Identifier<'w>,
    ) -> Option<Status<'s>> {
        None
    }

    /// Checks a single word against the misspelling dictionary and, failing
    /// that, against the varcon variants.
    ///
    /// Returns `None` when the word has no case (`Case::None`) or when neither
    /// source knows the word. Any suggested corrections are re-cased to match
    /// the case of the word being checked.
    pub fn correct_word<'s, 'w>(
        &'s self,
        word_token: typos::tokens::Word<'w>,
    ) -> Option<Status<'s>> {
        let case = word_token.case();
        if case == typos::tokens::Case::None {
            return None;
        }

        let folded = unicase::UniCase::new(word_token.token());
        let mut status = match self.correct_with_dict(folded) {
            // A dictionary entry without suggestions marks a known-bad word.
            Some(found) if found.is_empty() => Status::Invalid,
            // Dictionary suggestions are re-validated for the active locale.
            Some(found) => self.chain_with_vars(found),
            // Unknown to the dictionary: fall back to the variants alone.
            None => self.correct_with_vars(folded)?,
        };

        status
            .corrections_mut()
            .for_each(|suggestion| case_correct(suggestion, case));
        Some(status)
    }
}
#[cfg(feature = "dict")]
impl BuiltIn {
    /// Looks `word` up in the compiled-in misspelling dictionary.
    ///
    /// The raw static slice is returned instead of a `Status` so that no
    /// allocation happens on this path.
    fn correct_with_dict(&self, word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
        // Only hit the map when the word's length is inside `WORD_RANGE`
        // (presumably the span of key lengths present in the dictionary).
        if !typos_dict::WORD_RANGE.contains(&word.len()) {
            return None;
        }
        map_lookup(&typos_dict::WORD_DICTIONARY, word)
    }
}
// Stand-in used when the `dict` feature is disabled, so callers can invoke
// `correct_with_dict` unconditionally.
#[cfg(not(feature = "dict"))]
impl BuiltIn {
/// Without the `dict` feature there is nothing to look up: always `None`.
fn correct_with_dict(&self, _word: unicase::UniCase<&str>) -> Option<&'static [&'static str]> {
None
}
}
#[cfg(feature = "vars")]
impl BuiltIn {
    /// Passes every dictionary correction through the varcon lookup so the
    /// final suggestions agree with the configured locale.
    ///
    /// A correction that varcon does not know, or that is already valid, is
    /// kept unchanged; otherwise it is replaced by varcon's own suggestions.
    /// When variants are disabled the corrections are returned as-is.
    fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Status<'static> {
        if !self.is_vars_enabled() {
            return Status::Corrections(corrections.iter().copied().map(Cow::Borrowed).collect());
        }

        let mut chained: Vec<Cow<'static, str>> = Vec::new();
        for candidate in corrections {
            match self.correct_with_vars(unicase::UniCase::new(*candidate)) {
                Some(Status::Valid) | None => chained.push(Cow::Borrowed(*candidate)),
                Some(Status::Corrections(vars)) => chained.extend(vars),
                Some(Status::Invalid) => {
                    unreachable!("correct_with_vars should always have valid suggestions")
                }
            }
        }

        // A single suggestion is trivially sorted and unique.
        if chained.len() != 1 {
            chained.sort_unstable();
            chained.dedup();
        }
        debug_assert!(!chained.is_empty());
        Status::Corrections(chained)
    }

    /// Looks `word` up in the varcon trie and judges it against the locale.
    ///
    /// Returns `None` when variants are disabled or the word is not in the trie.
    fn correct_with_vars(&self, word: unicase::UniCase<&str>) -> Option<Status<'static>> {
        if !self.is_vars_enabled() {
            return None;
        }
        let variants = typos_vars::VARS_TRIE.find(&word)?;
        Some(self.select_variant(variants))
    }

    /// Variants are only consulted when a locale category is configured.
    fn is_vars_enabled(&self) -> bool {
        #![allow(clippy::assertions_on_constants)]
        debug_assert!(typos_vars::NO_INVALID);
        self.locale.is_some()
    }

    /// Turns the trie entry for a word into a `Status` for the active locale.
    ///
    /// Only the first `(category bits, variants map)` pair of `vars` is
    /// examined.
    fn select_variant(
        &self,
        vars: &'static [(u8, &'static typos_vars::VariantsMap)],
    ) -> Status<'static> {
        let (bits, variants) = vars[0];
        // SAFETY: code-genned from a checked category-set, so known to be safe.
        let var_categories = unsafe { typos_vars::CategorySet::from_bits_unchecked(bits) };

        match self.locale {
            // Already valid for the current locale.
            Some(locale) if var_categories.contains(locale) => Status::Valid,
            Some(locale) => Status::Corrections(
                typos_vars::corrections(locale, *variants)
                    .iter()
                    .copied()
                    .map(Cow::Borrowed)
                    .collect(),
            ),
            // All locales are valid, but the word itself is never valid:
            // offer every variant, sorted and de-duplicated.
            None if var_categories.is_empty() => {
                let mut unique: Vec<_> = variants
                    .iter()
                    .flat_map(|v| v.iter())
                    .copied()
                    .map(Cow::Borrowed)
                    .collect();
                unique.sort_unstable();
                unique.dedup();
                Status::Corrections(unique)
            }
            None => Status::Valid,
        }
    }
}
#[cfg(not(feature = "vars"))]
impl BuiltIn {
    /// Without the `vars` feature the dictionary corrections are passed
    /// through untouched, wrapped as borrowed suggestions.
    fn chain_with_vars(&self, corrections: &'static [&'static str]) -> Status<'static> {
        let suggestions = corrections.iter().copied().map(Cow::Borrowed).collect();
        Status::Corrections(suggestions)
    }

    /// Without the `vars` feature there are no variants to consult: always
    /// `None`.
    fn correct_with_vars(&self, _word: unicase::UniCase<&str>) -> Option<Status<'static>> {
        None
    }
}
/// Exposes `BuiltIn` through the `typos::Dictionary` trait by forwarding each
/// trait method to the inherent method of the same name.
impl typos::Dictionary for BuiltIn {
fn correct_ident<'s, 'w>(&'s self, ident: typos::tokens::Identifier<'w>) -> Option<Status<'s>> {
// The explicit `BuiltIn::` path selects the inherent method, not this
// trait method.
BuiltIn::correct_ident(self, ident)
}
fn correct_word<'s, 'w>(&'s self, word: typos::tokens::Word<'w>) -> Option<Status<'s>> {
// Same forwarding as above, for word-level checks.
BuiltIn::correct_word(self, word)
}
}
/// Case-insensitive lookup of a short-lived `key` in a static `phf` map,
/// returning a clone of the stored value.
fn map_lookup<V: Clone>(
    map: &'static phf::Map<UniCase<&'static str>, V>,
    key: unicase::UniCase<&str>,
) -> Option<V> {
    // SAFETY: this transmute only widens the key's lifetime to `'static`. It
    // should be safe because `get` will not store the reference with the
    // expanded lifetime. The workaround is needed because `Borrow` is overly
    // strict and can't have an impl for `&'static str` to `Borrow<&'a str>`.
    //
    // See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
    let static_key =
        unsafe { ::std::mem::transmute::<_, unicase::UniCase<&'static str>>(key) };
    map.get(&static_key).cloned()
}
/// Re-cases `correction` in place so it matches the case of the word it is
/// meant to replace.
///
/// * `Case::Lower` / `Case::None`: the correction is left untouched.
/// * `Case::Title`: the first character is ASCII-uppercased.
/// * `Case::Upper`: every ASCII letter is uppercased.
///
/// For `Title` and `Upper` a borrowed correction is converted to an owned
/// one. Only ASCII letters are affected; other characters are left as-is.
/// Never panics: an empty correction, or one that starts with a multi-byte
/// character, is simply left unchanged by `Case::Title`.
fn case_correct(correction: &mut Cow<'_, str>, case: Case) {
    match case {
        Case::Lower | Case::None => (),
        Case::Title => {
            // `get_mut(0..1)` yields `None` for an empty string and when byte 1
            // is not a char boundary (i.e. the first char is not ASCII), where
            // plain `[0..1]` indexing would panic.
            if let Some(first) = correction.to_mut().get_mut(0..1) {
                first.make_ascii_uppercase();
            }
        }
        Case::Upper => correction.to_mut().make_ascii_uppercase(),
    }
}
/// Dictionary that layers caller-supplied identifier and word overrides on
/// top of another dictionary, `inner`.
pub struct Override<'i, 'w, D> {
// Identifier overrides, keyed by the exact identifier text.
identifiers: HashMap<&'i str, Status<'i>, ahash::RandomState>,
// Word overrides, keyed through `UniCase` rather than by the raw string.
words: HashMap<unicase::UniCase<&'w str>, Status<'w>, ahash::RandomState>,
// Dictionary being wrapped.
inner: D,
}
impl<'i, 'w, D: typos::Dictionary> Override<'i, 'w, D> {
    /// Wraps `inner` with initially empty override tables.
    pub fn new(inner: D) -> Self {
        let identifiers = Default::default();
        let words = Default::default();
        Self {
            identifiers,
            words,
            inner,
        }
    }

    /// Replaces the identifier overrides with the given `(typo, correction)`
    /// pairs.
    pub fn identifiers<I: Iterator<Item = (&'i str, &'i str)>>(&mut self, identifiers: I) {
        let table = Self::interpret(identifiers).collect();
        self.identifiers = table;
    }

    /// Replaces the word overrides with the given `(typo, correction)` pairs;
    /// keys are wrapped in `UniCase`.
    pub fn words<I: Iterator<Item = (&'w str, &'w str)>>(&mut self, words: I) {
        let table = Self::interpret(words)
            .map(|(typo, status)| (UniCase::new(typo), status))
            .collect();
        self.words = table;
    }

    /// Converts `(typo, correction)` pairs into `(typo, Status)` pairs:
    ///
    /// * correction equal to the typo: the word is `Valid`;
    /// * empty correction: the word is `Invalid` with no suggestion;
    /// * anything else: a single suggested correction.
    fn interpret<'z, I: Iterator<Item = (&'z str, &'z str)>>(
        cases: I,
    ) -> impl Iterator<Item = (&'z str, Status<'z>)> {
        cases.map(|(typo, correction)| {
            let status = match correction {
                same if same == typo => Status::Valid,
                "" => Status::Invalid,
                fix => Status::Corrections(vec![Cow::Borrowed(fix)]),
            };
            (typo, status)
        })
    }
}
/// Consults the override tables first and defers to the wrapped dictionary
/// for anything they do not cover.
impl<'i, 'w, D: typos::Dictionary> typos::Dictionary for Override<'i, 'w, D> {
    /// Returns the override for `ident` if one exists, otherwise whatever the
    /// wrapped dictionary reports.
    fn correct_ident<'s, 't>(&'s self, ident: typos::tokens::Identifier<'t>) -> Option<Status<'s>> {
        // Skip hashing if we can
        if !self.identifiers.is_empty() {
            self.identifiers
                .get(ident.token())
                .map(|c| c.borrow())
                .or_else(|| self.inner.correct_ident(ident))
        } else {
            // No overrides configured: stay transparent and defer to the
            // wrapped dictionary, mirroring `correct_word`.
            self.inner.correct_ident(ident)
        }
    }

    /// Returns the override for `word` if one exists, otherwise whatever the
    /// wrapped dictionary reports. Words with `Case::None` are never checked.
    fn correct_word<'s, 't>(&'s self, word: typos::tokens::Word<'t>) -> Option<Status<'s>> {
        if word.case() == typos::tokens::Case::None {
            return None;
        }
        // Skip hashing if we can
        let custom = if !self.words.is_empty() {
            let w = UniCase::new(word.token());
            // HACK: couldn't figure out the lifetime issue with replacing `cloned` with `borrow`
            self.words.get(&w).cloned()
        } else {
            None
        };
        custom.or_else(|| self.inner.correct_word(word))
    }
}
// Unit tests for the built-in dictionary lookups and for `case_correct`.
#[cfg(test)]
mod test {
use super::*;
// A misspelling known to the dictionary is corrected under the default locale.
#[cfg(feature = "dict")]
#[test]
fn test_dict_correct() {
let dict = BuiltIn::new(crate::config::Locale::default());
let correction = dict.correct_word(typos::tokens::Word::new_unchecked(
"finallizes",
typos::tokens::Case::Lower,
0,
));
assert_eq!(
correction,
Some(Status::Corrections(vec!["finalizes".into()]))
);
}
// With `Locale::En` the `-ize` spelling gets no verdict at all.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_no_locale() {
let dict = BuiltIn::new(crate::config::Locale::En);
let correction = dict.correct_word(typos::tokens::Word::new_unchecked(
"finalizes",
typos::tokens::Case::Lower,
0,
));
assert_eq!(correction, None);
}
// With `Locale::EnUs` the `-ize` spelling is reported as valid.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_same_locale() {
let dict = BuiltIn::new(crate::config::Locale::EnUs);
let correction = dict.correct_word(typos::tokens::Word::new_unchecked(
"finalizes",
typos::tokens::Case::Lower,
0,
));
assert_eq!(correction, Some(Status::Valid));
}
// With `Locale::EnGb` the `-ize` spelling is corrected to the `-ise` one.
#[cfg(feature = "vars")]
#[test]
fn test_varcon_different_locale() {
let dict = BuiltIn::new(crate::config::Locale::EnGb);
let correction = dict.correct_word(typos::tokens::Word::new_unchecked(
"finalizes",
typos::tokens::Case::Lower,
0,
));
assert_eq!(
correction,
Some(Status::Corrections(vec!["finalises".into()]))
);
}
// A dictionary correction is chained through varcon, so the suggestion for
// the misspelling uses the `EnGb` spelling.
#[cfg(all(feature = "dict", feature = "vars"))]
#[test]
fn test_dict_to_varcon() {
let dict = BuiltIn::new(crate::config::Locale::EnGb);
let correction = dict.correct_word(typos::tokens::Word::new_unchecked(
"finallizes",
typos::tokens::Case::Lower,
0,
));
assert_eq!(
correction,
Some(Status::Corrections(vec!["finalises".into()]))
);
}
// `case_correct` must give the same result for borrowed and owned input.
#[test]
fn test_case_correct() {
// (correction, case of the original word, expected re-cased correction)
let cases = [
("foo", Case::Lower, "foo"),
("foo", Case::None, "foo"),
("foo", Case::Title, "Foo"),
("foo", Case::Upper, "FOO"),
("fOo", Case::None, "fOo"),
];
for (correction, case, expected) in cases.iter() {
let mut actual = Cow::Borrowed(*correction);
case_correct(&mut actual, *case);
assert_eq!(*expected, actual);
let mut actual = Cow::Owned(String::from(*correction));
case_correct(&mut actual, *case);
assert_eq!(*expected, actual);
}
}
}