mirror of https://github.com/crate-ci/typos.git (synced 2025-01-25 16:09:03 -05:00)

Merge pull request #1194 from epage/perf
fix(dictgen)!: Generate phf for map feature
commit b87cd87116
9 changed files with 180646 additions and 138185 deletions
Cargo.lock (generated), 12 lines changed

@@ -450,6 +450,7 @@ name = "dictgen"
version = "0.2.11"
dependencies = [
 "phf",
 "phf_codegen",
 "phf_shared",
 "unicase",
]

@@ -884,6 +885,16 @@ dependencies = [
 "phf_shared",
]

[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
 "phf_generator",
 "phf_shared",
]

[[package]]
name = "phf_generator"
version = "0.11.2"

@@ -1500,6 +1511,7 @@ dependencies = [
 "edit-distance",
 "indexmap",
 "itertools 0.13.0",
 "phf",
 "snapbox",
 "unicase",
 "varcon",
crates/dictgen/Cargo.toml

@@ -17,13 +17,14 @@ rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
[features]
default = ["std"]
std = []
codegen = ["std"]
codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"]

[dependencies]
unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true }

[lints]
workspace = true
crates/dictgen/src/insensitive.rs (new file, 103 lines)

@@ -0,0 +1,103 @@
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
    Unicode(&'s str),
    Ascii(&'s str),
}

impl<'s> InsensitiveStr<'s> {
    pub fn convert(self) -> unicase::UniCase<&'s str> {
        match self {
            InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
            InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
        }
    }

    pub fn into_inner(self) -> &'s str {
        match self {
            InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
        }
    }

    pub fn is_empty(self) -> bool {
        match self {
            InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.is_empty(),
        }
    }

    pub fn len(self) -> usize {
        match self {
            InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.len(),
        }
    }
}

impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
    fn from(other: unicase::UniCase<&'s str>) -> Self {
        if other.is_ascii() {
            InsensitiveStr::Ascii(other.into_inner())
        } else {
            InsensitiveStr::Unicode(other.into_inner())
        }
    }
}

impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
    #[inline]
    fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
        self.convert() == other.convert()
    }
}

impl Eq for InsensitiveStr<'_> {}

impl core::hash::Hash for InsensitiveStr<'_> {
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        self.convert().hash(hasher);
    }
}

impl core::fmt::Debug for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Debug::fmt(self.into_inner(), fmt)
    }
}

impl core::fmt::Display for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(self.into_inner(), fmt)
    }
}

#[cfg(feature = "map")]
impl phf_shared::PhfHash for InsensitiveStr<'_> {
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        core::hash::Hash::hash(self, state);
    }
}

#[cfg(feature = "map")]
impl phf_shared::FmtConst for InsensitiveStr<'_> {
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
            InsensitiveStr::Unicode(_) => {
                f.write_str("dictgen::InsensitiveStr::Unicode(")?;
            }
        }

        self.into_inner().fmt_const(f)?;
        f.write_str(")")
    }
}

#[cfg(feature = "map")]
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a> {
    fn borrow(&self) -> &InsensitiveStr<'b> {
        self
    }
}
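A minimal usage sketch of the new type, based only on the API shown above; the demo function and literal values are illustrative, not part of the crate (it assumes the dictgen and unicase crates as dependencies):

// Illustrative only: exercising the InsensitiveStr API introduced above.
fn insensitive_str_demo() {
    let stored = dictgen::InsensitiveStr::Ascii("Teh");
    let query: dictgen::InsensitiveStr<'_> = unicase::UniCase::new("teh").into();
    assert_eq!(stored, query);              // comparison folds case via UniCase
    assert_eq!(stored.into_inner(), "Teh"); // the original spelling is preserved
    assert_eq!(stored.len(), 3);
    assert!(!stored.is_empty());
    let _folded: unicase::UniCase<&str> = query.convert();
}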
crates/dictgen/src/lib.rs

@@ -4,6 +4,7 @@
#[cfg(feature = "codegen")]
mod gen;
mod insensitive;
#[cfg(feature = "map")]
mod map;
mod table;

@@ -11,6 +12,7 @@ mod trie;
#[cfg(feature = "codegen")]
pub use gen::*;
pub use insensitive::*;
#[cfg(feature = "map")]
pub use map::*;
pub use table::*;
crates/dictgen/src/map.rs

@@ -18,33 +18,35 @@ impl DictMapGen<'_> {
        let mut smallest = usize::MAX;
        let mut largest = usize::MIN;

        writeln!(
            file,
            "pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
        )?;
        writeln!(file, " keys: &[")?;
        for (key, _value) in data.iter() {
        let mut builder = phf_codegen::Map::new();
        let data = data
            .iter()
            .map(|(key, value)| {
                (
                    if key.is_ascii() {
                        crate::InsensitiveStr::Ascii(key)
                    } else {
                        crate::InsensitiveStr::Unicode(key)
                    },
                    value.to_string(),
                )
            })
            .collect::<Vec<_>>();
        for (key, value) in data.iter() {
            smallest = std::cmp::min(smallest, key.len());
            largest = std::cmp::max(largest, key.len());

            let key = if key.is_ascii() {
                format!("dictgen::InsensitiveStr::Ascii({key:?})")
            } else {
                format!("dictgen::InsensitiveStr::Unicode({key:?})")
            };

            writeln!(file, " {key},")?;
            builder.entry(key, value.as_str());
        }
        let builder = builder.build();
        if largest == 0 {
            smallest = 0;
        }
        writeln!(file, " ],")?;
        writeln!(file, " values: &[")?;
        for (_key, value) in data.iter() {
            writeln!(file, " {value},")?;
        }
        writeln!(file, " ],")?;

        writeln!(
            file,
            "pub static {name}: dictgen::DictMap<{value_type}> = dictgen::DictMap {{"
        )?;
        writeln!(file, " map: {builder},")?;
        writeln!(file, " range: {smallest}..={largest},")?;
        writeln!(file, "}};")?;
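This hunk is the heart of the change: instead of emitting a dictgen::DictTable literal with parallel keys/values slices, DictMapGen now feeds each InsensitiveStr key into a phf_codegen::Map builder and splices the builder's Display output into the generated dictgen::DictMap. A self-contained sketch of that phf_codegen pattern, using plain string keys and made-up names rather than anything from this PR:

use std::io::Write as _;

// Illustrative sketch: how phf_codegen's builder turns entries into Rust
// source for a compile-time perfect-hash map.
fn emit_corrections(out: &mut impl std::io::Write) -> std::io::Result<()> {
    let mut builder = phf_codegen::Map::new();
    // The value string is written into the generated file verbatim, so it has
    // to be valid Rust source, hence the escaped quotes.
    builder.entry("teh", "\"the\"");
    builder.entry("recieve", "\"receive\"");
    writeln!(
        out,
        "static CORRECTIONS: phf::Map<&'static str, &'static str> = {};",
        builder.build()
    )
}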
@@ -58,6 +60,7 @@ pub struct DictMap<V: 'static> {
}

impl<V> DictMap<V> {
    #[inline]
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
        if self.range.contains(&word.len()) {
            self.map.get(&(*word).into())
@@ -65,35 +68,4 @@ impl<V> DictMap<V> {
            None
        }
    }

    pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&str>, &V)> + '_ {
        self.map.entries().map(|(k, v)| (k.convert(), v))
    }
}

impl phf_shared::PhfHash for crate::InsensitiveStr<'_> {
    #[inline]
    fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
        core::hash::Hash::hash(self, state);
    }
}

impl phf_shared::FmtConst for crate::InsensitiveStr<'_> {
    fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            crate::InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?,
            crate::InsensitiveStr::Unicode(_) => {
                f.write_str("dictgen::InsensitiveStr::Unicode(")?;
            }
        }

        self.into_inner().fmt_const(f)?;
        f.write_str(")")
    }
}

impl<'b, 'a: 'b> phf_shared::PhfBorrow<crate::InsensitiveStr<'b>> for crate::InsensitiveStr<'a> {
    fn borrow(&self) -> &crate::InsensitiveStr<'b> {
        self
    }
}
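For callers the lookup surface is unchanged: find still takes a unicase::UniCase<&str>, and the (*word).into() above routes it through InsensitiveStr's From<UniCase> impl before querying the phf map. An illustrative helper, not part of the crate (it assumes dictgen is built with the map feature):

// Illustrative only: look a word up in any generated DictMap.
fn lookup<'d, V>(dict: &'d dictgen::DictMap<V>, word: &str) -> Option<&'d V> {
    dict.find(&unicase::UniCase::new(word))
}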
crates/dictgen/src/table.rs

@@ -53,12 +53,13 @@ impl DictTableGen<'_> {
}

pub struct DictTable<V: 'static> {
    pub keys: &'static [InsensitiveStr<'static>],
    pub keys: &'static [crate::InsensitiveStr<'static>],
    pub values: &'static [V],
    pub range: core::ops::RangeInclusive<usize>,
}

impl<V> DictTable<V> {
    #[inline]
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        if self.range.contains(&word.len()) {
            self.keys
@@ -69,70 +70,4 @@ impl<V> DictTable<V> {
            None
        }
    }

    pub fn iter(&self) -> impl Iterator<Item = (unicase::UniCase<&'static str>, &'static V)> + '_ {
        (0..self.keys.len()).map(move |i| (self.keys[i].convert(), &self.values[i]))
    }
}

/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
#[derive(Copy, Clone)]
pub enum InsensitiveStr<'s> {
    Unicode(&'s str),
    Ascii(&'s str),
}

impl<'s> InsensitiveStr<'s> {
    pub fn convert(self) -> unicase::UniCase<&'s str> {
        match self {
            InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s),
            InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s),
        }
    }

    pub fn into_inner(self) -> &'s str {
        match self {
            InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s,
        }
    }
}

impl<'s> From<unicase::UniCase<&'s str>> for InsensitiveStr<'s> {
    fn from(other: unicase::UniCase<&'s str>) -> Self {
        if other.is_ascii() {
            InsensitiveStr::Ascii(other.into_inner())
        } else {
            InsensitiveStr::Unicode(other.into_inner())
        }
    }
}

impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
    #[inline]
    fn eq(&self, other: &InsensitiveStr<'s2>) -> bool {
        self.convert() == other.convert()
    }
}

impl Eq for InsensitiveStr<'_> {}

impl core::hash::Hash for InsensitiveStr<'_> {
    #[inline]
    fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
        self.convert().hash(hasher);
    }
}

impl core::fmt::Debug for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Debug::fmt(self.into_inner(), fmt)
    }
}

impl core::fmt::Display for InsensitiveStr<'_> {
    #[inline]
    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Display::fmt(self.into_inner(), fmt)
    }
}
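Because DictTable and InsensitiveStr keep public fields and plain enum constructors, a table value can be written by hand; the generated files are just much larger versions of this shape. Everything below (the static name, entries, and range) is invented for illustration and assumes the dictgen and unicase crates:

// Hand-written, illustrative DictTable; real tables are emitted by
// DictTableGen into a generated source file.
static EXAMPLE_TABLE: dictgen::DictTable<&'static str> = dictgen::DictTable {
    keys: &[
        dictgen::InsensitiveStr::Ascii("recieve"),
        dictgen::InsensitiveStr::Ascii("teh"),
    ],
    values: &["receive", "the"],
    range: 3..=7,
};

fn table_demo() -> Option<&'static &'static str> {
    // Case-insensitive lookup through the same UniCase-based entry point.
    EXAMPLE_TABLE.find(&unicase::UniCase::new("Teh"))
}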
crates/dictgen/src/trie.rs

@@ -32,48 +32,50 @@ pub struct DictTrie<V: 'static> {
}

impl<V> DictTrie<V> {
    #[inline]
    pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
        if self.range.contains(&word.len()) {
            let bytes = word.as_bytes();
        if word.is_ascii() {
            if self.range.contains(&word.len()) {
                self.find_ascii(word.as_bytes())
            } else {
                None
            }
        } else {
            self.unicode.find(word)
        }
    }

            let mut child = &self.root;
            for i in 0..bytes.len() {
                match child.children {
                    DictTrieChild::Nested(n) => {
                        let byte = bytes[i];
                        let index = if byte.is_ascii_lowercase() {
                            byte - b'a'
                        } else if byte.is_ascii_uppercase() {
                            byte - b'A'
                        } else {
                            return self.unicode.find(word);
                        };
                        debug_assert!(index < 26);
                        if let Some(next) = n[index as usize].as_ref() {
                            child = next;
                        } else {
                            return None;
                        }
                    }
                    DictTrieChild::Flat(t) => {
                        let remaining = &bytes[i..bytes.len()];
                        // Unsafe: Everything before has been proven to be ASCII, so this should be
                        // safe.
                        let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
                        // Reuse the prior ascii check, rather than doing it again
                        let remaining = if word.is_ascii() {
                            unicase::UniCase::ascii(remaining)
                        } else {
                            unicase::UniCase::unicode(remaining)
                        };
                        return t.find(&remaining);
    fn find_ascii(&self, word: &[u8]) -> Option<&'static V> {
        let mut child = &self.root;
        for i in 0..word.len() {
            match child.children {
                DictTrieChild::Nested(n) => {
                    let byte = word[i];
                    let index = if byte.is_ascii_lowercase() {
                        byte - b'a'
                    } else if byte.is_ascii_uppercase() {
                        byte - b'A'
                    } else {
                        return None;
                    };
                    debug_assert!(index < 26);
                    if let Some(next) = n[index as usize].as_ref() {
                        child = next;
                    } else {
                        return None;
                    }
                }
                DictTrieChild::Flat(t) => {
                    let remaining = &word[i..word.len()];
                    // Unsafe: Everything before has been proven to be ASCII, so this should be
                    // safe.
                    let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
                    let remaining = unicase::UniCase::ascii(remaining);
                    return t.find(&remaining);
                }
            }
            child.value.as_ref()
        } else {
            None
        }
        child.value.as_ref()
    }
}
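The trie lookup is reorganized: find now sends non-ASCII words straight to the fallback unicode table and hands ASCII words to the new find_ascii, so the per-byte walk no longer has to re-handle the mixed case. The shared per-byte indexing, restated as a stand-alone sketch with an invented helper name:

// Stand-alone restatement of the byte indexing used in the trie walk above:
// ASCII letters fold to 0..26, anything else means "not in the ASCII trie".
fn letter_index(byte: u8) -> Option<usize> {
    if byte.is_ascii_lowercase() {
        Some(usize::from(byte - b'a'))
    } else if byte.is_ascii_uppercase() {
        Some(usize::from(byte - b'A'))
    } else {
        None
    }
}

fn letter_index_demo() {
    assert_eq!(letter_index(b'a'), Some(0));
    assert_eq!(letter_index(b'T'), Some(19));
    assert_eq!(letter_index(b'-'), None);
}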
crates/typos-dict/Cargo.toml

@@ -29,6 +29,7 @@ varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5"
indexmap = "2.2.6"
divan = "0.1.16"
phf = "0.11.2"

[lints]
workspace = true
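With divan and phf in the dependency list above, lookups against the regenerated dictionary can be benchmarked. A minimal divan skeleton; the benchmarked body is a stand-in, since the real targets would be the generated dictionary statics (names and workload are illustrative):

// Minimal divan benchmark skeleton (illustrative only).
fn main() {
    divan::main();
}

#[divan::bench]
fn unicase_compare() -> bool {
    // Stand-in workload: a single case-insensitive comparison.
    unicase::UniCase::new("Recieve") == unicase::UniCase::new("recieve")
}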
File diff suppressed because it is too large