mirror of
https://github.com/crate-ci/typos.git
synced 2025-01-11 09:11:39 -05:00
Merge pull request #1198 from epage/generic
perf(dict)!: Switch to PHF Map
This commit is contained in:
commit
44cf2f8cf6
20 changed files with 909056 additions and 704967 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1509,6 +1509,7 @@ dependencies = [
|
||||||
"dictgen",
|
"dictgen",
|
||||||
"divan",
|
"divan",
|
||||||
"edit-distance",
|
"edit-distance",
|
||||||
|
"heck",
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"itertools 0.13.0",
|
"itertools 0.13.0",
|
||||||
"phf",
|
"phf",
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
// This file is @generated crates/codespell-dict/tests/codegen.rs
|
// This file is @generated crates/codespell-dict/tests/codegen.rs
|
||||||
|
|
||||||
pub static WORD_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
pub static WORD_DICTIONARY: dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, &[&str]> =
|
||||||
|
dictgen::OrderedMap {
|
||||||
keys: &[
|
keys: &[
|
||||||
dictgen::InsensitiveStr::Ascii("1nd"),
|
dictgen::InsensitiveStr::Ascii("1nd"),
|
||||||
dictgen::InsensitiveStr::Ascii("2rd"),
|
dictgen::InsensitiveStr::Ascii("2rd"),
|
||||||
|
@ -84256,4 +84257,4 @@ pub static WORD_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
||||||
&["contains"],
|
&["contains"],
|
||||||
],
|
],
|
||||||
range: 2..=34,
|
range: 2..=34,
|
||||||
};
|
};
|
||||||
|
|
|
@ -36,11 +36,19 @@ impl<'g> DictGen<'g> {
|
||||||
|
|
||||||
#[cfg(feature = "map")]
|
#[cfg(feature = "map")]
|
||||||
pub fn map(self) -> crate::MapGen<'g> {
|
pub fn map(self) -> crate::MapGen<'g> {
|
||||||
crate::MapGen { gen: self }
|
crate::MapGen {
|
||||||
|
gen: self,
|
||||||
|
unicode: true,
|
||||||
|
unicase: true,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ordered_map(self) -> crate::OrderedMapGen<'g> {
|
pub fn ordered_map(self) -> crate::OrderedMapGen<'g> {
|
||||||
crate::OrderedMapGen { gen: self }
|
crate::OrderedMapGen {
|
||||||
|
gen: self,
|
||||||
|
unicode: true,
|
||||||
|
unicase: true,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn trie(self) -> crate::TrieGen<'g> {
|
pub fn trie(self) -> crate::TrieGen<'g> {
|
||||||
|
@ -49,6 +57,10 @@ impl<'g> DictGen<'g> {
|
||||||
limit: 64,
|
limit: 64,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn r#match(self) -> crate::MatchGen<'g> {
|
||||||
|
crate::MatchGen { gen: self }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for DictGen<'static> {
|
impl Default for DictGen<'static> {
|
||||||
|
|
|
@ -51,6 +51,18 @@ impl<'s2> PartialEq<InsensitiveStr<'s2>> for InsensitiveStr<'_> {
|
||||||
|
|
||||||
impl Eq for InsensitiveStr<'_> {}
|
impl Eq for InsensitiveStr<'_> {}
|
||||||
|
|
||||||
|
impl PartialOrd for InsensitiveStr<'_> {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ord for InsensitiveStr<'_> {
|
||||||
|
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||||
|
self.convert().cmp(&other.convert())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl core::hash::Hash for InsensitiveStr<'_> {
|
impl core::hash::Hash for InsensitiveStr<'_> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
|
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
|
||||||
|
@ -101,3 +113,97 @@ impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveStr<'b>> for InsensitiveStr<'a
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `UniCase` look-alike that avoids const-fn so large tables don't OOM
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
pub struct InsensitiveAscii<'s>(pub &'s str);
|
||||||
|
|
||||||
|
impl<'s> InsensitiveAscii<'s> {
|
||||||
|
pub fn convert(self) -> unicase::Ascii<&'s str> {
|
||||||
|
unicase::Ascii::new(self.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn into_inner(self) -> &'s str {
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(self) -> bool {
|
||||||
|
self.0.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(self) -> usize {
|
||||||
|
self.0.len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'s> From<unicase::Ascii<&'s str>> for InsensitiveAscii<'s> {
|
||||||
|
fn from(other: unicase::Ascii<&'s str>) -> Self {
|
||||||
|
Self(other.into_inner())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'s2> PartialEq<InsensitiveAscii<'s2>> for InsensitiveAscii<'_> {
|
||||||
|
#[inline]
|
||||||
|
fn eq(&self, other: &InsensitiveAscii<'s2>) -> bool {
|
||||||
|
self.convert() == other.convert()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Eq for InsensitiveAscii<'_> {}
|
||||||
|
|
||||||
|
impl PartialOrd for InsensitiveAscii<'_> {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ord for InsensitiveAscii<'_> {
|
||||||
|
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||||
|
self.convert().cmp(&other.convert())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl core::hash::Hash for InsensitiveAscii<'_> {
|
||||||
|
#[inline]
|
||||||
|
fn hash<H: core::hash::Hasher>(&self, hasher: &mut H) {
|
||||||
|
self.convert().hash(hasher);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl core::fmt::Debug for InsensitiveAscii<'_> {
|
||||||
|
#[inline]
|
||||||
|
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||||
|
core::fmt::Debug::fmt(self.into_inner(), fmt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl core::fmt::Display for InsensitiveAscii<'_> {
|
||||||
|
#[inline]
|
||||||
|
fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||||
|
core::fmt::Display::fmt(self.into_inner(), fmt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "map")]
|
||||||
|
impl phf_shared::PhfHash for InsensitiveAscii<'_> {
|
||||||
|
#[inline]
|
||||||
|
fn phf_hash<H: core::hash::Hasher>(&self, state: &mut H) {
|
||||||
|
core::hash::Hash::hash(self, state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "map")]
|
||||||
|
impl phf_shared::FmtConst for InsensitiveAscii<'_> {
|
||||||
|
fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||||
|
f.write_str("dictgen::InsensitiveAscii(")?;
|
||||||
|
self.into_inner().fmt_const(f)?;
|
||||||
|
f.write_str(")")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "map")]
|
||||||
|
impl<'b, 'a: 'b> phf_shared::PhfBorrow<InsensitiveAscii<'b>> for InsensitiveAscii<'a> {
|
||||||
|
fn borrow(&self) -> &InsensitiveAscii<'b> {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -7,6 +7,8 @@ mod gen;
|
||||||
mod insensitive;
|
mod insensitive;
|
||||||
#[cfg(feature = "map")]
|
#[cfg(feature = "map")]
|
||||||
mod map;
|
mod map;
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
|
mod r#match;
|
||||||
mod ordered_map;
|
mod ordered_map;
|
||||||
mod trie;
|
mod trie;
|
||||||
|
|
||||||
|
@ -16,4 +18,6 @@ pub use insensitive::*;
|
||||||
#[cfg(feature = "map")]
|
#[cfg(feature = "map")]
|
||||||
pub use map::*;
|
pub use map::*;
|
||||||
pub use ordered_map::*;
|
pub use ordered_map::*;
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
|
pub use r#match::*;
|
||||||
pub use trie::*;
|
pub use trie::*;
|
||||||
|
|
|
@ -1,27 +1,57 @@
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
pub struct MapGen<'g> {
|
pub struct MapGen<'g> {
|
||||||
pub(crate) gen: crate::DictGen<'g>,
|
pub(crate) gen: crate::DictGen<'g>,
|
||||||
|
pub(crate) unicase: bool,
|
||||||
|
pub(crate) unicode: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
impl MapGen<'_> {
|
impl MapGen<'_> {
|
||||||
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
|
pub fn unicase(mut self, yes: bool) -> Self {
|
||||||
|
self.unicase = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn unicode(mut self, yes: bool) -> Self {
|
||||||
|
self.unicode = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write<W: std::io::Write, V: std::fmt::Display>(
|
||||||
&self,
|
&self,
|
||||||
file: &mut W,
|
file: &mut W,
|
||||||
data: impl Iterator<Item = (&'d str, V)>,
|
data: impl Iterator<Item = (impl AsRef<str>, V)>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
let mut data: Vec<_> = data.collect();
|
let mut data: Vec<_> = data.collect();
|
||||||
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
|
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
|
||||||
|
|
||||||
let name = self.gen.name;
|
let name = self.gen.name;
|
||||||
|
let key_type = self.key_type();
|
||||||
let value_type = self.gen.value_type;
|
let value_type = self.gen.value_type;
|
||||||
|
|
||||||
let mut smallest = usize::MAX;
|
let mut smallest = usize::MAX;
|
||||||
let mut largest = usize::MIN;
|
let mut largest = usize::MIN;
|
||||||
|
for (key, _) in data.iter() {
|
||||||
|
let key = key.as_ref();
|
||||||
|
smallest = std::cmp::min(smallest, key.len());
|
||||||
|
largest = std::cmp::max(largest, key.len());
|
||||||
|
}
|
||||||
|
if largest == 0 {
|
||||||
|
smallest = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"pub static {name}: dictgen::Map<{key_type}, {value_type}> = dictgen::Map {{"
|
||||||
|
)?;
|
||||||
|
|
||||||
|
match (self.unicase, self.unicode) {
|
||||||
|
(true, true) => {
|
||||||
let mut builder = phf_codegen::Map::new();
|
let mut builder = phf_codegen::Map::new();
|
||||||
let data = data
|
let data = data
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(key, value)| {
|
.map(|(key, value)| {
|
||||||
|
let key = key.as_ref();
|
||||||
(
|
(
|
||||||
if key.is_ascii() {
|
if key.is_ascii() {
|
||||||
crate::InsensitiveStr::Ascii(key)
|
crate::InsensitiveStr::Ascii(key)
|
||||||
|
@ -33,33 +63,58 @@ impl MapGen<'_> {
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
for (key, value) in data.iter() {
|
for (key, value) in data.iter() {
|
||||||
smallest = std::cmp::min(smallest, key.len());
|
|
||||||
largest = std::cmp::max(largest, key.len());
|
|
||||||
builder.entry(key, value.as_str());
|
builder.entry(key, value.as_str());
|
||||||
}
|
}
|
||||||
let builder = builder.build();
|
let builder = builder.build();
|
||||||
if largest == 0 {
|
writeln!(file, " map: {builder},")?;
|
||||||
smallest = 0;
|
}
|
||||||
|
(true, false) => {
|
||||||
|
let mut builder = phf_codegen::Map::new();
|
||||||
|
let data = data
|
||||||
|
.iter()
|
||||||
|
.map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string()))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
for (key, value) in data.iter() {
|
||||||
|
builder.entry(key, value.as_str());
|
||||||
|
}
|
||||||
|
let builder = builder.build();
|
||||||
|
writeln!(file, " map: {builder},")?;
|
||||||
|
}
|
||||||
|
(false, _) => {
|
||||||
|
let mut builder = phf_codegen::Map::new();
|
||||||
|
let data = data
|
||||||
|
.iter()
|
||||||
|
.map(|(key, value)| (key, value.to_string()))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
for (key, value) in data.iter() {
|
||||||
|
builder.entry(key.as_ref(), value.as_str());
|
||||||
|
}
|
||||||
|
let builder = builder.build();
|
||||||
|
writeln!(file, " map: {builder},")?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
writeln!(
|
|
||||||
file,
|
|
||||||
"pub static {name}: dictgen::Map<{value_type}> = dictgen::Map {{"
|
|
||||||
)?;
|
|
||||||
writeln!(file, " map: {builder},")?;
|
|
||||||
writeln!(file, " range: {smallest}..={largest},")?;
|
writeln!(file, " range: {smallest}..={largest},")?;
|
||||||
writeln!(file, "}};")?;
|
writeln!(file, "}};")?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn key_type(&self) -> &'static str {
|
||||||
|
match (self.unicase, self.unicode) {
|
||||||
|
(true, true) => "dictgen::InsensitiveStr<'static>",
|
||||||
|
(true, false) => "dictgen::InsensitiveAscii<'static>",
|
||||||
|
(false, _) => "&'static str",
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Map<V: 'static> {
|
pub struct Map<K: 'static, V: 'static> {
|
||||||
pub map: phf::Map<crate::InsensitiveStr<'static>, V>,
|
pub map: phf::Map<K, V>,
|
||||||
pub range: std::ops::RangeInclusive<usize>,
|
pub range: std::ops::RangeInclusive<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<V> Map<V> {
|
impl<V> Map<crate::InsensitiveStr<'_>, V> {
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
|
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> {
|
||||||
if self.range.contains(&word.len()) {
|
if self.range.contains(&word.len()) {
|
||||||
|
@ -69,3 +124,25 @@ impl<V> Map<V> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<V> Map<crate::InsensitiveAscii<'_>, V> {
|
||||||
|
#[inline]
|
||||||
|
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&V> {
|
||||||
|
if self.range.contains(&word.len()) {
|
||||||
|
self.map.get(&(*word).into())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<V> Map<&str, V> {
|
||||||
|
#[inline]
|
||||||
|
pub fn find(&self, word: &'_ &str) -> Option<&V> {
|
||||||
|
if self.range.contains(&word.len()) {
|
||||||
|
self.map.get(word)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
37
crates/dictgen/src/match.rs
Normal file
37
crates/dictgen/src/match.rs
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
|
pub struct MatchGen<'g> {
|
||||||
|
pub(crate) gen: crate::DictGen<'g>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
|
impl MatchGen<'_> {
|
||||||
|
pub fn write<W: std::io::Write, V: std::fmt::Display>(
|
||||||
|
&self,
|
||||||
|
file: &mut W,
|
||||||
|
data: impl Iterator<Item = (impl AsRef<str>, V)>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
let mut data: Vec<_> = data.collect();
|
||||||
|
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
|
||||||
|
|
||||||
|
let name = self.gen.name;
|
||||||
|
let value_type = self.gen.value_type;
|
||||||
|
|
||||||
|
writeln!(file, "pub struct {name};")?;
|
||||||
|
writeln!(file, "impl {name} {{")?;
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
" pub fn find(&self, word: &&str) -> Option<&'static {value_type}> {{"
|
||||||
|
)?;
|
||||||
|
writeln!(file, " match *word {{")?;
|
||||||
|
for (key, value) in data.iter() {
|
||||||
|
let key = key.as_ref();
|
||||||
|
writeln!(file, " {key:?} => Some(&{value}.as_slice()),")?;
|
||||||
|
}
|
||||||
|
writeln!(file, " _ => None,")?;
|
||||||
|
writeln!(file, " }}")?;
|
||||||
|
writeln!(file, " }}")?;
|
||||||
|
writeln!(file, "}}")?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,19 +1,32 @@
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
pub struct OrderedMapGen<'g> {
|
pub struct OrderedMapGen<'g> {
|
||||||
pub(crate) gen: crate::DictGen<'g>,
|
pub(crate) gen: crate::DictGen<'g>,
|
||||||
|
pub(crate) unicase: bool,
|
||||||
|
pub(crate) unicode: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
impl OrderedMapGen<'_> {
|
impl OrderedMapGen<'_> {
|
||||||
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
|
pub fn unicase(mut self, yes: bool) -> Self {
|
||||||
|
self.unicase = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn unicode(mut self, yes: bool) -> Self {
|
||||||
|
self.unicode = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write<W: std::io::Write, V: std::fmt::Display>(
|
||||||
&self,
|
&self,
|
||||||
file: &mut W,
|
file: &mut W,
|
||||||
data: impl Iterator<Item = (&'d str, V)>,
|
data: impl Iterator<Item = (impl AsRef<str>, V)>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
let mut data: Vec<_> = data.collect();
|
let mut data: Vec<_> = data.collect();
|
||||||
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
|
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
|
||||||
|
|
||||||
let name = self.gen.name;
|
let name = self.gen.name;
|
||||||
|
let key_type = self.key_type();
|
||||||
let value_type = self.gen.value_type;
|
let value_type = self.gen.value_type;
|
||||||
|
|
||||||
let mut smallest = usize::MAX;
|
let mut smallest = usize::MAX;
|
||||||
|
@ -21,18 +34,15 @@ impl OrderedMapGen<'_> {
|
||||||
|
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"pub static {name}: dictgen::OrderedMap<{value_type}> = dictgen::OrderedMap {{"
|
"pub static {name}: dictgen::OrderedMap<{key_type}, {value_type}> = dictgen::OrderedMap {{"
|
||||||
)?;
|
)?;
|
||||||
writeln!(file, " keys: &[")?;
|
writeln!(file, " keys: &[")?;
|
||||||
for (key, _value) in data.iter() {
|
for (key, _value) in data.iter() {
|
||||||
|
let key = key.as_ref();
|
||||||
smallest = std::cmp::min(smallest, key.len());
|
smallest = std::cmp::min(smallest, key.len());
|
||||||
largest = std::cmp::max(largest, key.len());
|
largest = std::cmp::max(largest, key.len());
|
||||||
|
|
||||||
let key = if key.is_ascii() {
|
let key = self.key_new(key);
|
||||||
format!("dictgen::InsensitiveStr::Ascii({key:?})")
|
|
||||||
} else {
|
|
||||||
format!("dictgen::InsensitiveStr::Unicode({key:?})")
|
|
||||||
};
|
|
||||||
|
|
||||||
writeln!(file, " {key},")?;
|
writeln!(file, " {key},")?;
|
||||||
}
|
}
|
||||||
|
@ -50,15 +60,37 @@ impl OrderedMapGen<'_> {
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn key_type(&self) -> &'static str {
|
||||||
|
match (self.unicase, self.unicode) {
|
||||||
|
(true, true) => "dictgen::InsensitiveStr<'static>",
|
||||||
|
(true, false) => "dictgen::InsensitiveAscii<'static>",
|
||||||
|
(false, _) => "&'static str",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn key_new(&self, key: &str) -> String {
|
||||||
|
match (self.unicase, self.unicode) {
|
||||||
|
(true, true) => {
|
||||||
|
if key.is_ascii() {
|
||||||
|
format!("dictgen::InsensitiveStr::Ascii({key:?})")
|
||||||
|
} else {
|
||||||
|
format!("dictgen::InsensitiveStr::Unicode({key:?})")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(true, false) => format!("dictgen::InsensitiveAscii({key:?})"),
|
||||||
|
(false, _) => format!("{key:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct OrderedMap<V: 'static> {
|
pub struct OrderedMap<K: 'static, V: 'static> {
|
||||||
pub keys: &'static [crate::InsensitiveStr<'static>],
|
pub keys: &'static [K],
|
||||||
pub values: &'static [V],
|
pub values: &'static [V],
|
||||||
pub range: core::ops::RangeInclusive<usize>,
|
pub range: core::ops::RangeInclusive<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<V> OrderedMap<V> {
|
impl<V> OrderedMap<crate::InsensitiveStr<'_>, V> {
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
|
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
|
||||||
if self.range.contains(&word.len()) {
|
if self.range.contains(&word.len()) {
|
||||||
|
@ -71,3 +103,28 @@ impl<V> OrderedMap<V> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<V> OrderedMap<crate::InsensitiveAscii<'_>, V> {
|
||||||
|
#[inline]
|
||||||
|
pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&'static V> {
|
||||||
|
if self.range.contains(&word.len()) {
|
||||||
|
self.keys
|
||||||
|
.binary_search_by_key(word, |key| key.convert())
|
||||||
|
.map(|i| &self.values[i])
|
||||||
|
.ok()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<V> OrderedMap<&str, V> {
|
||||||
|
#[inline]
|
||||||
|
pub fn find(&self, word: &'_ &str) -> Option<&'static V> {
|
||||||
|
if self.range.contains(&word.len()) {
|
||||||
|
self.keys.binary_search(word).map(|i| &self.values[i]).ok()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -27,7 +27,7 @@ impl TrieGen<'_> {
|
||||||
|
|
||||||
pub struct Trie<V: 'static> {
|
pub struct Trie<V: 'static> {
|
||||||
pub root: &'static TrieNode<V>,
|
pub root: &'static TrieNode<V>,
|
||||||
pub unicode: &'static crate::OrderedMap<V>,
|
pub unicode: &'static crate::OrderedMap<crate::InsensitiveStr<'static>, V>,
|
||||||
pub range: core::ops::RangeInclusive<usize>,
|
pub range: core::ops::RangeInclusive<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,7 +75,7 @@ impl<V> Trie<V> {
|
||||||
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
||||||
// safe.
|
// safe.
|
||||||
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
|
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
|
||||||
let remaining = unicase::UniCase::ascii(remaining);
|
let remaining = unicase::Ascii::new(remaining);
|
||||||
return t.find(&remaining);
|
return t.find(&remaining);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -91,7 +91,7 @@ pub struct TrieNode<V: 'static> {
|
||||||
|
|
||||||
pub enum TrieChild<V: 'static> {
|
pub enum TrieChild<V: 'static> {
|
||||||
Nested(&'static [Option<&'static TrieNode<V>>; 26]),
|
Nested(&'static [Option<&'static TrieNode<V>>; 26]),
|
||||||
Flat(&'static crate::OrderedMap<V>),
|
Flat(&'static crate::OrderedMap<crate::InsensitiveAscii<'static>, V>),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
|
@ -179,6 +179,7 @@ mod codegen {
|
||||||
.name(&children_name)
|
.name(&children_name)
|
||||||
.value_type(value_type)
|
.value_type(value_type)
|
||||||
.ordered_map()
|
.ordered_map()
|
||||||
|
.unicode(false)
|
||||||
.write(file, table_input)?;
|
.write(file, table_input)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
// This file is @generated by crates/misspell-dict/tests/codegen.rs
|
// This file is @generated by crates/misspell-dict/tests/codegen.rs
|
||||||
|
|
||||||
pub static MAIN_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
pub static MAIN_DICTIONARY: dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, &[&str]> =
|
||||||
|
dictgen::OrderedMap {
|
||||||
keys: &[
|
keys: &[
|
||||||
dictgen::InsensitiveStr::Ascii("abandenment"),
|
dictgen::InsensitiveStr::Ascii("abandenment"),
|
||||||
dictgen::InsensitiveStr::Ascii("abandining"),
|
dictgen::InsensitiveStr::Ascii("abandining"),
|
||||||
|
@ -56100,8 +56101,9 @@ pub static MAIN_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
||||||
&["zionists"],
|
&["zionists"],
|
||||||
],
|
],
|
||||||
range: 3..=19,
|
range: 3..=19,
|
||||||
};
|
};
|
||||||
pub static AMERICAN_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
pub static AMERICAN_DICTIONARY: dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, &[&str]> =
|
||||||
|
dictgen::OrderedMap {
|
||||||
keys: &[
|
keys: &[
|
||||||
dictgen::InsensitiveStr::Ascii("accessorise"),
|
dictgen::InsensitiveStr::Ascii("accessorise"),
|
||||||
dictgen::InsensitiveStr::Ascii("accessorised"),
|
dictgen::InsensitiveStr::Ascii("accessorised"),
|
||||||
|
@ -59345,8 +59347,9 @@ pub static AMERICAN_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedM
|
||||||
&["yogurts"],
|
&["yogurts"],
|
||||||
],
|
],
|
||||||
range: 4..=20,
|
range: 4..=20,
|
||||||
};
|
};
|
||||||
pub static BRITISH_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
pub static BRITISH_DICTIONARY: dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, &[&str]> =
|
||||||
|
dictgen::OrderedMap {
|
||||||
keys: &[
|
keys: &[
|
||||||
dictgen::InsensitiveStr::Ascii("accessorize"),
|
dictgen::InsensitiveStr::Ascii("accessorize"),
|
||||||
dictgen::InsensitiveStr::Ascii("accessorized"),
|
dictgen::InsensitiveStr::Ascii("accessorized"),
|
||||||
|
@ -62306,4 +62309,4 @@ pub static BRITISH_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMa
|
||||||
&["yodelling"],
|
&["yodelling"],
|
||||||
],
|
],
|
||||||
range: 4..=20,
|
range: 4..=20,
|
||||||
};
|
};
|
||||||
|
|
|
@ -16,7 +16,8 @@ all-features = true
|
||||||
rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
|
rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
dictgen = { version = "^0.2", path = "../dictgen" }
|
phf = "0.11.2"
|
||||||
|
dictgen = { version = "^0.2", path = "../dictgen", features = ["map"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
csv = "1.3"
|
csv = "1.3"
|
||||||
|
@ -29,7 +30,7 @@ varcon = { version = "^1.0", path = "../varcon" }
|
||||||
snapbox = "0.6.5"
|
snapbox = "0.6.5"
|
||||||
indexmap = "2.2.6"
|
indexmap = "2.2.6"
|
||||||
divan = "0.1.16"
|
divan = "0.1.16"
|
||||||
phf = "0.11.2"
|
heck = "0.5.0"
|
||||||
|
|
||||||
[lints]
|
[lints]
|
||||||
workspace = true
|
workspace = true
|
||||||
|
|
248866
crates/typos-dict/benches/benches/cased_map_codegen.rs
Normal file
248866
crates/typos-dict/benches/benches/cased_map_codegen.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,7 @@
|
||||||
#![allow(clippy::wildcard_imports)]
|
#![allow(clippy::wildcard_imports)]
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
mod cased_map_codegen;
|
||||||
mod map_codegen;
|
mod map_codegen;
|
||||||
mod ordered_map_codegen;
|
mod ordered_map_codegen;
|
||||||
mod trie_codegen;
|
mod trie_codegen;
|
||||||
|
@ -9,6 +11,11 @@ mod miss {
|
||||||
|
|
||||||
const MISS: &str = "finalizes";
|
const MISS: &str = "finalizes";
|
||||||
|
|
||||||
|
#[divan::bench(args = [MISS])]
|
||||||
|
fn cased_map(word: &str) -> Option<&'static &[&str]> {
|
||||||
|
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
|
||||||
|
}
|
||||||
|
|
||||||
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
|
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
|
||||||
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
map_codegen::WORD.find(&word)
|
map_codegen::WORD.find(&word)
|
||||||
|
@ -30,6 +37,11 @@ mod hit {
|
||||||
|
|
||||||
const HIT: &str = "finallizes";
|
const HIT: &str = "finallizes";
|
||||||
|
|
||||||
|
#[divan::bench(args = [HIT])]
|
||||||
|
fn cased_map(word: &str) -> Option<&'static &[&str]> {
|
||||||
|
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
|
||||||
|
}
|
||||||
|
|
||||||
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
|
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
|
||||||
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
map_codegen::WORD.find(&word)
|
map_codegen::WORD.find(&word)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
#![allow(clippy::unreadable_literal)]
|
#![allow(clippy::unreadable_literal)]
|
||||||
#![allow(unreachable_pub)]
|
#![allow(unreachable_pub)]
|
||||||
|
|
||||||
pub static WORD: dictgen::Map<&[&str]> = dictgen::Map {
|
pub static WORD: dictgen::Map<dictgen::InsensitiveStr<'static>, &[&str]> = dictgen::Map {
|
||||||
map: ::phf::Map {
|
map: ::phf::Map {
|
||||||
key: 12913932095322966823,
|
key: 12913932095322966823,
|
||||||
disps: &[
|
disps: &[
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
#![allow(clippy::unreadable_literal)]
|
#![allow(clippy::unreadable_literal)]
|
||||||
#![allow(unreachable_pub)]
|
#![allow(unreachable_pub)]
|
||||||
|
|
||||||
pub static WORD: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
pub static WORD: dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, &[&str]> =
|
||||||
|
dictgen::OrderedMap {
|
||||||
keys: &[
|
keys: &[
|
||||||
dictgen::InsensitiveStr::Ascii("aaccess"),
|
dictgen::InsensitiveStr::Ascii("aaccess"),
|
||||||
dictgen::InsensitiveStr::Ascii("aaccessibility"),
|
dictgen::InsensitiveStr::Ascii("aaccessibility"),
|
||||||
|
@ -138034,4 +138035,4 @@ pub static WORD: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
||||||
&["contains"],
|
&["contains"],
|
||||||
],
|
],
|
||||||
range: 2..=34,
|
range: 2..=34,
|
||||||
};
|
};
|
||||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -20,6 +20,15 @@ fn codegen() {
|
||||||
snapbox::file!["../benches/benches/map_codegen.rs"].raw()
|
snapbox::file!["../benches/benches/map_codegen.rs"].raw()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut cased_map_content = vec![];
|
||||||
|
generate_cased_map(&mut cased_map_content, "WORD", DICT);
|
||||||
|
let cased_map_content = String::from_utf8(cased_map_content).unwrap();
|
||||||
|
let cased_map_content = codegenrs::rustfmt(&cased_map_content, None).unwrap();
|
||||||
|
snapbox::assert_data_eq!(
|
||||||
|
&cased_map_content,
|
||||||
|
snapbox::file!["../benches/benches/cased_map_codegen.rs"].raw()
|
||||||
|
);
|
||||||
|
|
||||||
let mut ordered_map_content = vec![];
|
let mut ordered_map_content = vec![];
|
||||||
generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
|
generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
|
||||||
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
|
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
|
||||||
|
@ -29,10 +38,7 @@ fn codegen() {
|
||||||
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
|
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
|
||||||
);
|
);
|
||||||
|
|
||||||
snapbox::assert_data_eq!(
|
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
|
||||||
&trie_content,
|
|
||||||
snapbox::file!["../src/word_codegen.rs"].raw()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
|
@ -72,6 +78,111 @@ fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn generate_cased_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"// This file is @generated by {}",
|
||||||
|
file!().replace('\\', "/")
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
|
||||||
|
writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
|
||||||
|
writeln!(file).unwrap();
|
||||||
|
|
||||||
|
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||||
|
.has_headers(false)
|
||||||
|
.flexible(true)
|
||||||
|
.from_reader(dict)
|
||||||
|
.records()
|
||||||
|
.map(|r| r.unwrap())
|
||||||
|
.collect();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_ASCII_LOWER"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.map()
|
||||||
|
.unicase(false)
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_ASCII_UPPER"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.map()
|
||||||
|
.unicase(false)
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
use heck::ToShoutySnakeCase;
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap().to_shouty_snake_case();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_ASCII_TITLE"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.map()
|
||||||
|
.unicase(false)
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
use heck::ToTitleCase;
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap().to_title_case();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_UNICODE"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.ordered_map()
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| !r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,7 @@
|
||||||
// This file is @generated by crates/wikipedia-dict/tests/codegen.rs
|
// This file is @generated by crates/wikipedia-dict/tests/codegen.rs
|
||||||
|
|
||||||
pub static WORD_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
pub static WORD_DICTIONARY: dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, &[&str]> =
|
||||||
|
dictgen::OrderedMap {
|
||||||
keys: &[
|
keys: &[
|
||||||
dictgen::InsensitiveStr::Ascii("abandonned"),
|
dictgen::InsensitiveStr::Ascii("abandonned"),
|
||||||
dictgen::InsensitiveStr::Ascii("abbout"),
|
dictgen::InsensitiveStr::Ascii("abbout"),
|
||||||
|
@ -8588,4 +8589,4 @@ pub static WORD_DICTIONARY: dictgen::OrderedMap<&[&str]> = dictgen::OrderedMap {
|
||||||
&["zebra"],
|
&["zebra"],
|
||||||
],
|
],
|
||||||
range: 3..=19,
|
range: 3..=19,
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue