mirror of
https://github.com/crate-ci/typos.git
synced 2025-01-11 09:11:39 -05:00
feat(dictgen): Add aho-corasick support
This commit is contained in:
parent
44cf2f8cf6
commit
7984d47095
9 changed files with 138282 additions and 3 deletions
5
Cargo.lock
generated
5
Cargo.lock
generated
|
@ -42,9 +42,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.2"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
@ -449,6 +449,7 @@ dependencies = [
|
|||
name = "dictgen"
|
||||
version = "0.2.11"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"phf_shared",
|
||||
|
|
|
@ -19,12 +19,14 @@ default = ["std"]
|
|||
std = []
|
||||
codegen = ["std", "dep:phf_codegen"]
|
||||
map = ["dep:phf", "dep:phf_shared"]
|
||||
aho-corasick = ["dep:aho-corasick"]
|
||||
|
||||
[dependencies]
|
||||
unicase = "2.7"
|
||||
phf = { version = "0.11", features = ["unicase"], optional = true }
|
||||
phf_shared = { version = "0.11", optional = true }
|
||||
phf_codegen = { version = "0.11", optional = true }
|
||||
aho-corasick = { version = "1.1.3", optional = true }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
|
112
crates/dictgen/src/aho_corasick.rs
Normal file
112
crates/dictgen/src/aho_corasick.rs
Normal file
|
@ -0,0 +1,112 @@
|
|||
pub use ::aho_corasick::automaton::Automaton;
|
||||
pub use ::aho_corasick::dfa::Builder;
|
||||
pub use ::aho_corasick::dfa::DFA;
|
||||
pub use ::aho_corasick::Anchored;
|
||||
pub use ::aho_corasick::Input;
|
||||
pub use ::aho_corasick::MatchKind;
|
||||
pub use ::aho_corasick::StartKind;
|
||||
|
||||
/// Code generator that emits a lookup type backed by an Aho-Corasick DFA.
///
/// Obtained from `DictGen::aho_corasick`; the wrapped `DictGen` supplies the
/// name of the generated type and the Rust type of its values.
#[cfg(feature = "codegen")]
pub struct AhoCorasickGen<'g> {
    pub(crate) gen: crate::DictGen<'g>,
}

#[cfg(feature = "codegen")]
impl AhoCorasickGen<'_> {
    /// Writes the Rust source of the lookup type to `file`.
    ///
    /// `data` yields `(key, value)` pairs. Each `value` is written verbatim
    /// (through its `Display` impl) as a Rust expression of the configured
    /// value type. Entries are sorted case-insensitively by key before
    /// anything is emitted.
    ///
    /// The generated code consists of:
    /// - a struct holding a DFA plus an ordered map for non-ASCII keys,
    /// - `new()`, which builds an anchored, leftmost-longest,
    ///   ASCII-case-insensitive DFA over the ASCII keys,
    /// - `find()`, which uses the DFA for ASCII words and the ordered map
    ///   for everything else.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported while writing to `file`.
    pub fn write<W: std::io::Write, V: std::fmt::Display>(
        &self,
        file: &mut W,
        data: impl Iterator<Item = (impl AsRef<str>, V)>,
    ) -> Result<(), std::io::Error> {
        let mut data: Vec<_> = data.collect();
        // Compare borrowed keys so sorting does not allocate a `String` for
        // every comparison.
        data.sort_unstable_by(|a, b| {
            unicase::UniCase::new(a.0.as_ref()).cmp(&unicase::UniCase::new(b.0.as_ref()))
        });
        // Split once (order is preserved) so that `NEEDLES` and
        // `PATTERNID_MAP` are emitted from the very same list: the pattern id
        // reported by the DFA is the index of the needle, which must equal
        // the index of its value.
        let (ascii, unicode): (Vec<_>, Vec<_>) =
            data.into_iter().partition(|(k, _)| k.as_ref().is_ascii());

        let name = self.gen.name;
        let value_type = self.gen.value_type;

        writeln!(file, "pub struct {name} {{")?;
        writeln!(file, " dfa: dictgen::aho_corasick::DFA,")?;
        writeln!(file, " unicode: &'static dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, {value_type}>,")?;
        writeln!(file, "}}")?;
        writeln!(file)?;
        writeln!(file, "impl {name} {{")?;
        writeln!(file, " pub fn new() -> Self {{")?;
        writeln!(
            file,
            " static NEEDLES: &'static [&'static [u8]] = &["
        )?;
        for (key, _value) in &ascii {
            let key = key.as_ref();
            writeln!(file, " b{key:?},")?;
        }
        writeln!(file, " ];")?;
        writeln!(
            file,
            " let dfa = dictgen::aho_corasick::Builder::new()"
        )?;
        writeln!(
            file,
            " .match_kind(dictgen::aho_corasick::MatchKind::LeftmostLongest)"
        )?;
        writeln!(
            file,
            " .start_kind(dictgen::aho_corasick::StartKind::Anchored)"
        )?;
        writeln!(file, " .ascii_case_insensitive(true)")?;
        writeln!(file, " .build(NEEDLES)")?;
        writeln!(file, " .unwrap();")?;
        // Non-ASCII keys cannot go through the ASCII-case-insensitive DFA, so
        // they are emitted as a separate ordered map.
        crate::DictGen::new()
            .name("UNICODE_TABLE")
            .value_type(value_type)
            .ordered_map()
            .write(file, unicode.iter().map(|(k, v)| (k.as_ref(), v)))?;
        writeln!(file)?;
        writeln!(file, " Self {{")?;
        writeln!(file, " dfa,")?;
        writeln!(file, " unicode: &UNICODE_TABLE,")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file)?;
        writeln!(
            file,
            " pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static {value_type}> {{"
        )?;
        writeln!(
            file,
            " static PATTERNID_MAP: &'static [{value_type}] = &["
        )?;
        for (_key, value) in &ascii {
            writeln!(file, " {value},")?;
        }
        writeln!(file, " ];")?;
        writeln!(file, " if word.is_ascii() {{")?;
        writeln!(
            file,
            " use dictgen::aho_corasick::Automaton as _;"
        )?;
        writeln!(file, " let input = dictgen::aho_corasick::Input::new(word.into_inner().as_bytes()).anchored(dictgen::aho_corasick::Anchored::Yes);")?;
        writeln!(
            file,
            " let mat = self.dfa.try_find(&input).unwrap()?;"
        )?;
        // The anchored leftmost-longest match is the longest needle that is a
        // prefix of `word`. It is a dictionary hit only when it consumes the
        // whole word, so anything shorter must be rejected. (The previous
        // `==` rejected exact hits and accepted prefix-only matches.)
        // NOTE: snapshots of the generated code need regenerating after this
        // change.
        writeln!(
            file,
            " if mat.end() != word.into_inner().len() {{"
        )?;
        writeln!(file, " return None;")?;
        writeln!(file, " }}")?;
        writeln!(file, " Some(&PATTERNID_MAP[mat.pattern()])")?;
        writeln!(file, " }} else {{")?;
        writeln!(file, " self.unicode.find(word)")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file, "}}")?;

        Ok(())
    }
}
|
|
@ -61,6 +61,11 @@ impl<'g> DictGen<'g> {
|
|||
pub fn r#match(self) -> crate::MatchGen<'g> {
|
||||
crate::MatchGen { gen: self }
|
||||
}
|
||||
|
||||
/// Consumes this generator and wraps it in an `AhoCorasickGen`, which emits
/// an Aho-Corasick based lookup type instead.
#[cfg(feature = "aho-corasick")]
pub fn aho_corasick(self) -> crate::AhoCorasickGen<'g> {
    let gen = self;
    crate::AhoCorasickGen { gen }
}
|
||||
}
|
||||
|
||||
impl Default for DictGen<'static> {
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
#![warn(clippy::print_stderr)]
|
||||
#![warn(clippy::print_stdout)]
|
||||
|
||||
#[cfg(feature = "aho-corasick")]
|
||||
pub mod aho_corasick;
|
||||
#[cfg(feature = "codegen")]
|
||||
mod gen;
|
||||
mod insensitive;
|
||||
|
@ -12,6 +14,9 @@ mod r#match;
|
|||
mod ordered_map;
|
||||
mod trie;
|
||||
|
||||
#[cfg(feature = "aho-corasick")]
|
||||
#[cfg(feature = "codegen")]
|
||||
pub use aho_corasick::AhoCorasickGen;
|
||||
#[cfg(feature = "codegen")]
|
||||
pub use gen::*;
|
||||
pub use insensitive::*;
|
||||
|
|
|
@ -25,7 +25,7 @@ itertools = "0.13"
|
|||
edit-distance = "2.1"
|
||||
unicase = "2.7"
|
||||
codegenrs = "3.0"
|
||||
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map"] }
|
||||
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map", "aho-corasick"] }
|
||||
varcon = { version = "^1.0", path = "../varcon" }
|
||||
snapbox = "0.6.5"
|
||||
indexmap = "2.2.6"
|
||||
|
|
138084
crates/typos-dict/benches/benches/aho_corasick_codegen.rs
Normal file
138084
crates/typos-dict/benches/benches/aho_corasick_codegen.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,11 +1,24 @@
|
|||
#![allow(clippy::wildcard_imports)]
|
||||
#![allow(dead_code)]
|
||||
|
||||
mod aho_corasick_codegen;
|
||||
mod cased_map_codegen;
|
||||
mod map_codegen;
|
||||
mod ordered_map_codegen;
|
||||
mod trie_codegen;
|
||||
|
||||
/// Aho-Corasick lookup table shared by the lookup benchmarks; it is built by
/// `aho_corasick_codegen::Word::new` the first time it is accessed.
static AHO_CORASICK: std::sync::LazyLock<aho_corasick_codegen::Word> =
|
||||
std::sync::LazyLock::new(aho_corasick_codegen::Word::new);
|
||||
|
||||
mod new {
|
||||
use super::*;
|
||||
|
||||
#[divan::bench]
|
||||
fn aho_corasick() -> aho_corasick_codegen::Word {
|
||||
aho_corasick_codegen::Word::new()
|
||||
}
|
||||
}
|
||||
|
||||
mod miss {
|
||||
use super::*;
|
||||
|
||||
|
@ -30,6 +43,11 @@ mod miss {
|
|||
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||
ordered_map_codegen::WORD.find(&word)
|
||||
}
|
||||
|
||||
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
|
||||
fn aho_corasick(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||
AHO_CORASICK.find(&word)
|
||||
}
|
||||
}
|
||||
|
||||
mod hit {
|
||||
|
@ -56,6 +74,11 @@ mod hit {
|
|||
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||
ordered_map_codegen::WORD.find(&word)
|
||||
}
|
||||
|
||||
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
|
||||
fn aho_corasick(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||
AHO_CORASICK.find(&word)
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
|
|
@ -38,6 +38,15 @@ fn codegen() {
|
|||
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
|
||||
);
|
||||
|
||||
let mut aho_corasick_content = vec![];
|
||||
generate_aho_corasick(&mut aho_corasick_content, "Word", DICT);
|
||||
let aho_corasick_content = String::from_utf8(aho_corasick_content).unwrap();
|
||||
let aho_corasick_content = codegenrs::rustfmt(&aho_corasick_content, None).unwrap();
|
||||
snapbox::assert_data_eq!(
|
||||
&aho_corasick_content,
|
||||
snapbox::file!["../benches/benches/aho_corasick_codegen.rs"].raw()
|
||||
);
|
||||
|
||||
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
|
||||
}
|
||||
|
||||
|
@ -256,3 +265,41 @@ fn generate_ordered_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]
|
|||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn generate_aho_corasick<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||
writeln!(
|
||||
file,
|
||||
"// This file is @generated by {}",
|
||||
file!().replace('\\', "/")
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
|
||||
writeln!(file, "#![allow(clippy::redundant_static_lifetimes)]",).unwrap();
|
||||
writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
|
||||
writeln!(file).unwrap();
|
||||
|
||||
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||
.has_headers(false)
|
||||
.flexible(true)
|
||||
.from_reader(dict)
|
||||
.records()
|
||||
.map(|r| r.unwrap())
|
||||
.collect();
|
||||
dictgen::DictGen::new()
|
||||
.name(name)
|
||||
.value_type("&'static [&'static str]")
|
||||
.aho_corasick()
|
||||
.write(
|
||||
file,
|
||||
records.iter().map(|record| {
|
||||
let mut record_fields = record.iter();
|
||||
let key = record_fields.next().unwrap();
|
||||
let value = format!(
|
||||
"&[{}]",
|
||||
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||
);
|
||||
(key, value)
|
||||
}),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue