Merge pull request #1199 from epage/aho

feat(dictgen): Add aho-corasick support
This commit is contained in:
Ed Page 2024-12-31 08:23:17 -06:00 committed by GitHub
commit 1af44522a7
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194
9 changed files with 138282 additions and 3 deletions

5
Cargo.lock generated
View file

@ -42,9 +42,9 @@ dependencies = [
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "1.1.2" version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]
@ -449,6 +449,7 @@ dependencies = [
name = "dictgen" name = "dictgen"
version = "0.2.11" version = "0.2.11"
dependencies = [ dependencies = [
"aho-corasick",
"phf", "phf",
"phf_codegen", "phf_codegen",
"phf_shared", "phf_shared",

View file

@ -19,12 +19,14 @@ default = ["std"]
std = [] std = []
codegen = ["std", "dep:phf_codegen"] codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"] map = ["dep:phf", "dep:phf_shared"]
aho-corasick = ["dep:aho-corasick"]
[dependencies] [dependencies]
unicase = "2.7" unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true } phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true } phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true } phf_codegen = { version = "0.11", optional = true }
aho-corasick = { version = "1.1.3", optional = true }
[lints] [lints]
workspace = true workspace = true

View file

@ -0,0 +1,112 @@
pub use ::aho_corasick::automaton::Automaton;
pub use ::aho_corasick::dfa::Builder;
pub use ::aho_corasick::dfa::DFA;
pub use ::aho_corasick::Anchored;
pub use ::aho_corasick::Input;
pub use ::aho_corasick::MatchKind;
pub use ::aho_corasick::StartKind;
/// Code generator that emits a lookup type backed by an aho-corasick DFA.
///
/// Only compiled when the `codegen` feature is enabled. The wrapped
/// [`crate::DictGen`] supplies the `name` and `value_type` that the `write`
/// method interpolates into the generated source.
#[cfg(feature = "codegen")]
pub struct AhoCorasickGen<'g> {
/// Generator settings this value was created from.
pub(crate) gen: crate::DictGen<'g>,
}
#[cfg(feature = "codegen")]
impl AhoCorasickGen<'_> {
    /// Write Rust source for a case-insensitive dictionary type to `file`.
    ///
    /// The generated code declares `pub struct {name}` with two methods:
    ///
    /// - `new()` builds an anchored, leftmost-longest, ASCII-case-insensitive
    ///   DFA from every ASCII key, and stores every non-ASCII key in a nested
    ///   `UNICODE_TABLE` ordered map.
    /// - `find(&self, word)` answers ASCII words from the DFA and all other
    ///   words from the ordered map.
    ///
    /// `data` yields `(key, value)` pairs; each `value` is written verbatim
    /// (via `Display`) as a Rust expression of the configured `value_type`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error produced while writing to `file`.
    pub fn write<W: std::io::Write, V: std::fmt::Display>(
        &self,
        file: &mut W,
        data: impl Iterator<Item = (impl AsRef<str>, V)>,
    ) -> Result<(), std::io::Error> {
        let mut data: Vec<_> = data.collect();
        // Sort once, case-insensitively. `NEEDLES` and `PATTERNID_MAP` are both
        // emitted by walking `data` in this order with the same ASCII filter,
        // so the n-th needle lines up with the n-th value.
        data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));

        let name = self.gen.name;
        let value_type = self.gen.value_type;

        // Type definition.
        writeln!(file, "pub struct {name} {{")?;
        writeln!(file, " dfa: dictgen::aho_corasick::DFA,")?;
        writeln!(file, " unicode: &'static dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, {value_type}>,")?;
        writeln!(file, "}}")?;
        writeln!(file)?;

        // Constructor: ASCII keys become DFA needles.
        writeln!(file, "impl {name} {{")?;
        writeln!(file, " pub fn new() -> Self {{")?;
        writeln!(
            file,
            " static NEEDLES: &'static [&'static [u8]] = &["
        )?;
        for (key, _value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
            let key = key.as_ref();
            // `{key:?}` yields an escaped, quoted literal; the `b` prefix makes
            // it a byte string in the generated code.
            writeln!(file, " b{key:?},")?;
        }
        writeln!(file, " ];")?;
        writeln!(
            file,
            " let dfa = dictgen::aho_corasick::Builder::new()"
        )?;
        writeln!(
            file,
            " .match_kind(dictgen::aho_corasick::MatchKind::LeftmostLongest)"
        )?;
        writeln!(
            file,
            " .start_kind(dictgen::aho_corasick::StartKind::Anchored)"
        )?;
        writeln!(file, " .ascii_case_insensitive(true)")?;
        writeln!(file, " .build(NEEDLES)")?;
        writeln!(file, " .unwrap();")?;

        // Non-ASCII keys cannot use ASCII case folding, so they are emitted as
        // an ordered map nested inside the generated `new()`.
        crate::DictGen::new()
            .name("UNICODE_TABLE")
            .value_type(value_type)
            .ordered_map()
            .write(
                file,
                data.iter()
                    .filter(|(k, _)| !k.as_ref().is_ascii())
                    .map(|(k, v)| (k.as_ref(), v)),
            )?;
        writeln!(file)?;
        writeln!(file, " Self {{")?;
        writeln!(file, " dfa,")?;
        writeln!(file, " unicode: &UNICODE_TABLE,")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file)?;

        // Lookup: values are indexed by the pattern id of the matched needle.
        writeln!(
            file,
            " pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static {value_type}> {{"
        )?;
        writeln!(
            file,
            " static PATTERNID_MAP: &'static [{value_type}] = &["
        )?;
        for (_key, value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
            writeln!(file, " {value},")?;
        }
        writeln!(file, " ];")?;
        writeln!(file, " if word.is_ascii() {{")?;
        writeln!(
            file,
            " use dictgen::aho_corasick::Automaton as _;"
        )?;
        writeln!(file, " let input = dictgen::aho_corasick::Input::new(word.into_inner().as_bytes()).anchored(dictgen::aho_corasick::Anchored::Yes);")?;
        writeln!(
            file,
            " let mat = self.dfa.try_find(&input).unwrap()?;"
        )?;
        // An anchored match only proves that some needle is a prefix of `word`.
        // It is a dictionary hit only when the match spans the entire word, so
        // the generated code must bail out when the lengths differ.
        writeln!(
            file,
            " if mat.end() != word.into_inner().len() {{"
        )?;
        writeln!(file, " return None;")?;
        writeln!(file, " }}")?;
        writeln!(file, " Some(&PATTERNID_MAP[mat.pattern()])")?;
        writeln!(file, " }} else {{")?;
        writeln!(file, " self.unicode.find(word)")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file, "}}")?;
        Ok(())
    }
}

View file

@ -61,6 +61,11 @@ impl<'g> DictGen<'g> {
pub fn r#match(self) -> crate::MatchGen<'g> { pub fn r#match(self) -> crate::MatchGen<'g> {
crate::MatchGen { gen: self } crate::MatchGen { gen: self }
} }
/// Consume this generator and wrap it in a [`crate::AhoCorasickGen`].
///
/// Only available when the `aho-corasick` feature is enabled.
#[cfg(feature = "aho-corasick")]
pub fn aho_corasick(self) -> crate::AhoCorasickGen<'g> {
crate::AhoCorasickGen { gen: self }
}
} }
impl Default for DictGen<'static> { impl Default for DictGen<'static> {

View file

@ -2,6 +2,8 @@
#![warn(clippy::print_stderr)] #![warn(clippy::print_stderr)]
#![warn(clippy::print_stdout)] #![warn(clippy::print_stdout)]
#[cfg(feature = "aho-corasick")]
pub mod aho_corasick;
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
mod gen; mod gen;
mod insensitive; mod insensitive;
@ -12,6 +14,9 @@ mod r#match;
mod ordered_map; mod ordered_map;
mod trie; mod trie;
#[cfg(feature = "aho-corasick")]
#[cfg(feature = "codegen")]
pub use aho_corasick::AhoCorasickGen;
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub use gen::*; pub use gen::*;
pub use insensitive::*; pub use insensitive::*;

View file

@ -25,7 +25,7 @@ itertools = "0.13"
edit-distance = "2.1" edit-distance = "2.1"
unicase = "2.7" unicase = "2.7"
codegenrs = "3.0" codegenrs = "3.0"
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map"] } dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map", "aho-corasick"] }
varcon = { version = "^1.0", path = "../varcon" } varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5" snapbox = "0.6.5"
indexmap = "2.2.6" indexmap = "2.2.6"

File diff suppressed because it is too large Load diff

View file

@ -1,11 +1,24 @@
#![allow(clippy::wildcard_imports)] #![allow(clippy::wildcard_imports)]
#![allow(dead_code)] #![allow(dead_code)]
mod aho_corasick_codegen;
mod cased_map_codegen; mod cased_map_codegen;
mod map_codegen; mod map_codegen;
mod ordered_map_codegen; mod ordered_map_codegen;
mod trie_codegen; mod trie_codegen;
// Shared aho-corasick dictionary used by the lookup benchmarks below.
// `LazyLock` runs `Word::new` on first access, so the dictionary is built once
// instead of inside every benchmark call.
static AHO_CORASICK: std::sync::LazyLock<aho_corasick_codegen::Word> =
std::sync::LazyLock::new(aho_corasick_codegen::Word::new);
// Benchmark for constructing the generated aho-corasick dictionary.
mod new {
use super::*;
// Calls `Word::new()` directly (not the shared `AHO_CORASICK` static) and
// returns the constructed value.
#[divan::bench]
fn aho_corasick() -> aho_corasick_codegen::Word {
aho_corasick_codegen::Word::new()
}
}
mod miss { mod miss {
use super::*; use super::*;
@ -30,6 +43,11 @@ mod miss {
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> { fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
ordered_map_codegen::WORD.find(&word) ordered_map_codegen::WORD.find(&word)
} }
// Looks up `MISS` in the shared aho-corasick dictionary.
// NOTE(review): `MISS` is defined outside this view; presumably a word that is
// absent from the dictionary — confirm.
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
fn aho_corasick(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
AHO_CORASICK.find(&word)
}
} }
mod hit { mod hit {
@ -56,6 +74,11 @@ mod hit {
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> { fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
ordered_map_codegen::WORD.find(&word) ordered_map_codegen::WORD.find(&word)
} }
// Looks up `HIT` in the shared aho-corasick dictionary.
// NOTE(review): `HIT` is defined outside this view; presumably a word that is
// present in the dictionary — confirm.
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
fn aho_corasick(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
AHO_CORASICK.find(&word)
}
} }
fn main() { fn main() {

View file

@ -38,6 +38,15 @@ fn codegen() {
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw() snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
); );
let mut aho_corasick_content = vec![];
generate_aho_corasick(&mut aho_corasick_content, "Word", DICT);
let aho_corasick_content = String::from_utf8(aho_corasick_content).unwrap();
let aho_corasick_content = codegenrs::rustfmt(&aho_corasick_content, None).unwrap();
snapbox::assert_data_eq!(
&aho_corasick_content,
snapbox::file!["../benches/benches/aho_corasick_codegen.rs"].raw()
);
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw()); snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
} }
@ -256,3 +265,41 @@ fn generate_ordered_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]
) )
.unwrap(); .unwrap();
} }
/// Render the aho-corasick flavoured dictionary source for `dict` into `file`.
///
/// `dict` is headerless CSV with a variable number of columns: the first
/// column of each record is the lookup key and the remaining columns become
/// the string elements of that key's `&[...]` value. `name` is the identifier
/// of the generated type.
///
/// Panics if the CSV cannot be parsed, a record has no first field, or any
/// write to `file` fails.
fn generate_aho_corasick<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
    // File header: provenance line, lint allowances, then a blank separator.
    let generator = file!().replace('\\', "/");
    writeln!(file, "// This file is @generated by {}", generator).unwrap();
    for header in [
        "#![allow(clippy::unreadable_literal)]",
        "#![allow(clippy::redundant_static_lifetimes)]",
        "#![allow(unreachable_pub)]",
        "",
    ] {
        writeln!(file, "{header}").unwrap();
    }

    // Parse every record up front so the keys can be borrowed from `rows`.
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(dict);
    let mut rows = Vec::new();
    for row in reader.records() {
        rows.push(row.unwrap());
    }

    // First column is the key; the rest are quoted and joined into a slice literal.
    let entries = rows.iter().map(|row| {
        let mut columns = row.iter();
        let key = columns.next().unwrap();
        let quoted: Vec<String> = columns.map(|field| format!(r#""{field}""#)).collect();
        (key, format!("&[{}]", quoted.join(", ")))
    });

    dictgen::DictGen::new()
        .name(name)
        .value_type("&'static [&'static str]")
        .aho_corasick()
        .write(file, entries)
        .unwrap();
}