mirror of
https://github.com/crate-ci/typos.git
synced 2025-01-11 09:11:39 -05:00
feat(dictgen): Add aho-corasick support
This commit is contained in:
parent
44cf2f8cf6
commit
7984d47095
9 changed files with 138282 additions and 3 deletions
5
Cargo.lock
generated
5
Cargo.lock
generated
|
@ -42,9 +42,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "1.1.2"
|
version = "1.1.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
@ -449,6 +449,7 @@ dependencies = [
|
||||||
name = "dictgen"
|
name = "dictgen"
|
||||||
version = "0.2.11"
|
version = "0.2.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
"phf",
|
"phf",
|
||||||
"phf_codegen",
|
"phf_codegen",
|
||||||
"phf_shared",
|
"phf_shared",
|
||||||
|
|
|
@ -19,12 +19,14 @@ default = ["std"]
|
||||||
std = []
|
std = []
|
||||||
codegen = ["std", "dep:phf_codegen"]
|
codegen = ["std", "dep:phf_codegen"]
|
||||||
map = ["dep:phf", "dep:phf_shared"]
|
map = ["dep:phf", "dep:phf_shared"]
|
||||||
|
aho-corasick = ["dep:aho-corasick"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
unicase = "2.7"
|
unicase = "2.7"
|
||||||
phf = { version = "0.11", features = ["unicase"], optional = true }
|
phf = { version = "0.11", features = ["unicase"], optional = true }
|
||||||
phf_shared = { version = "0.11", optional = true }
|
phf_shared = { version = "0.11", optional = true }
|
||||||
phf_codegen = { version = "0.11", optional = true }
|
phf_codegen = { version = "0.11", optional = true }
|
||||||
|
aho-corasick = { version = "1.1.3", optional = true }
|
||||||
|
|
||||||
[lints]
|
[lints]
|
||||||
workspace = true
|
workspace = true
|
||||||
|
|
112
crates/dictgen/src/aho_corasick.rs
Normal file
112
crates/dictgen/src/aho_corasick.rs
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
pub use ::aho_corasick::automaton::Automaton;
|
||||||
|
pub use ::aho_corasick::dfa::Builder;
|
||||||
|
pub use ::aho_corasick::dfa::DFA;
|
||||||
|
pub use ::aho_corasick::Anchored;
|
||||||
|
pub use ::aho_corasick::Input;
|
||||||
|
pub use ::aho_corasick::MatchKind;
|
||||||
|
pub use ::aho_corasick::StartKind;
|
||||||
|
|
||||||
|
/// Code generator that emits a dictionary type backed by an anchored
/// aho-corasick DFA.
///
/// `gen` carries the name of the emitted type and the Rust type of its values.
#[cfg(feature = "codegen")]
pub struct AhoCorasickGen<'g> {
    pub(crate) gen: crate::DictGen<'g>,
}

#[cfg(feature = "codegen")]
impl AhoCorasickGen<'_> {
    /// Writes the Rust source of a struct named `self.gen.name` to `file`.
    ///
    /// The emitted struct has two members:
    /// - `new()`, which builds a case-insensitive, anchored, leftmost-longest DFA
    ///   over every ASCII key and embeds an ordered map (`UNICODE_TABLE`) holding
    ///   every non-ASCII key;
    /// - `find(&self, word)`, which resolves ASCII words through the DFA and all
    ///   other words through the ordered map.
    ///
    /// `data` yields `(key, value)` pairs; each `value` is written verbatim (via
    /// `Display`) as a Rust expression of type `self.gen.value_type`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported while writing to `file`.
    pub fn write<W: std::io::Write, V: std::fmt::Display>(
        &self,
        file: &mut W,
        data: impl Iterator<Item = (impl AsRef<str>, V)>,
    ) -> Result<(), std::io::Error> {
        let mut data: Vec<_> = data.collect();
        // Order keys case-insensitively. Comparing borrowed keys avoids allocating a
        // `String` for every comparison the sort performs.
        data.sort_unstable_by(|lhs, rhs| {
            unicase::UniCase::new(lhs.0.as_ref()).cmp(&unicase::UniCase::new(rhs.0.as_ref()))
        });

        let name = self.gen.name;
        let value_type = self.gen.value_type;

        writeln!(file, "pub struct {name} {{")?;
        writeln!(file, " dfa: dictgen::aho_corasick::DFA,")?;
        writeln!(file, " unicode: &'static dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, {value_type}>,")?;
        writeln!(file, "}}")?;
        writeln!(file)?;
        writeln!(file, "impl {name} {{")?;
        writeln!(file, " pub fn new() -> Self {{")?;
        writeln!(
            file,
            " static NEEDLES: &'static [&'static [u8]] = &["
        )?;
        // Only ASCII keys become DFA needles; `PATTERNID_MAP` below walks the same
        // filtered sequence so that needle `i` lines up with value `i`.
        for (key, _value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
            let key = key.as_ref();
            writeln!(file, " b{key:?},")?;
        }
        writeln!(file, " ];")?;
        writeln!(
            file,
            " let dfa = dictgen::aho_corasick::Builder::new()"
        )?;
        writeln!(
            file,
            " .match_kind(dictgen::aho_corasick::MatchKind::LeftmostLongest)"
        )?;
        writeln!(
            file,
            " .start_kind(dictgen::aho_corasick::StartKind::Anchored)"
        )?;
        writeln!(file, " .ascii_case_insensitive(true)")?;
        writeln!(file, " .build(NEEDLES)")?;
        writeln!(file, " .unwrap();")?;
        // Non-ASCII keys are emitted as an ordered map inside `new()`.
        crate::DictGen::new()
            .name("UNICODE_TABLE")
            .value_type(value_type)
            .ordered_map()
            .write(
                file,
                data.iter()
                    .filter(|(k, _)| !k.as_ref().is_ascii())
                    .map(|(k, v)| (k.as_ref(), v)),
            )?;
        writeln!(file)?;
        writeln!(file, " Self {{")?;
        writeln!(file, " dfa,")?;
        writeln!(file, " unicode: &UNICODE_TABLE,")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file)?;
        writeln!(
            file,
            " pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static {value_type}> {{"
        )?;
        writeln!(
            file,
            " static PATTERNID_MAP: &'static [{value_type}] = &["
        )?;
        for (_key, value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
            writeln!(file, " {value},")?;
        }
        writeln!(file, " ];")?;
        writeln!(file, " if word.is_ascii() {{")?;
        writeln!(
            file,
            " use dictgen::aho_corasick::Automaton as _;"
        )?;
        writeln!(file, " let input = dictgen::aho_corasick::Input::new(word.into_inner().as_bytes()).anchored(dictgen::aho_corasick::Anchored::Yes);")?;
        writeln!(
            file,
            " let mat = self.dfa.try_find(&input).unwrap()?;"
        )?;
        // An anchored match may stop before the end of `word` (a needle that is only
        // a prefix of it). Only a match that consumes the whole word is a dictionary
        // hit, so anything shorter must be rejected.
        writeln!(
            file,
            " if mat.end() != word.into_inner().len() {{"
        )?;
        writeln!(file, " return None;")?;
        writeln!(file, " }}")?;
        writeln!(file, " Some(&PATTERNID_MAP[mat.pattern()])")?;
        writeln!(file, " }} else {{")?;
        writeln!(file, " self.unicode.find(word)")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file, "}}")?;

        Ok(())
    }
}
|
|
@ -61,6 +61,11 @@ impl<'g> DictGen<'g> {
|
||||||
pub fn r#match(self) -> crate::MatchGen<'g> {
|
pub fn r#match(self) -> crate::MatchGen<'g> {
|
||||||
crate::MatchGen { gen: self }
|
crate::MatchGen { gen: self }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Consumes this generator and wraps it in a `crate::AhoCorasickGen`, which
/// takes over the configured settings.
#[cfg(feature = "aho-corasick")]
pub fn aho_corasick(self) -> crate::AhoCorasickGen<'g> {
    let gen = self;
    crate::AhoCorasickGen { gen }
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for DictGen<'static> {
|
impl Default for DictGen<'static> {
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
#![warn(clippy::print_stderr)]
|
#![warn(clippy::print_stderr)]
|
||||||
#![warn(clippy::print_stdout)]
|
#![warn(clippy::print_stdout)]
|
||||||
|
|
||||||
|
#[cfg(feature = "aho-corasick")]
|
||||||
|
pub mod aho_corasick;
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
mod gen;
|
mod gen;
|
||||||
mod insensitive;
|
mod insensitive;
|
||||||
|
@ -12,6 +14,9 @@ mod r#match;
|
||||||
mod ordered_map;
|
mod ordered_map;
|
||||||
mod trie;
|
mod trie;
|
||||||
|
|
||||||
|
#[cfg(feature = "aho-corasick")]
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
|
pub use aho_corasick::AhoCorasickGen;
|
||||||
#[cfg(feature = "codegen")]
|
#[cfg(feature = "codegen")]
|
||||||
pub use gen::*;
|
pub use gen::*;
|
||||||
pub use insensitive::*;
|
pub use insensitive::*;
|
||||||
|
|
|
@ -25,7 +25,7 @@ itertools = "0.13"
|
||||||
edit-distance = "2.1"
|
edit-distance = "2.1"
|
||||||
unicase = "2.7"
|
unicase = "2.7"
|
||||||
codegenrs = "3.0"
|
codegenrs = "3.0"
|
||||||
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map"] }
|
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map", "aho-corasick"] }
|
||||||
varcon = { version = "^1.0", path = "../varcon" }
|
varcon = { version = "^1.0", path = "../varcon" }
|
||||||
snapbox = "0.6.5"
|
snapbox = "0.6.5"
|
||||||
indexmap = "2.2.6"
|
indexmap = "2.2.6"
|
||||||
|
|
138084
crates/typos-dict/benches/benches/aho_corasick_codegen.rs
Normal file
138084
crates/typos-dict/benches/benches/aho_corasick_codegen.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,11 +1,24 @@
|
||||||
#![allow(clippy::wildcard_imports)]
|
#![allow(clippy::wildcard_imports)]
|
||||||
#![allow(dead_code)]
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
mod aho_corasick_codegen;
|
||||||
mod cased_map_codegen;
|
mod cased_map_codegen;
|
||||||
mod map_codegen;
|
mod map_codegen;
|
||||||
mod ordered_map_codegen;
|
mod ordered_map_codegen;
|
||||||
mod trie_codegen;
|
mod trie_codegen;
|
||||||
|
|
||||||
|
static AHO_CORASICK: std::sync::LazyLock<aho_corasick_codegen::Word> =
|
||||||
|
std::sync::LazyLock::new(aho_corasick_codegen::Word::new);
|
||||||
|
|
||||||
|
mod new {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[divan::bench]
|
||||||
|
fn aho_corasick() -> aho_corasick_codegen::Word {
|
||||||
|
aho_corasick_codegen::Word::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
mod miss {
|
mod miss {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
@ -30,6 +43,11 @@ mod miss {
|
||||||
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
ordered_map_codegen::WORD.find(&word)
|
ordered_map_codegen::WORD.find(&word)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
|
||||||
|
fn aho_corasick(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
|
AHO_CORASICK.find(&word)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mod hit {
|
mod hit {
|
||||||
|
@ -56,6 +74,11 @@ mod hit {
|
||||||
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
fn ordered_map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
ordered_map_codegen::WORD.find(&word)
|
ordered_map_codegen::WORD.find(&word)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
|
||||||
|
fn aho_corasick(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
|
AHO_CORASICK.find(&word)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
|
|
|
@ -38,6 +38,15 @@ fn codegen() {
|
||||||
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
|
snapbox::file!["../benches/benches/ordered_map_codegen.rs"].raw()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut aho_corasick_content = vec![];
|
||||||
|
generate_aho_corasick(&mut aho_corasick_content, "Word", DICT);
|
||||||
|
let aho_corasick_content = String::from_utf8(aho_corasick_content).unwrap();
|
||||||
|
let aho_corasick_content = codegenrs::rustfmt(&aho_corasick_content, None).unwrap();
|
||||||
|
snapbox::assert_data_eq!(
|
||||||
|
&aho_corasick_content,
|
||||||
|
snapbox::file!["../benches/benches/aho_corasick_codegen.rs"].raw()
|
||||||
|
);
|
||||||
|
|
||||||
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
|
snapbox::assert_data_eq!(&map_content, snapbox::file!["../src/word_codegen.rs"].raw());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -256,3 +265,41 @@ fn generate_ordered_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn generate_aho_corasick<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"// This file is @generated by {}",
|
||||||
|
file!().replace('\\', "/")
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
|
||||||
|
writeln!(file, "#![allow(clippy::redundant_static_lifetimes)]",).unwrap();
|
||||||
|
writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
|
||||||
|
writeln!(file).unwrap();
|
||||||
|
|
||||||
|
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||||
|
.has_headers(false)
|
||||||
|
.flexible(true)
|
||||||
|
.from_reader(dict)
|
||||||
|
.records()
|
||||||
|
.map(|r| r.unwrap())
|
||||||
|
.collect();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(name)
|
||||||
|
.value_type("&'static [&'static str]")
|
||||||
|
.aho_corasick()
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records.iter().map(|record| {
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue