mirror of
https://github.com/crate-ci/typos.git
synced 2025-01-25 16:09:03 -05:00
perf(dict): Benchmark cased maps
This commit is contained in:
parent
084461743a
commit
b6352341f9
7 changed files with 249005 additions and 8 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1509,6 +1509,7 @@ dependencies = [
|
||||||
"dictgen",
|
"dictgen",
|
||||||
"divan",
|
"divan",
|
||||||
"edit-distance",
|
"edit-distance",
|
||||||
|
"heck",
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"itertools 0.13.0",
|
"itertools 0.13.0",
|
||||||
"phf",
|
"phf",
|
||||||
|
|
|
@ -17,13 +17,13 @@ impl MapGen<'_> {
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
|
pub fn write<W: std::io::Write, V: std::fmt::Display>(
|
||||||
&self,
|
&self,
|
||||||
file: &mut W,
|
file: &mut W,
|
||||||
data: impl Iterator<Item = (&'d str, V)>,
|
data: impl Iterator<Item = (impl AsRef<str>, V)>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
let mut data: Vec<_> = data.collect();
|
let mut data: Vec<_> = data.collect();
|
||||||
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
|
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
|
||||||
|
|
||||||
let name = self.gen.name;
|
let name = self.gen.name;
|
||||||
let key_type = self.key_type();
|
let key_type = self.key_type();
|
||||||
|
@ -32,6 +32,7 @@ impl MapGen<'_> {
|
||||||
let mut smallest = usize::MAX;
|
let mut smallest = usize::MAX;
|
||||||
let mut largest = usize::MIN;
|
let mut largest = usize::MIN;
|
||||||
for (key, _) in data.iter() {
|
for (key, _) in data.iter() {
|
||||||
|
let key = key.as_ref();
|
||||||
smallest = std::cmp::min(smallest, key.len());
|
smallest = std::cmp::min(smallest, key.len());
|
||||||
largest = std::cmp::max(largest, key.len());
|
largest = std::cmp::max(largest, key.len());
|
||||||
}
|
}
|
||||||
|
@ -50,6 +51,7 @@ impl MapGen<'_> {
|
||||||
let data = data
|
let data = data
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(key, value)| {
|
.map(|(key, value)| {
|
||||||
|
let key = key.as_ref();
|
||||||
(
|
(
|
||||||
if key.is_ascii() {
|
if key.is_ascii() {
|
||||||
crate::InsensitiveStr::Ascii(key)
|
crate::InsensitiveStr::Ascii(key)
|
||||||
|
@ -70,7 +72,7 @@ impl MapGen<'_> {
|
||||||
let mut builder = phf_codegen::Map::new();
|
let mut builder = phf_codegen::Map::new();
|
||||||
let data = data
|
let data = data
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(key, value)| (crate::InsensitiveAscii(key), value.to_string()))
|
.map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string()))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
for (key, value) in data.iter() {
|
for (key, value) in data.iter() {
|
||||||
builder.entry(key, value.as_str());
|
builder.entry(key, value.as_str());
|
||||||
|
@ -85,7 +87,7 @@ impl MapGen<'_> {
|
||||||
.map(|(key, value)| (key, value.to_string()))
|
.map(|(key, value)| (key, value.to_string()))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
for (key, value) in data.iter() {
|
for (key, value) in data.iter() {
|
||||||
builder.entry(key, value.as_str());
|
builder.entry(key.as_ref(), value.as_str());
|
||||||
}
|
}
|
||||||
let builder = builder.build();
|
let builder = builder.build();
|
||||||
writeln!(file, " map: {builder},")?;
|
writeln!(file, " map: {builder},")?;
|
||||||
|
|
|
@ -17,13 +17,13 @@ impl OrderedMapGen<'_> {
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
|
pub fn write<W: std::io::Write, V: std::fmt::Display>(
|
||||||
&self,
|
&self,
|
||||||
file: &mut W,
|
file: &mut W,
|
||||||
data: impl Iterator<Item = (&'d str, V)>,
|
data: impl Iterator<Item = (impl AsRef<str>, V)>,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
let mut data: Vec<_> = data.collect();
|
let mut data: Vec<_> = data.collect();
|
||||||
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
|
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
|
||||||
|
|
||||||
let name = self.gen.name;
|
let name = self.gen.name;
|
||||||
let key_type = self.key_type();
|
let key_type = self.key_type();
|
||||||
|
@ -38,6 +38,7 @@ impl OrderedMapGen<'_> {
|
||||||
)?;
|
)?;
|
||||||
writeln!(file, " keys: &[")?;
|
writeln!(file, " keys: &[")?;
|
||||||
for (key, _value) in data.iter() {
|
for (key, _value) in data.iter() {
|
||||||
|
let key = key.as_ref();
|
||||||
smallest = std::cmp::min(smallest, key.len());
|
smallest = std::cmp::min(smallest, key.len());
|
||||||
largest = std::cmp::max(largest, key.len());
|
largest = std::cmp::max(largest, key.len());
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ snapbox = "0.6.5"
|
||||||
indexmap = "2.2.6"
|
indexmap = "2.2.6"
|
||||||
divan = "0.1.16"
|
divan = "0.1.16"
|
||||||
phf = "0.11.2"
|
phf = "0.11.2"
|
||||||
|
heck = "0.5.0"
|
||||||
|
|
||||||
[lints]
|
[lints]
|
||||||
workspace = true
|
workspace = true
|
||||||
|
|
248866
crates/typos-dict/benches/benches/cased_map_codegen.rs
Normal file
248866
crates/typos-dict/benches/benches/cased_map_codegen.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,7 @@
|
||||||
#![allow(clippy::wildcard_imports)]
|
#![allow(clippy::wildcard_imports)]
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
mod cased_map_codegen;
|
||||||
mod map_codegen;
|
mod map_codegen;
|
||||||
mod ordered_map_codegen;
|
mod ordered_map_codegen;
|
||||||
mod trie_codegen;
|
mod trie_codegen;
|
||||||
|
@ -9,6 +11,11 @@ mod miss {
|
||||||
|
|
||||||
const MISS: &str = "finalizes";
|
const MISS: &str = "finalizes";
|
||||||
|
|
||||||
|
#[divan::bench(args = [MISS])]
|
||||||
|
fn cased_map(word: &str) -> Option<&'static &[&str]> {
|
||||||
|
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
|
||||||
|
}
|
||||||
|
|
||||||
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
|
#[divan::bench(args = [unicase::UniCase::new(MISS)])]
|
||||||
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
map_codegen::WORD.find(&word)
|
map_codegen::WORD.find(&word)
|
||||||
|
@ -30,6 +37,11 @@ mod hit {
|
||||||
|
|
||||||
const HIT: &str = "finallizes";
|
const HIT: &str = "finallizes";
|
||||||
|
|
||||||
|
#[divan::bench(args = [HIT])]
|
||||||
|
fn cased_map(word: &str) -> Option<&'static &[&str]> {
|
||||||
|
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
|
||||||
|
}
|
||||||
|
|
||||||
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
|
#[divan::bench(args = [unicase::UniCase::new(HIT)])]
|
||||||
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
|
||||||
map_codegen::WORD.find(&word)
|
map_codegen::WORD.find(&word)
|
||||||
|
|
|
@ -20,6 +20,15 @@ fn codegen() {
|
||||||
snapbox::file!["../benches/benches/map_codegen.rs"].raw()
|
snapbox::file!["../benches/benches/map_codegen.rs"].raw()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut cased_map_content = vec![];
|
||||||
|
generate_cased_map(&mut cased_map_content, "WORD", DICT);
|
||||||
|
let cased_map_content = String::from_utf8(cased_map_content).unwrap();
|
||||||
|
let cased_map_content = codegenrs::rustfmt(&cased_map_content, None).unwrap();
|
||||||
|
snapbox::assert_data_eq!(
|
||||||
|
&cased_map_content,
|
||||||
|
snapbox::file!["../benches/benches/cased_map_codegen.rs"].raw()
|
||||||
|
);
|
||||||
|
|
||||||
let mut ordered_map_content = vec![];
|
let mut ordered_map_content = vec![];
|
||||||
generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
|
generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
|
||||||
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
|
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
|
||||||
|
@ -72,6 +81,111 @@ fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn generate_cased_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"// This file is @generated by {}",
|
||||||
|
file!().replace('\\', "/")
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
|
||||||
|
writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
|
||||||
|
writeln!(file).unwrap();
|
||||||
|
|
||||||
|
let records: Vec<_> = csv::ReaderBuilder::new()
|
||||||
|
.has_headers(false)
|
||||||
|
.flexible(true)
|
||||||
|
.from_reader(dict)
|
||||||
|
.records()
|
||||||
|
.map(|r| r.unwrap())
|
||||||
|
.collect();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_ASCII_LOWER"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.map()
|
||||||
|
.unicase(false)
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_ASCII_UPPER"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.map()
|
||||||
|
.unicase(false)
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
use heck::ToShoutySnakeCase;
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap().to_shouty_snake_case();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_ASCII_TITLE"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.map()
|
||||||
|
.unicase(false)
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
use heck::ToTitleCase;
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap().to_title_case();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
dictgen::DictGen::new()
|
||||||
|
.name(&format!("{name}_UNICODE"))
|
||||||
|
.value_type("&[&str]")
|
||||||
|
.ordered_map()
|
||||||
|
.write(
|
||||||
|
file,
|
||||||
|
records
|
||||||
|
.iter()
|
||||||
|
.filter(|r| !r.iter().next().unwrap().is_ascii())
|
||||||
|
.map(|record| {
|
||||||
|
let mut record_fields = record.iter();
|
||||||
|
let key = record_fields.next().unwrap();
|
||||||
|
let value = format!(
|
||||||
|
"&[{}]",
|
||||||
|
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
|
||||||
|
);
|
||||||
|
(key, value)
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
|
|
Loading…
Add table
Reference in a new issue