perf(dict): Benchmark cased maps

This commit is contained in:
Ed Page 2024-12-30 16:48:19 -06:00
parent 084461743a
commit b6352341f9
7 changed files with 249005 additions and 8 deletions

1
Cargo.lock generated
View file

@ -1509,6 +1509,7 @@ dependencies = [
"dictgen", "dictgen",
"divan", "divan",
"edit-distance", "edit-distance",
"heck",
"indexmap", "indexmap",
"itertools 0.13.0", "itertools 0.13.0",
"phf", "phf",

View file

@ -17,13 +17,13 @@ impl MapGen<'_> {
self self
} }
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>( pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self, &self,
file: &mut W, file: &mut W,
data: impl Iterator<Item = (&'d str, V)>, data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect(); let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0)); data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name; let name = self.gen.name;
let key_type = self.key_type(); let key_type = self.key_type();
@ -32,6 +32,7 @@ impl MapGen<'_> {
let mut smallest = usize::MAX; let mut smallest = usize::MAX;
let mut largest = usize::MIN; let mut largest = usize::MIN;
for (key, _) in data.iter() { for (key, _) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len()); smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len()); largest = std::cmp::max(largest, key.len());
} }
@ -50,6 +51,7 @@ impl MapGen<'_> {
let data = data let data = data
.iter() .iter()
.map(|(key, value)| { .map(|(key, value)| {
let key = key.as_ref();
( (
if key.is_ascii() { if key.is_ascii() {
crate::InsensitiveStr::Ascii(key) crate::InsensitiveStr::Ascii(key)
@ -70,7 +72,7 @@ impl MapGen<'_> {
let mut builder = phf_codegen::Map::new(); let mut builder = phf_codegen::Map::new();
let data = data let data = data
.iter() .iter()
.map(|(key, value)| (crate::InsensitiveAscii(key), value.to_string())) .map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string()))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
for (key, value) in data.iter() { for (key, value) in data.iter() {
builder.entry(key, value.as_str()); builder.entry(key, value.as_str());
@ -85,7 +87,7 @@ impl MapGen<'_> {
.map(|(key, value)| (key, value.to_string())) .map(|(key, value)| (key, value.to_string()))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
for (key, value) in data.iter() { for (key, value) in data.iter() {
builder.entry(key, value.as_str()); builder.entry(key.as_ref(), value.as_str());
} }
let builder = builder.build(); let builder = builder.build();
writeln!(file, " map: {builder},")?; writeln!(file, " map: {builder},")?;

View file

@ -17,13 +17,13 @@ impl OrderedMapGen<'_> {
self self
} }
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>( pub fn write<W: std::io::Write, V: std::fmt::Display>(
&self, &self,
file: &mut W, file: &mut W,
data: impl Iterator<Item = (&'d str, V)>, data: impl Iterator<Item = (impl AsRef<str>, V)>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect(); let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0)); data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned()));
let name = self.gen.name; let name = self.gen.name;
let key_type = self.key_type(); let key_type = self.key_type();
@ -38,6 +38,7 @@ impl OrderedMapGen<'_> {
)?; )?;
writeln!(file, " keys: &[")?; writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() { for (key, _value) in data.iter() {
let key = key.as_ref();
smallest = std::cmp::min(smallest, key.len()); smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len()); largest = std::cmp::max(largest, key.len());

View file

@ -30,6 +30,7 @@ snapbox = "0.6.5"
indexmap = "2.2.6" indexmap = "2.2.6"
divan = "0.1.16" divan = "0.1.16"
phf = "0.11.2" phf = "0.11.2"
heck = "0.5.0"
[lints] [lints]
workspace = true workspace = true

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,7 @@
#![allow(clippy::wildcard_imports)] #![allow(clippy::wildcard_imports)]
#![allow(dead_code)]
mod cased_map_codegen;
mod map_codegen; mod map_codegen;
mod ordered_map_codegen; mod ordered_map_codegen;
mod trie_codegen; mod trie_codegen;
@ -9,6 +11,11 @@ mod miss {
const MISS: &str = "finalizes"; const MISS: &str = "finalizes";
/// Benchmark: looks up `word` in the generated `WORD_ASCII_LOWER` map.
///
/// divan passes `MISS` as `word` via the `args` list on the attribute.
#[divan::bench(args = [MISS])]
fn cased_map(word: &str) -> Option<&'static &[&str]> {
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
}
#[divan::bench(args = [unicase::UniCase::new(MISS)])] #[divan::bench(args = [unicase::UniCase::new(MISS)])]
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> { fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
map_codegen::WORD.find(&word) map_codegen::WORD.find(&word)
@ -30,6 +37,11 @@ mod hit {
const HIT: &str = "finallizes"; const HIT: &str = "finallizes";
/// Benchmark: looks up `word` in the generated `WORD_ASCII_LOWER` map.
///
/// divan passes `HIT` as `word` via the `args` list on the attribute.
#[divan::bench(args = [HIT])]
fn cased_map(word: &str) -> Option<&'static &[&str]> {
cased_map_codegen::WORD_ASCII_LOWER.find(&word)
}
#[divan::bench(args = [unicase::UniCase::new(HIT)])] #[divan::bench(args = [unicase::UniCase::new(HIT)])]
fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> { fn map(word: unicase::UniCase<&str>) -> Option<&'static &[&str]> {
map_codegen::WORD.find(&word) map_codegen::WORD.find(&word)

View file

@ -20,6 +20,15 @@ fn codegen() {
snapbox::file!["../benches/benches/map_codegen.rs"].raw() snapbox::file!["../benches/benches/map_codegen.rs"].raw()
); );
let mut cased_map_content = vec![];
generate_cased_map(&mut cased_map_content, "WORD", DICT);
let cased_map_content = String::from_utf8(cased_map_content).unwrap();
let cased_map_content = codegenrs::rustfmt(&cased_map_content, None).unwrap();
snapbox::assert_data_eq!(
&cased_map_content,
snapbox::file!["../benches/benches/cased_map_codegen.rs"].raw()
);
let mut ordered_map_content = vec![]; let mut ordered_map_content = vec![];
generate_ordered_map(&mut ordered_map_content, "WORD", DICT); generate_ordered_map(&mut ordered_map_content, "WORD", DICT);
let ordered_map_content = String::from_utf8(ordered_map_content).unwrap(); let ordered_map_content = String::from_utf8(ordered_map_content).unwrap();
@ -72,6 +81,111 @@ fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
.unwrap(); .unwrap();
} }
/// Writes the "cased map" benchmark fixture to `file`.
///
/// `dict` is parsed as header-less, flexible-width CSV. The first field of each record is the
/// key; the remaining fields are emitted as a `&[&str]` slice literal. Four tables are
/// generated, each using `name` as a prefix:
///
/// - `{name}_ASCII_LOWER`: ASCII keys exactly as they appear in `dict`
///   (presumably already lowercase; confirm against the dictionary)
/// - `{name}_ASCII_UPPER`: ASCII keys passed through `heck::ToShoutySnakeCase`
/// - `{name}_ASCII_TITLE`: ASCII keys passed through `heck::ToTitleCase`
/// - `{name}_UNICODE`: the non-ASCII keys, written as an ordered map
///
/// # Panics
///
/// Panics if writing to `file` fails, if a CSV record cannot be read, or if a record has no
/// first field.
fn generate_cased_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
    // Returns `true` when the record's first field (the key) is pure ASCII.
    fn has_ascii_key(record: &csv::StringRecord) -> bool {
        record.iter().next().unwrap().is_ascii()
    }

    // Renders the given fields as the source text of a Rust `&[&str]` slice literal.
    fn render_values<'r>(fields: impl Iterator<Item = &'r str>) -> String {
        format!(
            "&[{}]",
            itertools::join(fields.map(|field| format!(r#""{field}""#)), ", ")
        )
    }

    writeln!(
        file,
        "// This file is @generated by {}",
        // Normalize Windows path separators so the header is identical on every platform.
        file!().replace('\\', "/")
    )
    .unwrap();
    writeln!(file, "#![allow(clippy::unreadable_literal)]",).unwrap();
    writeln!(file, "#![allow(unreachable_pub)]",).unwrap();
    writeln!(file).unwrap();

    let records: Vec<_> = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(dict)
        .records()
        .map(|r| r.unwrap())
        .collect();

    // ASCII keys, unchanged.
    dictgen::DictGen::new()
        .name(&format!("{name}_ASCII_LOWER"))
        .value_type("&[&str]")
        .map()
        .unicase(false)
        .write(
            file,
            records
                .iter()
                .filter(|record| has_ascii_key(record))
                .map(|record| {
                    let mut fields = record.iter();
                    let key = fields.next().unwrap();
                    (key, render_values(fields))
                }),
        )
        .unwrap();

    // ASCII keys, upper-cased.
    // NOTE(review): heck's case converters also split on word boundaries, so a key containing
    // separators may be rewritten beyond a pure case change — confirm this is acceptable for
    // the benchmark data.
    dictgen::DictGen::new()
        .name(&format!("{name}_ASCII_UPPER"))
        .value_type("&[&str]")
        .map()
        .unicase(false)
        .write(
            file,
            records
                .iter()
                .filter(|record| has_ascii_key(record))
                .map(|record| {
                    use heck::ToShoutySnakeCase;
                    let mut fields = record.iter();
                    let key = fields.next().unwrap().to_shouty_snake_case();
                    (key, render_values(fields))
                }),
        )
        .unwrap();

    // ASCII keys, title-cased (same heck caveat as above).
    dictgen::DictGen::new()
        .name(&format!("{name}_ASCII_TITLE"))
        .value_type("&[&str]")
        .map()
        .unicase(false)
        .write(
            file,
            records
                .iter()
                .filter(|record| has_ascii_key(record))
                .map(|record| {
                    use heck::ToTitleCase;
                    let mut fields = record.iter();
                    let key = fields.next().unwrap().to_title_case();
                    (key, render_values(fields))
                }),
        )
        .unwrap();

    // Everything that is not pure ASCII goes into a separate ordered map.
    dictgen::DictGen::new()
        .name(&format!("{name}_UNICODE"))
        .value_type("&[&str]")
        .ordered_map()
        .write(
            file,
            records
                .iter()
                .filter(|record| !has_ascii_key(record))
                .map(|record| {
                    let mut fields = record.iter();
                    let key = fields.next().unwrap();
                    (key, render_values(fields))
                }),
        )
        .unwrap();
}
fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) { fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!( writeln!(
file, file,