fix(dictgen)!: Use a builder for easier customization

This commit is contained in:
Ed Page 2024-12-28 20:58:19 -06:00
parent 910f3c8af0
commit 52822b019f
15 changed files with 72998 additions and 73360 deletions

View file

@ -51,13 +51,12 @@ fn generate<W: std::io::Write>(file: &mut W) {
let dict = parse_dict(DICT); let dict = parse_dict(DICT);
dictgen::generate_table( dictgen::DictGen::new()
file, .name("WORD_DICTIONARY")
"WORD_DICTIONARY", .value_type("&[&str]")
"&[&str]", .table()
dict.map(|kv| (kv.0, format!("&{:?}", kv.1))), .write(file, dict.map(|kv| (kv.0, format!("&{:?}", kv.1))))
) .unwrap();
.unwrap();
} }
fn parse_dict(raw: &str) -> impl Iterator<Item = (&str, Vec<&str>)> { fn parse_dict(raw: &str) -> impl Iterator<Item = (&str, Vec<&str>)> {

58
crates/dictgen/src/gen.rs Normal file
View file

@ -0,0 +1,58 @@
/// Builder holding the settings shared by the dictionary code generators.
///
/// Configure it with `name` and `value_type`, then pick an output flavor with
/// `table()`, `trie()`, or (behind the `map` feature) `map()`.
#[cfg(feature = "codegen")]
pub struct DictGen<'g> {
/// Identifier interpolated into the generated `pub static` declaration.
pub(crate) name: &'g str,
/// Rust type of the dictionary values, interpolated verbatim into the
/// generated item's type.
pub(crate) value_type: &'g str,
}
impl DictGen<'static> {
pub fn new() -> Self {
Self {
name: "DICT",
value_type: "&'static str",
}
}
}
impl<'g> DictGen<'g> {
    /// Replaces the name of the generated item, keeping the value type.
    ///
    /// The returned builder borrows for `'n`, which the existing borrows
    /// must outlive (`'g: 'n`).
    pub fn name<'n>(self, name: &'n str) -> DictGen<'n>
    where
        'g: 'n,
    {
        let Self { value_type, .. } = self;
        DictGen { name, value_type }
    }

    /// Replaces the value type of the generated item, keeping the name.
    ///
    /// The returned builder borrows for `'t`, which the existing borrows
    /// must outlive (`'g: 't`).
    pub fn value_type<'t>(self, value_type: &'t str) -> DictGen<'t>
    where
        'g: 't,
    {
        let Self { name, .. } = self;
        DictGen { name, value_type }
    }

    /// Finishes configuration and selects the map generator.
    #[cfg(feature = "map")]
    pub fn map(self) -> crate::DictMapGen<'g> {
        let settings = self;
        crate::DictMapGen { gen: settings }
    }

    /// Finishes configuration and selects the table generator.
    pub fn table(self) -> crate::DictTableGen<'g> {
        let settings = self;
        crate::DictTableGen { gen: settings }
    }

    /// Finishes configuration and selects the trie generator.
    ///
    /// The trie generator starts with a `limit` of 64.
    pub fn trie(self) -> crate::DictTrieGen<'g> {
        // Initial value for the trie generator's `limit` setting.
        const DEFAULT_LIMIT: usize = 64;

        let settings = self;
        crate::DictTrieGen {
            gen: settings,
            limit: DEFAULT_LIMIT,
        }
    }
}
impl Default for DictGen<'static> {
fn default() -> Self {
Self::new()
}
}

View file

@ -2,11 +2,15 @@
#![warn(clippy::print_stderr)] #![warn(clippy::print_stderr)]
#![warn(clippy::print_stdout)] #![warn(clippy::print_stdout)]
#[cfg(feature = "codegen")]
mod gen;
#[cfg(feature = "map")] #[cfg(feature = "map")]
mod map; mod map;
mod table; mod table;
mod trie; mod trie;
#[cfg(feature = "codegen")]
pub use gen::*;
#[cfg(feature = "map")] #[cfg(feature = "map")]
pub use map::*; pub use map::*;
pub use table::*; pub use table::*;

View file

@ -1,46 +1,55 @@
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub fn generate_map<'d, W: std::io::Write, V: std::fmt::Display>( pub struct DictMapGen<'g> {
file: &mut W, pub(crate) gen: crate::DictGen<'g>,
name: &str, }
value_type: &str,
data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
let mut smallest = usize::MAX; #[cfg(feature = "codegen")]
let mut largest = usize::MIN; impl DictMapGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
writeln!( let name = self.gen.name;
file, let value_type = self.gen.value_type;
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() { let mut smallest = usize::MAX;
format!("dictgen::InsensitiveStr::Ascii({key:?})") let mut largest = usize::MIN;
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?; writeln!(
file,
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?;
}
if largest == 0 {
smallest = 0;
}
writeln!(file, " ],")?;
writeln!(file, " values: &[")?;
for (_key, value) in data.iter() {
writeln!(file, " {value},")?;
}
writeln!(file, " ],")?;
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
Ok(())
} }
if largest == 0 {
smallest = 0;
}
writeln!(file, " ],")?;
writeln!(file, " values: &[")?;
for (_key, value) in data.iter() {
writeln!(file, " {value},")?;
}
writeln!(file, " ],")?;
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
Ok(())
} }
pub struct DictMap<V: 'static> { pub struct DictMap<V: 'static> {

View file

@ -1,46 +1,55 @@
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( pub struct DictTableGen<'g> {
file: &mut W, pub(crate) gen: crate::DictGen<'g>,
name: &str, }
value_type: &str,
data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
let mut smallest = usize::MAX; #[cfg(feature = "codegen")]
let mut largest = usize::MIN; impl DictTableGen<'_> {
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
let mut data: Vec<_> = data.collect();
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0));
writeln!( let name = self.gen.name;
file, let value_type = self.gen.value_type;
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() { let mut smallest = usize::MAX;
format!("dictgen::InsensitiveStr::Ascii({key:?})") let mut largest = usize::MIN;
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?; writeln!(
file,
"pub static {name}: dictgen::DictTable<{value_type}> = dictgen::DictTable {{"
)?;
writeln!(file, " keys: &[")?;
for (key, _value) in data.iter() {
smallest = std::cmp::min(smallest, key.len());
largest = std::cmp::max(largest, key.len());
let key = if key.is_ascii() {
format!("dictgen::InsensitiveStr::Ascii({key:?})")
} else {
format!("dictgen::InsensitiveStr::Unicode({key:?})")
};
writeln!(file, " {key},")?;
}
if largest == 0 {
smallest = 0;
}
writeln!(file, " ],")?;
writeln!(file, " values: &[")?;
for (_key, value) in data.iter() {
writeln!(file, " {value},")?;
}
writeln!(file, " ],")?;
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
Ok(())
} }
if largest == 0 {
smallest = 0;
}
writeln!(file, " ],")?;
writeln!(file, " values: &[")?;
for (_key, value) in data.iter() {
writeln!(file, " {value},")?;
}
writeln!(file, " ],")?;
writeln!(file, " range: {smallest}..={largest},")?;
writeln!(file, "}};")?;
Ok(())
} }
pub struct DictTable<V: 'static> { pub struct DictTable<V: 'static> {

View file

@ -1,15 +1,28 @@
/// # Panics
///
/// - On duplicate entry
#[cfg(feature = "codegen")] #[cfg(feature = "codegen")]
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( pub struct DictTrieGen<'g> {
file: &mut W, pub(crate) gen: crate::DictGen<'g>,
prefix: &str, pub(crate) limit: usize,
value_type: &str, }
data: impl Iterator<Item = (&'d str, V)>,
limit: usize, #[cfg(feature = "codegen")]
) -> Result<(), std::io::Error> { impl DictTrieGen<'_> {
codegen::generate_trie(file, prefix, value_type, data, limit) pub fn limit(mut self, limit: usize) -> Self {
self.limit = limit;
self
}
/// # Panics
///
/// - On duplicate entry
pub fn write<'d, W: std::io::Write, V: std::fmt::Display>(
&self,
file: &mut W,
data: impl Iterator<Item = (&'d str, V)>,
) -> Result<(), std::io::Error> {
let name = self.gen.name;
let value_type = self.gen.value_type;
codegen::generate_trie(file, name, value_type, data, self.limit)
}
} }
pub struct DictTrie<V: 'static> { pub struct DictTrie<V: 'static> {
@ -78,7 +91,7 @@ pub enum DictTrieChild<V: 'static> {
mod codegen { mod codegen {
pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W, file: &mut W,
prefix: &str, name: &str,
value_type: &str, value_type: &str,
data: impl Iterator<Item = (&'d str, V)>, data: impl Iterator<Item = (&'d str, V)>,
limit: usize, limit: usize,
@ -86,13 +99,13 @@ mod codegen {
let mut root = DynRoot::new(data); let mut root = DynRoot::new(data);
root.burst(limit); root.burst(limit);
let unicode_table_name = format!("{prefix}_UNICODE_TABLE"); let unicode_table_name = format!("{name}_UNICODE_TABLE");
writeln!( writeln!(
file, file,
"pub static {prefix}_TRIE: dictgen::DictTrie<{value_type}> = dictgen::DictTrie {{" "pub static {name}: dictgen::DictTrie<{value_type}> = dictgen::DictTrie {{"
)?; )?;
writeln!(file, " root: &{},", gen_node_name(prefix, ""))?; writeln!(file, " root: &{},", gen_node_name(name, ""))?;
writeln!(file, " unicode: &{},", &unicode_table_name)?; writeln!(file, " unicode: &{},", &unicode_table_name)?;
writeln!( writeln!(
file, file,
@ -103,18 +116,17 @@ mod codegen {
writeln!(file, "}};")?; writeln!(file, "}};")?;
writeln!(file)?; writeln!(file)?;
crate::generate_table( crate::DictGen::new()
file, .name(&unicode_table_name)
&unicode_table_name, .value_type(value_type)
value_type, .table()
root.unicode.into_iter(), .write(file, root.unicode.into_iter())?;
)?;
writeln!(file)?; writeln!(file)?;
let mut nodes = vec![("".to_owned(), &root.root)]; let mut nodes = vec![("".to_owned(), &root.root)];
while let Some((start, node)) = nodes.pop() { while let Some((start, node)) = nodes.pop() {
let node_name = gen_node_name(prefix, &start); let node_name = gen_node_name(name, &start);
let children_name = gen_children_name(prefix, &start); let children_name = gen_children_name(name, &start);
writeln!( writeln!(
file, file,
"static {node_name}: dictgen::DictTrieNode<{value_type}> = dictgen::DictTrieNode {{" "static {node_name}: dictgen::DictTrieNode<{value_type}> = dictgen::DictTrieNode {{"
@ -143,7 +155,7 @@ mod codegen {
if let Some(child) = n.get(&b) { if let Some(child) = n.get(&b) {
let c = b as char; let c = b as char;
let next_start = format!("{start}{c}"); let next_start = format!("{start}{c}");
writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?; writeln!(file, " Some(&{}),", gen_node_name(name, &next_start))?;
nodes.push((next_start, child)); nodes.push((next_start, child));
} else { } else {
writeln!(file, " None,")?; writeln!(file, " None,")?;
@ -156,7 +168,11 @@ mod codegen {
let k = std::str::from_utf8(k).expect("this was originally a `str`"); let k = std::str::from_utf8(k).expect("this was originally a `str`");
(k, v) (k, v)
}); });
crate::generate_table(file, &children_name, value_type, table_input)?; crate::DictGen::new()
.name(&children_name)
.value_type(value_type)
.table()
.write(file, table_input)?;
} }
} }
writeln!(file)?; writeln!(file)?;

View file

@ -27,31 +27,37 @@ fn generate<W: std::io::Write>(file: &mut W) {
british, british,
} = parse_dict(DICT); } = parse_dict(DICT);
dictgen::generate_table( dictgen::DictGen::new()
file, .name("MAIN_DICTIONARY")
"MAIN_DICTIONARY", .value_type("&[&str]")
"&[&str]", .table()
main.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))), .write(
) file,
.unwrap(); main.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))),
)
.unwrap();
dictgen::generate_table( dictgen::DictGen::new()
file, .name("AMERICAN_DICTIONARY")
"AMERICAN_DICTIONARY", .value_type("&[&str]")
"&[&str]", .table()
american .write(
.into_iter() file,
.map(|kv| (kv.0, format!("&{:?}", kv.1))), american
) .into_iter()
.unwrap(); .map(|kv| (kv.0, format!("&{:?}", kv.1))),
)
.unwrap();
dictgen::generate_table( dictgen::DictGen::new()
file, .name("BRITISH_DICTIONARY")
"BRITISH_DICTIONARY", .value_type("&[&str]")
"&[&str]", .table()
british.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))), .write(
) file,
.unwrap(); british.into_iter().map(|kv| (kv.0, format!("&{:?}", kv.1))),
)
.unwrap();
} }
struct Words<'s> { struct Words<'s> {

View file

@ -2,7 +2,7 @@
#![allow(clippy::unreadable_literal)] #![allow(clippy::unreadable_literal)]
#![allow(unreachable_pub)] #![allow(unreachable_pub)]
pub static WORD: dictgen::DictTable<&'static [&'static str]> = dictgen::DictTable { pub static WORD: dictgen::DictTable<&[&str]> = dictgen::DictTable {
keys: &[ keys: &[
dictgen::InsensitiveStr::Ascii("aaccess"), dictgen::InsensitiveStr::Ascii("aaccess"),
dictgen::InsensitiveStr::Ascii("aaccessibility"), dictgen::InsensitiveStr::Ascii("aaccessibility"),

View file

@ -2,7 +2,7 @@
#![allow(clippy::unreadable_literal)] #![allow(clippy::unreadable_literal)]
#![allow(unreachable_pub)] #![allow(unreachable_pub)]
pub static WORD: dictgen::DictTable<&'static [&'static str]> = dictgen::DictTable { pub static WORD: dictgen::DictTable<&[&str]> = dictgen::DictTable {
keys: &[ keys: &[
dictgen::InsensitiveStr::Ascii("aaccess"), dictgen::InsensitiveStr::Ascii("aaccess"),
dictgen::InsensitiveStr::Ascii("aaccessibility"), dictgen::InsensitiveStr::Ascii("aaccessibility"),

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -3,7 +3,7 @@ fn codegen() {
const DICT: &[u8] = include_bytes!("../assets/words.csv"); const DICT: &[u8] = include_bytes!("../assets/words.csv");
let mut trie_content = vec![]; let mut trie_content = vec![];
generate_trie(&mut trie_content, "WORD", DICT); generate_trie(&mut trie_content, "WORD_TRIE", DICT);
let trie_content = String::from_utf8(trie_content).unwrap(); let trie_content = String::from_utf8(trie_content).unwrap();
let trie_content = codegenrs::rustfmt(&trie_content, None).unwrap(); let trie_content = codegenrs::rustfmt(&trie_content, None).unwrap();
snapbox::assert_data_eq!( snapbox::assert_data_eq!(
@ -35,7 +35,7 @@ fn codegen() {
); );
} }
fn generate_trie<W: std::io::Write>(file: &mut W, prefix: &str, dict: &[u8]) { fn generate_trie<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!( writeln!(
file, file,
"// This file is @generated by {}", "// This file is @generated by {}",
@ -53,25 +53,26 @@ fn generate_trie<W: std::io::Write>(file: &mut W, prefix: &str, dict: &[u8]) {
.records() .records()
.map(|r| r.unwrap()) .map(|r| r.unwrap())
.collect(); .collect();
dictgen::generate_trie( dictgen::DictGen::new()
file, .name(name)
prefix, .value_type("&[&str]")
"&'static [&'static str]", .trie()
records.iter().map(|record| { .write(
let mut record_fields = record.iter(); file,
let key = record_fields.next().unwrap(); records.iter().map(|record| {
let value = format!( let mut record_fields = record.iter();
"&[{}]", let key = record_fields.next().unwrap();
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ") let value = format!(
); "&[{}]",
(key, value) itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
}), );
64, (key, value)
) }),
.unwrap(); )
.unwrap();
} }
fn generate_map<W: std::io::Write>(file: &mut W, prefix: &str, dict: &[u8]) { fn generate_map<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!( writeln!(
file, file,
"// This file is @generated by {}", "// This file is @generated by {}",
@ -89,24 +90,26 @@ fn generate_map<W: std::io::Write>(file: &mut W, prefix: &str, dict: &[u8]) {
.records() .records()
.map(|r| r.unwrap()) .map(|r| r.unwrap())
.collect(); .collect();
dictgen::generate_map( dictgen::DictGen::new()
file, .name(name)
prefix, .value_type("&[&str]")
"&'static [&'static str]", .map()
records.iter().map(|record| { .write(
let mut record_fields = record.iter(); file,
let key = record_fields.next().unwrap(); records.iter().map(|record| {
let value = format!( let mut record_fields = record.iter();
"&[{}]", let key = record_fields.next().unwrap();
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ") let value = format!(
); "&[{}]",
(key, value) itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
}), );
) (key, value)
.unwrap(); }),
)
.unwrap();
} }
fn generate_table<W: std::io::Write>(file: &mut W, prefix: &str, dict: &[u8]) { fn generate_table<W: std::io::Write>(file: &mut W, name: &str, dict: &[u8]) {
writeln!( writeln!(
file, file,
"// This file is @generated by {}", "// This file is @generated by {}",
@ -124,19 +127,21 @@ fn generate_table<W: std::io::Write>(file: &mut W, prefix: &str, dict: &[u8]) {
.records() .records()
.map(|r| r.unwrap()) .map(|r| r.unwrap())
.collect(); .collect();
dictgen::generate_table( dictgen::DictGen::new()
file, .name(name)
prefix, .value_type("&[&str]")
"&'static [&'static str]", .table()
records.iter().map(|record| { .write(
let mut record_fields = record.iter(); file,
let key = record_fields.next().unwrap(); records.iter().map(|record| {
let value = format!( let mut record_fields = record.iter();
"&[{}]", let key = record_fields.next().unwrap();
itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ") let value = format!(
); "&[{}]",
(key, value) itertools::join(record_fields.map(|field| format!(r#""{field}""#)), ", ")
}), );
) (key, value)
.unwrap(); }),
)
.unwrap();
} }

File diff suppressed because it is too large Load diff

View file

@ -84,24 +84,25 @@ fn generate_variations<W: Write>(file: &mut W) {
let entry_sets = entry_sets(entries.iter()); let entry_sets = entry_sets(entries.iter());
let mut referenced_symbols: HashSet<&str> = HashSet::new(); let mut referenced_symbols: HashSet<&str> = HashSet::new();
dictgen::generate_trie( dictgen::DictGen::new()
file, .name("VARS_TRIE")
"VARS", .value_type("&[(u8, &VariantsMap)]")
"&[(u8, &VariantsMap)]", .trie()
entry_sets.iter().filter_map(|kv| { .write(
let (word, data) = kv; file,
if is_always_valid(data) { entry_sets.iter().filter_map(|kv| {
// No need to convert from current form to target form let (word, data) = kv;
None if is_always_valid(data) {
} else { // No need to convert from current form to target form
referenced_symbols.extend(data.iter().map(|(s, _)| s)); None
let value = generate_link(data); } else {
Some((*word, value)) referenced_symbols.extend(data.iter().map(|(s, _)| s));
} let value = generate_link(data);
}), Some((*word, value))
64, }
) }),
.unwrap(); )
.unwrap();
let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data)); let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data));
writeln!(file).unwrap(); writeln!(file).unwrap();

View file

@ -21,13 +21,12 @@ fn generate<W: std::io::Write>(file: &mut W) {
let dict = parse_dict(DICT); let dict = parse_dict(DICT);
dictgen::generate_table( dictgen::DictGen::new()
file, .name("WORD_DICTIONARY")
"WORD_DICTIONARY", .value_type("&[&str]")
"&[&str]", .table()
dict.map(|kv| (kv.0, format!("&{:?}", kv.1))), .write(file, dict.map(|kv| (kv.0, format!("&{:?}", kv.1))))
) .unwrap();
.unwrap();
} }
fn parse_dict(raw: &str) -> impl Iterator<Item = (&str, Vec<&str>)> { fn parse_dict(raw: &str) -> impl Iterator<Item = (&str, Vec<&str>)> {