From 3b43272724372fddae88442555596a5785a6275c Mon Sep 17 00:00:00 2001 From: Ed Page Date: Thu, 1 Jul 2021 08:52:36 -0500 Subject: [PATCH] refactor(dict): Separate dictgen concerns --- crates/codespell-dict/codegen/Cargo.toml | 2 +- crates/dictgen/Cargo.toml | 10 + crates/dictgen/src/table.rs | 3 +- crates/dictgen/src/trie.rs | 418 ++++++++++++----------- crates/misspell-dict/codegen/Cargo.toml | 2 +- crates/typos-dict/codegen/Cargo.toml | 2 +- crates/typos-vars/codegen/Cargo.toml | 2 +- crates/wikipedia-dict/codegen/Cargo.toml | 2 +- 8 files changed, 233 insertions(+), 208 deletions(-) diff --git a/crates/codespell-dict/codegen/Cargo.toml b/crates/codespell-dict/codegen/Cargo.toml index a0b3199..d67b7d6 100644 --- a/crates/codespell-dict/codegen/Cargo.toml +++ b/crates/codespell-dict/codegen/Cargo.toml @@ -22,4 +22,4 @@ unicase = "2.5" itertools = "0.10" codegenrs = "1.0" structopt = "0.3" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/dictgen/Cargo.toml b/crates/dictgen/Cargo.toml index e8426e6..8d2e15c 100644 --- a/crates/dictgen/Cargo.toml +++ b/crates/dictgen/Cargo.toml @@ -1,7 +1,17 @@ [package] name = "dictgen" version = "0.1.0" +description = "Compile-time case-insensitive map" +repository = "https://github.com/crate-ci/typos" +categories = ["development-tools", "text-processing"] +keywords = ["development", "spelling", "no_std"] +license = "MIT" edition = "2018" +[features] +default = ["std", "codegen"] +std = [] +codegen = ["std"] + [dependencies] unicase = "2.5" diff --git a/crates/dictgen/src/table.rs b/crates/dictgen/src/table.rs index 30d5923..dda4e93 100644 --- a/crates/dictgen/src/table.rs +++ b/crates/dictgen/src/table.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "codegen")] pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( file: &mut W, name: &str, @@ -46,7 +47,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( pub struct DictTable { pub keys: &'static [InsensitiveStr], pub values: &'static [V], - pub range: std::ops::RangeInclusive, + pub range: core::ops::RangeInclusive, } impl DictTable { diff --git a/crates/dictgen/src/trie.rs b/crates/dictgen/src/trie.rs index c6b849c..9218ea0 100644 --- a/crates/dictgen/src/trie.rs +++ b/crates/dictgen/src/trie.rs @@ -1,6 +1,7 @@ /// # Panics /// /// - On duplicate entry +#[cfg(feature = "codegen")] pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( file: &mut W, prefix: &str, @@ -8,123 +9,13 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( data: impl Iterator, limit: usize, ) -> Result<(), std::io::Error> { - let mut root = DynRoot::new(data); - root.burst(limit); - - let unicode_table_name = format!("{}_UNICODE_TABLE", prefix); - - writeln!( - file, - "pub static {}_TRIE: dictgen::DictTrie<{}> = dictgen::DictTrie {{", - prefix, value_type - )?; - writeln!(file, " root: &{},", gen_node_name(prefix, ""))?; - writeln!(file, " unicode: &{},", &unicode_table_name)?; - writeln!( - file, - " range: {}..={},", - root.range.start(), - root.range.end() - )?; - writeln!(file, "}};")?; - writeln!(file)?; - - crate::generate_table( - file, - &unicode_table_name, - value_type, - root.unicode.into_iter(), - )?; - writeln!(file)?; - - let mut nodes = vec![("".to_owned(), &root.root)]; - while let Some((start, node)) = nodes.pop() { - let node_name = gen_node_name(prefix, &start); - let children_name = gen_children_name(prefix, &start); - writeln!( - file, - "static {}: dictgen::DictTrieNode<{}> = dictgen::DictTrieNode {{", - node_name, value_type - )?; - writeln!( - file, - " children: {}(&{}),", - gen_type_name(&node.children), - children_name - )?; - if let Some(value) = node.value.as_ref() { - writeln!(file, " value: Some({}),", value)?; - } else { - writeln!(file, " value: None,")?; - } - writeln!(file, "}};")?; - writeln!(file)?; - - match &node.children { - DynChild::Nested(n) => { - writeln!( - file, - "static {}: [Option<&dictgen::DictTrieNode<{}>>; 26] = [", - children_name, value_type, - )?; - for b in b'a'..=b'z' { - if let Some(child) = n.get(&b) { - let c = b as char; - let next_start = format!("{}{}", start, c); - writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?; - nodes.push((next_start, child)); - } else { - writeln!(file, " None,")?; - } - } - writeln!(file, "];")?; - } - DynChild::Flat(v) => { - let table_input = v.iter().map(|(k, v)| { - let k = std::str::from_utf8(k).expect("this was originally a `str`"); - (k, v) - }); - crate::generate_table(file, &children_name, value_type, table_input)?; - } - } - writeln!(file)?; - writeln!(file)?; - } - - Ok(()) -} - -fn gen_node_name(prefix: &str, start: &str) -> String { - if start.is_empty() { - format!("{}_NODE", prefix) - } else { - let mut start = start.to_owned(); - start.make_ascii_uppercase(); - format!("{}_{}_NODE", prefix, start) - } -} - -fn gen_children_name(prefix: &str, start: &str) -> String { - if start.is_empty() { - format!("{}_CHILDREN", prefix) - } else { - let mut start = start.to_owned(); - start.make_ascii_uppercase(); - format!("{}_{}_CHILDREN", prefix, start) - } -} - -fn gen_type_name(leaf: &DynChild) -> &'static str { - match leaf { - DynChild::Nested(_) => "dictgen::DictTrieChild::Nested", - DynChild::Flat(_) => "dictgen::DictTrieChild::Flat", - } + codegen::generate_trie(file, prefix, value_type, data, limit) } pub struct DictTrie { pub root: &'static DictTrieNode, pub unicode: &'static crate::DictTable, - pub range: std::ops::RangeInclusive, + pub range: core::ops::RangeInclusive, } impl DictTrie { @@ -155,7 +46,7 @@ impl DictTrie { let remaining = &bytes[i..bytes.len()]; // Unsafe: Everything before has been proven to be ASCII, so this should be // safe. - let remaining = unsafe { std::str::from_utf8_unchecked(remaining) }; + let remaining = unsafe { core::str::from_utf8_unchecked(remaining) }; // Reuse the prior ascii check, rather than doing it again let remaining = if word.is_ascii() { unicase::UniCase::ascii(remaining) @@ -183,107 +74,230 @@ pub enum DictTrieChild { Flat(&'static crate::DictTable), } -struct DynRoot<'s, V> { - root: DynNode<'s, V>, - unicode: Vec<(&'s str, V)>, - range: std::ops::RangeInclusive, -} +#[cfg(feature = "codegen")] +mod codegen { + pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( + file: &mut W, + prefix: &str, + value_type: &str, + data: impl Iterator, + limit: usize, + ) -> Result<(), std::io::Error> { + let mut root = DynRoot::new(data); + root.burst(limit); -impl<'s, V> DynRoot<'s, V> { - fn new(data: impl Iterator) -> Self { - let mut overflow = Vec::new(); - let mut unicode = Vec::default(); - let mut smallest = usize::MAX; - let mut largest = usize::MIN; - let mut existing = std::collections::HashSet::new(); - let mut empty = None; - for (key, value) in data { - if existing.contains(key) { - panic!("Duplicate present: {}", key); - } - existing.insert(key); + let unicode_table_name = format!("{}_UNICODE_TABLE", prefix); - if key.is_empty() { - empty = Some(value); + writeln!( + file, + "pub static {}_TRIE: dictgen::DictTrie<{}> = dictgen::DictTrie {{", + prefix, value_type + )?; + writeln!(file, " root: &{},", gen_node_name(prefix, ""))?; + writeln!(file, " unicode: &{},", &unicode_table_name)?; + writeln!( + file, + " range: {}..={},", + root.range.start(), + root.range.end() + )?; + writeln!(file, "}};")?; + writeln!(file)?; + + crate::generate_table( + file, + &unicode_table_name, + value_type, + root.unicode.into_iter(), + )?; + writeln!(file)?; + + let mut nodes = vec![("".to_owned(), &root.root)]; + while let Some((start, node)) = nodes.pop() { + let node_name = gen_node_name(prefix, &start); + let children_name = gen_children_name(prefix, &start); + writeln!( + file, + "static {}: dictgen::DictTrieNode<{}> = dictgen::DictTrieNode {{", + node_name, value_type + )?; + writeln!( + file, + " children: {}(&{}),", + gen_type_name(&node.children), + children_name + )?; + if let Some(value) = node.value.as_ref() { + writeln!(file, " value: Some({}),", value)?; } else { - smallest = std::cmp::min(smallest, key.len()); - largest = std::cmp::max(largest, key.len()); - if key.bytes().all(|b| b.is_ascii_alphabetic()) { - overflow.push((key.as_bytes(), value)); - } else { - unicode.push((key, value)); - } + writeln!(file, " value: None,")?; } - } - Self { - root: DynNode { - children: DynChild::Flat(overflow), - value: empty, - }, - unicode, - range: smallest..=largest, - } - } + writeln!(file, "}};")?; + writeln!(file)?; - fn burst(&mut self, limit: usize) { - self.root.burst(limit); - } -} - -struct DynNode<'s, V> { - children: DynChild<'s, V>, - value: Option, -} - -impl<'s, V> DynNode<'s, V> { - fn burst(&mut self, limit: usize) { - self.children.burst(limit) - } -} - -enum DynChild<'s, V> { - Nested(std::collections::BTreeMap>), - Flat(Vec<(&'s [u8], V)>), -} - -impl<'s, V> DynChild<'s, V> { - fn burst(&mut self, limit: usize) { - match self { - DynChild::Nested(children) => { - for child in children.values_mut() { - child.burst(limit); + match &node.children { + DynChild::Nested(n) => { + writeln!( + file, + "static {}: [Option<&dictgen::DictTrieNode<{}>>; 26] = [", + children_name, value_type, + )?; + for b in b'a'..=b'z' { + if let Some(child) = n.get(&b) { + let c = b as char; + let next_start = format!("{}{}", start, c); + writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?; + nodes.push((next_start, child)); + } else { + writeln!(file, " None,")?; + } + } + writeln!(file, "];")?; } - } - DynChild::Flat(v) if v.len() < limit => (), - DynChild::Flat(v) => { - let mut old_v = Vec::new(); - std::mem::swap(&mut old_v, v); - let mut nodes = std::collections::BTreeMap::new(); - for (key, value) in old_v { - assert!(!key.is_empty()); - let start = key[0].to_ascii_lowercase(); - assert!(start.is_ascii_alphabetic()); - let node = nodes.entry(start).or_insert_with(|| DynNode { - children: DynChild::Flat(Vec::new()), - value: None, + DynChild::Flat(v) => { + let table_input = v.iter().map(|(k, v)| { + let k = std::str::from_utf8(k).expect("this was originally a `str`"); + (k, v) }); - let remaining = &key[1..]; - if remaining.is_empty() { - assert!(node.value.is_none()); - node.value = Some(value); + crate::generate_table(file, &children_name, value_type, table_input)?; + } + } + writeln!(file)?; + writeln!(file)?; + } + + Ok(()) + } + + fn gen_node_name(prefix: &str, start: &str) -> String { + if start.is_empty() { + format!("{}_NODE", prefix) + } else { + let mut start = start.to_owned(); + start.make_ascii_uppercase(); + format!("{}_{}_NODE", prefix, start) + } + } + + fn gen_children_name(prefix: &str, start: &str) -> String { + if start.is_empty() { + format!("{}_CHILDREN", prefix) + } else { + let mut start = start.to_owned(); + start.make_ascii_uppercase(); + format!("{}_{}_CHILDREN", prefix, start) + } + } + + fn gen_type_name(leaf: &DynChild) -> &'static str { + match leaf { + DynChild::Nested(_) => "dictgen::DictTrieChild::Nested", + DynChild::Flat(_) => "dictgen::DictTrieChild::Flat", + } + } + + struct DynRoot<'s, V> { + root: DynNode<'s, V>, + unicode: Vec<(&'s str, V)>, + range: std::ops::RangeInclusive, + } + + impl<'s, V> DynRoot<'s, V> { + fn new(data: impl Iterator) -> Self { + let mut overflow = Vec::new(); + let mut unicode = Vec::default(); + let mut smallest = usize::MAX; + let mut largest = usize::MIN; + let mut existing = std::collections::HashSet::new(); + let mut empty = None; + for (key, value) in data { + if existing.contains(key) { + panic!("Duplicate present: {}", key); + } + existing.insert(key); + + if key.is_empty() { + empty = Some(value); + } else { + smallest = std::cmp::min(smallest, key.len()); + largest = std::cmp::max(largest, key.len()); + if key.bytes().all(|b| b.is_ascii_alphabetic()) { + overflow.push((key.as_bytes(), value)); } else { - match &mut node.children { - DynChild::Nested(_) => { - unreachable!("Only overflow at this point") - } - DynChild::Flat(ref mut v) => { - v.push((remaining, value)); + unicode.push((key, value)); + } + } + } + Self { + root: DynNode { + children: DynChild::Flat(overflow), + value: empty, + }, + unicode, + range: smallest..=largest, + } + } + + fn burst(&mut self, limit: usize) { + self.root.burst(limit); + } + } + + struct DynNode<'s, V> { + children: DynChild<'s, V>, + value: Option, + } + + impl<'s, V> DynNode<'s, V> { + fn burst(&mut self, limit: usize) { + self.children.burst(limit) + } + } + + enum DynChild<'s, V> { + Nested(std::collections::BTreeMap>), + Flat(Vec<(&'s [u8], V)>), + } + + impl<'s, V> DynChild<'s, V> { + fn burst(&mut self, limit: usize) { + match self { + DynChild::Nested(children) => { + for child in children.values_mut() { + child.burst(limit); + } + } + DynChild::Flat(v) if v.len() < limit => (), + DynChild::Flat(v) => { + let mut old_v = Vec::new(); + std::mem::swap(&mut old_v, v); + let mut nodes = std::collections::BTreeMap::new(); + for (key, value) in old_v { + assert!(!key.is_empty()); + let start = key[0].to_ascii_lowercase(); + assert!(start.is_ascii_alphabetic()); + let node = nodes.entry(start).or_insert_with(|| DynNode { + children: DynChild::Flat(Vec::new()), + value: None, + }); + let remaining = &key[1..]; + if remaining.is_empty() { + assert!(node.value.is_none()); + node.value = Some(value); + } else { + match &mut node.children { + DynChild::Nested(_) => { + unreachable!("Only overflow at this point") + } + DynChild::Flat(ref mut v) => { + v.push((remaining, value)); + } } } } + *self = DynChild::Nested(nodes); + self.burst(limit); } - *self = DynChild::Nested(nodes); - self.burst(limit); } } } diff --git a/crates/misspell-dict/codegen/Cargo.toml b/crates/misspell-dict/codegen/Cargo.toml index 90f0511..a60d29b 100644 --- a/crates/misspell-dict/codegen/Cargo.toml +++ b/crates/misspell-dict/codegen/Cargo.toml @@ -23,4 +23,4 @@ itertools = "0.10" codegenrs = "1.0" structopt = "0.3" regex = "1" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/typos-dict/codegen/Cargo.toml b/crates/typos-dict/codegen/Cargo.toml index 5e1c2bc..ed4a19e 100644 --- a/crates/typos-dict/codegen/Cargo.toml +++ b/crates/typos-dict/codegen/Cargo.toml @@ -23,4 +23,4 @@ itertools = "0.10" unicase = "2.5" codegenrs = "1.0" structopt = "0.3" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/typos-vars/codegen/Cargo.toml b/crates/typos-vars/codegen/Cargo.toml index 5194326..0f06d4a 100644 --- a/crates/typos-vars/codegen/Cargo.toml +++ b/crates/typos-vars/codegen/Cargo.toml @@ -29,4 +29,4 @@ log = "0.4" env_logger = "0.7" clap-verbosity-flag = "0.3" itertools = "0.10" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] } diff --git a/crates/wikipedia-dict/codegen/Cargo.toml b/crates/wikipedia-dict/codegen/Cargo.toml index 93fe957..d7713f0 100644 --- a/crates/wikipedia-dict/codegen/Cargo.toml +++ b/crates/wikipedia-dict/codegen/Cargo.toml @@ -22,4 +22,4 @@ unicase = "2.5" itertools = "0.10" codegenrs = "1.0" structopt = "0.3" -dictgen = { version = "0.1", path = "../../dictgen" } +dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }