mirror of
https://github.com/crate-ci/typos.git
synced 2025-01-26 16:39:07 -05:00
refactor(dict): Separate dictgen concerns
This commit is contained in:
parent
97015b3a95
commit
3b43272724
8 changed files with 233 additions and 208 deletions
|
@ -22,4 +22,4 @@ unicase = "2.5"
|
|||
itertools = "0.10"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -1,7 +1,17 @@
|
|||
[package]
|
||||
name = "dictgen"
|
||||
version = "0.1.0"
|
||||
description = "Compile-time case-insensitive map"
|
||||
repository = "https://github.com/crate-ci/typos"
|
||||
categories = ["development-tools", "text-processing"]
|
||||
keywords = ["development", "spelling", "no_std"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
|
||||
[features]
|
||||
default = ["std", "codegen"]
|
||||
std = []
|
||||
codegen = ["std"]
|
||||
|
||||
[dependencies]
|
||||
unicase = "2.5"
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#[cfg(feature = "codegen")]
|
||||
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||
file: &mut W,
|
||||
name: &str,
|
||||
|
@ -46,7 +47,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
|||
pub struct DictTable<V: 'static> {
|
||||
pub keys: &'static [InsensitiveStr],
|
||||
pub values: &'static [V],
|
||||
pub range: std::ops::RangeInclusive<usize>,
|
||||
pub range: core::ops::RangeInclusive<usize>,
|
||||
}
|
||||
|
||||
impl<V> DictTable<V> {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/// # Panics
|
||||
///
|
||||
/// - On duplicate entry
|
||||
#[cfg(feature = "codegen")]
|
||||
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||
file: &mut W,
|
||||
prefix: &str,
|
||||
|
@ -8,123 +9,13 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
|||
data: impl Iterator<Item = (&'d str, V)>,
|
||||
limit: usize,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut root = DynRoot::new(data);
|
||||
root.burst(limit);
|
||||
|
||||
let unicode_table_name = format!("{}_UNICODE_TABLE", prefix);
|
||||
|
||||
writeln!(
|
||||
file,
|
||||
"pub static {}_TRIE: dictgen::DictTrie<{}> = dictgen::DictTrie {{",
|
||||
prefix, value_type
|
||||
)?;
|
||||
writeln!(file, " root: &{},", gen_node_name(prefix, ""))?;
|
||||
writeln!(file, " unicode: &{},", &unicode_table_name)?;
|
||||
writeln!(
|
||||
file,
|
||||
" range: {}..={},",
|
||||
root.range.start(),
|
||||
root.range.end()
|
||||
)?;
|
||||
writeln!(file, "}};")?;
|
||||
writeln!(file)?;
|
||||
|
||||
crate::generate_table(
|
||||
file,
|
||||
&unicode_table_name,
|
||||
value_type,
|
||||
root.unicode.into_iter(),
|
||||
)?;
|
||||
writeln!(file)?;
|
||||
|
||||
let mut nodes = vec![("".to_owned(), &root.root)];
|
||||
while let Some((start, node)) = nodes.pop() {
|
||||
let node_name = gen_node_name(prefix, &start);
|
||||
let children_name = gen_children_name(prefix, &start);
|
||||
writeln!(
|
||||
file,
|
||||
"static {}: dictgen::DictTrieNode<{}> = dictgen::DictTrieNode {{",
|
||||
node_name, value_type
|
||||
)?;
|
||||
writeln!(
|
||||
file,
|
||||
" children: {}(&{}),",
|
||||
gen_type_name(&node.children),
|
||||
children_name
|
||||
)?;
|
||||
if let Some(value) = node.value.as_ref() {
|
||||
writeln!(file, " value: Some({}),", value)?;
|
||||
} else {
|
||||
writeln!(file, " value: None,")?;
|
||||
}
|
||||
writeln!(file, "}};")?;
|
||||
writeln!(file)?;
|
||||
|
||||
match &node.children {
|
||||
DynChild::Nested(n) => {
|
||||
writeln!(
|
||||
file,
|
||||
"static {}: [Option<&dictgen::DictTrieNode<{}>>; 26] = [",
|
||||
children_name, value_type,
|
||||
)?;
|
||||
for b in b'a'..=b'z' {
|
||||
if let Some(child) = n.get(&b) {
|
||||
let c = b as char;
|
||||
let next_start = format!("{}{}", start, c);
|
||||
writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?;
|
||||
nodes.push((next_start, child));
|
||||
} else {
|
||||
writeln!(file, " None,")?;
|
||||
}
|
||||
}
|
||||
writeln!(file, "];")?;
|
||||
}
|
||||
DynChild::Flat(v) => {
|
||||
let table_input = v.iter().map(|(k, v)| {
|
||||
let k = std::str::from_utf8(k).expect("this was originally a `str`");
|
||||
(k, v)
|
||||
});
|
||||
crate::generate_table(file, &children_name, value_type, table_input)?;
|
||||
}
|
||||
}
|
||||
writeln!(file)?;
|
||||
writeln!(file)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn gen_node_name(prefix: &str, start: &str) -> String {
|
||||
if start.is_empty() {
|
||||
format!("{}_NODE", prefix)
|
||||
} else {
|
||||
let mut start = start.to_owned();
|
||||
start.make_ascii_uppercase();
|
||||
format!("{}_{}_NODE", prefix, start)
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_children_name(prefix: &str, start: &str) -> String {
|
||||
if start.is_empty() {
|
||||
format!("{}_CHILDREN", prefix)
|
||||
} else {
|
||||
let mut start = start.to_owned();
|
||||
start.make_ascii_uppercase();
|
||||
format!("{}_{}_CHILDREN", prefix, start)
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
|
||||
match leaf {
|
||||
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
|
||||
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
|
||||
}
|
||||
codegen::generate_trie(file, prefix, value_type, data, limit)
|
||||
}
|
||||
|
||||
pub struct DictTrie<V: 'static> {
|
||||
pub root: &'static DictTrieNode<V>,
|
||||
pub unicode: &'static crate::DictTable<V>,
|
||||
pub range: std::ops::RangeInclusive<usize>,
|
||||
pub range: core::ops::RangeInclusive<usize>,
|
||||
}
|
||||
|
||||
impl<V> DictTrie<V> {
|
||||
|
@ -155,7 +46,7 @@ impl<V> DictTrie<V> {
|
|||
let remaining = &bytes[i..bytes.len()];
|
||||
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
||||
// safe.
|
||||
let remaining = unsafe { std::str::from_utf8_unchecked(remaining) };
|
||||
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
|
||||
// Reuse the prior ascii check, rather than doing it again
|
||||
let remaining = if word.is_ascii() {
|
||||
unicase::UniCase::ascii(remaining)
|
||||
|
@ -183,107 +74,230 @@ pub enum DictTrieChild<V: 'static> {
|
|||
Flat(&'static crate::DictTable<V>),
|
||||
}
|
||||
|
||||
struct DynRoot<'s, V> {
|
||||
root: DynNode<'s, V>,
|
||||
unicode: Vec<(&'s str, V)>,
|
||||
range: std::ops::RangeInclusive<usize>,
|
||||
}
|
||||
#[cfg(feature = "codegen")]
|
||||
mod codegen {
|
||||
pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||
file: &mut W,
|
||||
prefix: &str,
|
||||
value_type: &str,
|
||||
data: impl Iterator<Item = (&'d str, V)>,
|
||||
limit: usize,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut root = DynRoot::new(data);
|
||||
root.burst(limit);
|
||||
|
||||
impl<'s, V> DynRoot<'s, V> {
|
||||
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
|
||||
let mut overflow = Vec::new();
|
||||
let mut unicode = Vec::default();
|
||||
let mut smallest = usize::MAX;
|
||||
let mut largest = usize::MIN;
|
||||
let mut existing = std::collections::HashSet::new();
|
||||
let mut empty = None;
|
||||
for (key, value) in data {
|
||||
if existing.contains(key) {
|
||||
panic!("Duplicate present: {}", key);
|
||||
}
|
||||
existing.insert(key);
|
||||
let unicode_table_name = format!("{}_UNICODE_TABLE", prefix);
|
||||
|
||||
if key.is_empty() {
|
||||
empty = Some(value);
|
||||
writeln!(
|
||||
file,
|
||||
"pub static {}_TRIE: dictgen::DictTrie<{}> = dictgen::DictTrie {{",
|
||||
prefix, value_type
|
||||
)?;
|
||||
writeln!(file, " root: &{},", gen_node_name(prefix, ""))?;
|
||||
writeln!(file, " unicode: &{},", &unicode_table_name)?;
|
||||
writeln!(
|
||||
file,
|
||||
" range: {}..={},",
|
||||
root.range.start(),
|
||||
root.range.end()
|
||||
)?;
|
||||
writeln!(file, "}};")?;
|
||||
writeln!(file)?;
|
||||
|
||||
crate::generate_table(
|
||||
file,
|
||||
&unicode_table_name,
|
||||
value_type,
|
||||
root.unicode.into_iter(),
|
||||
)?;
|
||||
writeln!(file)?;
|
||||
|
||||
let mut nodes = vec![("".to_owned(), &root.root)];
|
||||
while let Some((start, node)) = nodes.pop() {
|
||||
let node_name = gen_node_name(prefix, &start);
|
||||
let children_name = gen_children_name(prefix, &start);
|
||||
writeln!(
|
||||
file,
|
||||
"static {}: dictgen::DictTrieNode<{}> = dictgen::DictTrieNode {{",
|
||||
node_name, value_type
|
||||
)?;
|
||||
writeln!(
|
||||
file,
|
||||
" children: {}(&{}),",
|
||||
gen_type_name(&node.children),
|
||||
children_name
|
||||
)?;
|
||||
if let Some(value) = node.value.as_ref() {
|
||||
writeln!(file, " value: Some({}),", value)?;
|
||||
} else {
|
||||
smallest = std::cmp::min(smallest, key.len());
|
||||
largest = std::cmp::max(largest, key.len());
|
||||
if key.bytes().all(|b| b.is_ascii_alphabetic()) {
|
||||
overflow.push((key.as_bytes(), value));
|
||||
} else {
|
||||
unicode.push((key, value));
|
||||
}
|
||||
writeln!(file, " value: None,")?;
|
||||
}
|
||||
}
|
||||
Self {
|
||||
root: DynNode {
|
||||
children: DynChild::Flat(overflow),
|
||||
value: empty,
|
||||
},
|
||||
unicode,
|
||||
range: smallest..=largest,
|
||||
}
|
||||
}
|
||||
writeln!(file, "}};")?;
|
||||
writeln!(file)?;
|
||||
|
||||
fn burst(&mut self, limit: usize) {
|
||||
self.root.burst(limit);
|
||||
}
|
||||
}
|
||||
|
||||
struct DynNode<'s, V> {
|
||||
children: DynChild<'s, V>,
|
||||
value: Option<V>,
|
||||
}
|
||||
|
||||
impl<'s, V> DynNode<'s, V> {
|
||||
fn burst(&mut self, limit: usize) {
|
||||
self.children.burst(limit)
|
||||
}
|
||||
}
|
||||
|
||||
enum DynChild<'s, V> {
|
||||
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
|
||||
Flat(Vec<(&'s [u8], V)>),
|
||||
}
|
||||
|
||||
impl<'s, V> DynChild<'s, V> {
|
||||
fn burst(&mut self, limit: usize) {
|
||||
match self {
|
||||
DynChild::Nested(children) => {
|
||||
for child in children.values_mut() {
|
||||
child.burst(limit);
|
||||
match &node.children {
|
||||
DynChild::Nested(n) => {
|
||||
writeln!(
|
||||
file,
|
||||
"static {}: [Option<&dictgen::DictTrieNode<{}>>; 26] = [",
|
||||
children_name, value_type,
|
||||
)?;
|
||||
for b in b'a'..=b'z' {
|
||||
if let Some(child) = n.get(&b) {
|
||||
let c = b as char;
|
||||
let next_start = format!("{}{}", start, c);
|
||||
writeln!(file, " Some(&{}),", gen_node_name(prefix, &next_start))?;
|
||||
nodes.push((next_start, child));
|
||||
} else {
|
||||
writeln!(file, " None,")?;
|
||||
}
|
||||
}
|
||||
writeln!(file, "];")?;
|
||||
}
|
||||
}
|
||||
DynChild::Flat(v) if v.len() < limit => (),
|
||||
DynChild::Flat(v) => {
|
||||
let mut old_v = Vec::new();
|
||||
std::mem::swap(&mut old_v, v);
|
||||
let mut nodes = std::collections::BTreeMap::new();
|
||||
for (key, value) in old_v {
|
||||
assert!(!key.is_empty());
|
||||
let start = key[0].to_ascii_lowercase();
|
||||
assert!(start.is_ascii_alphabetic());
|
||||
let node = nodes.entry(start).or_insert_with(|| DynNode {
|
||||
children: DynChild::Flat(Vec::new()),
|
||||
value: None,
|
||||
DynChild::Flat(v) => {
|
||||
let table_input = v.iter().map(|(k, v)| {
|
||||
let k = std::str::from_utf8(k).expect("this was originally a `str`");
|
||||
(k, v)
|
||||
});
|
||||
let remaining = &key[1..];
|
||||
if remaining.is_empty() {
|
||||
assert!(node.value.is_none());
|
||||
node.value = Some(value);
|
||||
crate::generate_table(file, &children_name, value_type, table_input)?;
|
||||
}
|
||||
}
|
||||
writeln!(file)?;
|
||||
writeln!(file)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn gen_node_name(prefix: &str, start: &str) -> String {
|
||||
if start.is_empty() {
|
||||
format!("{}_NODE", prefix)
|
||||
} else {
|
||||
let mut start = start.to_owned();
|
||||
start.make_ascii_uppercase();
|
||||
format!("{}_{}_NODE", prefix, start)
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_children_name(prefix: &str, start: &str) -> String {
|
||||
if start.is_empty() {
|
||||
format!("{}_CHILDREN", prefix)
|
||||
} else {
|
||||
let mut start = start.to_owned();
|
||||
start.make_ascii_uppercase();
|
||||
format!("{}_{}_CHILDREN", prefix, start)
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
|
||||
match leaf {
|
||||
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
|
||||
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
|
||||
}
|
||||
}
|
||||
|
||||
struct DynRoot<'s, V> {
|
||||
root: DynNode<'s, V>,
|
||||
unicode: Vec<(&'s str, V)>,
|
||||
range: std::ops::RangeInclusive<usize>,
|
||||
}
|
||||
|
||||
impl<'s, V> DynRoot<'s, V> {
|
||||
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
|
||||
let mut overflow = Vec::new();
|
||||
let mut unicode = Vec::default();
|
||||
let mut smallest = usize::MAX;
|
||||
let mut largest = usize::MIN;
|
||||
let mut existing = std::collections::HashSet::new();
|
||||
let mut empty = None;
|
||||
for (key, value) in data {
|
||||
if existing.contains(key) {
|
||||
panic!("Duplicate present: {}", key);
|
||||
}
|
||||
existing.insert(key);
|
||||
|
||||
if key.is_empty() {
|
||||
empty = Some(value);
|
||||
} else {
|
||||
smallest = std::cmp::min(smallest, key.len());
|
||||
largest = std::cmp::max(largest, key.len());
|
||||
if key.bytes().all(|b| b.is_ascii_alphabetic()) {
|
||||
overflow.push((key.as_bytes(), value));
|
||||
} else {
|
||||
match &mut node.children {
|
||||
DynChild::Nested(_) => {
|
||||
unreachable!("Only overflow at this point")
|
||||
}
|
||||
DynChild::Flat(ref mut v) => {
|
||||
v.push((remaining, value));
|
||||
unicode.push((key, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
Self {
|
||||
root: DynNode {
|
||||
children: DynChild::Flat(overflow),
|
||||
value: empty,
|
||||
},
|
||||
unicode,
|
||||
range: smallest..=largest,
|
||||
}
|
||||
}
|
||||
|
||||
fn burst(&mut self, limit: usize) {
|
||||
self.root.burst(limit);
|
||||
}
|
||||
}
|
||||
|
||||
struct DynNode<'s, V> {
|
||||
children: DynChild<'s, V>,
|
||||
value: Option<V>,
|
||||
}
|
||||
|
||||
impl<'s, V> DynNode<'s, V> {
|
||||
fn burst(&mut self, limit: usize) {
|
||||
self.children.burst(limit)
|
||||
}
|
||||
}
|
||||
|
||||
enum DynChild<'s, V> {
|
||||
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
|
||||
Flat(Vec<(&'s [u8], V)>),
|
||||
}
|
||||
|
||||
impl<'s, V> DynChild<'s, V> {
|
||||
fn burst(&mut self, limit: usize) {
|
||||
match self {
|
||||
DynChild::Nested(children) => {
|
||||
for child in children.values_mut() {
|
||||
child.burst(limit);
|
||||
}
|
||||
}
|
||||
DynChild::Flat(v) if v.len() < limit => (),
|
||||
DynChild::Flat(v) => {
|
||||
let mut old_v = Vec::new();
|
||||
std::mem::swap(&mut old_v, v);
|
||||
let mut nodes = std::collections::BTreeMap::new();
|
||||
for (key, value) in old_v {
|
||||
assert!(!key.is_empty());
|
||||
let start = key[0].to_ascii_lowercase();
|
||||
assert!(start.is_ascii_alphabetic());
|
||||
let node = nodes.entry(start).or_insert_with(|| DynNode {
|
||||
children: DynChild::Flat(Vec::new()),
|
||||
value: None,
|
||||
});
|
||||
let remaining = &key[1..];
|
||||
if remaining.is_empty() {
|
||||
assert!(node.value.is_none());
|
||||
node.value = Some(value);
|
||||
} else {
|
||||
match &mut node.children {
|
||||
DynChild::Nested(_) => {
|
||||
unreachable!("Only overflow at this point")
|
||||
}
|
||||
DynChild::Flat(ref mut v) => {
|
||||
v.push((remaining, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*self = DynChild::Nested(nodes);
|
||||
self.burst(limit);
|
||||
}
|
||||
*self = DynChild::Nested(nodes);
|
||||
self.burst(limit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,4 +23,4 @@ itertools = "0.10"
|
|||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
regex = "1"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -23,4 +23,4 @@ itertools = "0.10"
|
|||
unicase = "2.5"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -29,4 +29,4 @@ log = "0.4"
|
|||
env_logger = "0.7"
|
||||
clap-verbosity-flag = "0.3"
|
||||
itertools = "0.10"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -22,4 +22,4 @@ unicase = "2.5"
|
|||
itertools = "0.10"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
Loading…
Add table
Reference in a new issue