refactor(dict): Separate dictgen concerns

This commit is contained in:
Ed Page 2021-07-01 08:52:36 -05:00
parent 97015b3a95
commit 3b43272724
8 changed files with 233 additions and 208 deletions

View file

@ -22,4 +22,4 @@ unicase = "2.5"
itertools = "0.10" itertools = "0.10"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" } dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -1,7 +1,17 @@
[package] [package]
name = "dictgen" name = "dictgen"
version = "0.1.0" version = "0.1.0"
description = "Compile-time case-insensitive map"
repository = "https://github.com/crate-ci/typos"
categories = ["development-tools", "text-processing"]
keywords = ["development", "spelling", "no_std"]
license = "MIT"
edition = "2018" edition = "2018"
[features]
default = ["std", "codegen"]
std = []
codegen = ["std"]
[dependencies] [dependencies]
unicase = "2.5" unicase = "2.5"

View file

@ -1,3 +1,4 @@
#[cfg(feature = "codegen")]
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>( pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W, file: &mut W,
name: &str, name: &str,
@ -46,7 +47,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
pub struct DictTable<V: 'static> { pub struct DictTable<V: 'static> {
pub keys: &'static [InsensitiveStr], pub keys: &'static [InsensitiveStr],
pub values: &'static [V], pub values: &'static [V],
pub range: std::ops::RangeInclusive<usize>, pub range: core::ops::RangeInclusive<usize>,
} }
impl<V> DictTable<V> { impl<V> DictTable<V> {

View file

@ -1,6 +1,7 @@
/// # Panics /// # Panics
/// ///
/// - On duplicate entry /// - On duplicate entry
#[cfg(feature = "codegen")]
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W, file: &mut W,
prefix: &str, prefix: &str,
@ -8,6 +9,80 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
data: impl Iterator<Item = (&'d str, V)>, data: impl Iterator<Item = (&'d str, V)>,
limit: usize, limit: usize,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
codegen::generate_trie(file, prefix, value_type, data, limit)
}
/// Static trie-backed lookup structure searched by [`DictTrie::find`].
pub struct DictTrie<V: 'static> {
    /// Node at which `find` starts walking.
    pub root: &'static DictTrieNode<V>,
    /// Table `find` falls back to when it meets a byte that is not an ASCII letter while
    /// walking nested nodes.
    pub unicode: &'static crate::DictTable<V>,
    /// Byte lengths accepted by `find`; a word whose length is outside this range yields `None`.
    pub range: core::ops::RangeInclusive<usize>,
}
impl<V> DictTrie<V> {
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
let bytes = word.as_bytes();
let mut child = &self.root;
for i in 0..bytes.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = bytes[i];
let index = if (b'a'..b'z').contains(&byte) {
byte - b'a'
} else if (b'A'..b'Z').contains(&byte) {
byte - b'A'
} else {
return self.unicode.find(word);
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
}
}
}
child.value.as_ref()
} else {
None
}
}
}
/// A single node of a [`DictTrie`].
pub struct DictTrieNode<V: 'static> {
    /// What lies below this node: either further nodes or a flat table.
    pub children: DictTrieChild<V>,
    /// Value returned by `DictTrie::find` when the word's bytes are exhausted at this node.
    pub value: Option<V>,
}
/// The two shapes the subtree below a [`DictTrieNode`] can take.
pub enum DictTrieChild<V: 'static> {
    /// 26 optional child nodes; `DictTrie::find` indexes them by the byte's offset from
    /// `b'a'` (or `b'A'`).
    Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
    /// A table that `DictTrie::find` searches with the not-yet-consumed suffix of the word.
    Flat(&'static crate::DictTable<V>),
}
#[cfg(feature = "codegen")]
mod codegen {
pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
file: &mut W,
prefix: &str,
value_type: &str,
data: impl Iterator<Item = (&'d str, V)>,
limit: usize,
) -> Result<(), std::io::Error> {
let mut root = DynRoot::new(data); let mut root = DynRoot::new(data);
root.burst(limit); root.burst(limit);
@ -92,9 +167,9 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
} }
Ok(()) Ok(())
} }
fn gen_node_name(prefix: &str, start: &str) -> String { fn gen_node_name(prefix: &str, start: &str) -> String {
if start.is_empty() { if start.is_empty() {
format!("{}_NODE", prefix) format!("{}_NODE", prefix)
} else { } else {
@ -102,9 +177,9 @@ fn gen_node_name(prefix: &str, start: &str) -> String {
start.make_ascii_uppercase(); start.make_ascii_uppercase();
format!("{}_{}_NODE", prefix, start) format!("{}_{}_NODE", prefix, start)
} }
} }
fn gen_children_name(prefix: &str, start: &str) -> String { fn gen_children_name(prefix: &str, start: &str) -> String {
if start.is_empty() { if start.is_empty() {
format!("{}_CHILDREN", prefix) format!("{}_CHILDREN", prefix)
} else { } else {
@ -112,84 +187,22 @@ fn gen_children_name(prefix: &str, start: &str) -> String {
start.make_ascii_uppercase(); start.make_ascii_uppercase();
format!("{}_{}_CHILDREN", prefix, start) format!("{}_{}_CHILDREN", prefix, start)
} }
} }
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str { fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
match leaf { match leaf {
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested", DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat", DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
} }
} }
pub struct DictTrie<V: 'static> { struct DynRoot<'s, V> {
pub root: &'static DictTrieNode<V>,
pub unicode: &'static crate::DictTable<V>,
pub range: std::ops::RangeInclusive<usize>,
}
impl<V> DictTrie<V> {
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
if self.range.contains(&word.len()) {
let bytes = word.as_bytes();
let mut child = &self.root;
for i in 0..bytes.len() {
match child.children {
DictTrieChild::Nested(n) => {
let byte = bytes[i];
let index = if (b'a'..b'z').contains(&byte) {
byte - b'a'
} else if (b'A'..b'Z').contains(&byte) {
byte - b'A'
} else {
return self.unicode.find(word);
};
debug_assert!(index < 26);
if let Some(next) = n[index as usize].as_ref() {
child = next;
} else {
return None;
}
}
DictTrieChild::Flat(t) => {
let remaining = &bytes[i..bytes.len()];
// Unsafe: Everything before has been proven to be ASCII, so this should be
// safe.
let remaining = unsafe { std::str::from_utf8_unchecked(remaining) };
// Reuse the prior ascii check, rather than doing it again
let remaining = if word.is_ascii() {
unicase::UniCase::ascii(remaining)
} else {
unicase::UniCase::unicode(remaining)
};
return t.find(&remaining);
}
}
}
child.value.as_ref()
} else {
None
}
}
}
/// A single node of a [`DictTrie`].
pub struct DictTrieNode<V: 'static> {
    /// What lies below this node: either further nodes or a flat table.
    pub children: DictTrieChild<V>,
    /// Value returned by `DictTrie::find` when the word's bytes are exhausted at this node.
    pub value: Option<V>,
}
/// The two shapes the subtree below a [`DictTrieNode`] can take.
pub enum DictTrieChild<V: 'static> {
    /// 26 optional child nodes; `DictTrie::find` indexes them by the byte's offset from
    /// `b'a'` (or `b'A'`).
    Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
    /// A table that `DictTrie::find` searches with the not-yet-consumed suffix of the word.
    Flat(&'static crate::DictTable<V>),
}
struct DynRoot<'s, V> {
root: DynNode<'s, V>, root: DynNode<'s, V>,
unicode: Vec<(&'s str, V)>, unicode: Vec<(&'s str, V)>,
range: std::ops::RangeInclusive<usize>, range: std::ops::RangeInclusive<usize>,
} }
impl<'s, V> DynRoot<'s, V> { impl<'s, V> DynRoot<'s, V> {
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self { fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
let mut overflow = Vec::new(); let mut overflow = Vec::new();
let mut unicode = Vec::default(); let mut unicode = Vec::default();
@ -228,25 +241,25 @@ impl<'s, V> DynRoot<'s, V> {
fn burst(&mut self, limit: usize) { fn burst(&mut self, limit: usize) {
self.root.burst(limit); self.root.burst(limit);
} }
} }
struct DynNode<'s, V> { struct DynNode<'s, V> {
children: DynChild<'s, V>, children: DynChild<'s, V>,
value: Option<V>, value: Option<V>,
} }
impl<'s, V> DynNode<'s, V> { impl<'s, V> DynNode<'s, V> {
fn burst(&mut self, limit: usize) { fn burst(&mut self, limit: usize) {
self.children.burst(limit) self.children.burst(limit)
} }
} }
enum DynChild<'s, V> { enum DynChild<'s, V> {
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>), Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
Flat(Vec<(&'s [u8], V)>), Flat(Vec<(&'s [u8], V)>),
} }
impl<'s, V> DynChild<'s, V> { impl<'s, V> DynChild<'s, V> {
fn burst(&mut self, limit: usize) { fn burst(&mut self, limit: usize) {
match self { match self {
DynChild::Nested(children) => { DynChild::Nested(children) => {
@ -287,4 +300,5 @@ impl<'s, V> DynChild<'s, V> {
} }
} }
} }
}
} }

View file

@ -23,4 +23,4 @@ itertools = "0.10"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
regex = "1" regex = "1"
dictgen = { version = "0.1", path = "../../dictgen" } dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -23,4 +23,4 @@ itertools = "0.10"
unicase = "2.5" unicase = "2.5"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" } dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -29,4 +29,4 @@ log = "0.4"
env_logger = "0.7" env_logger = "0.7"
clap-verbosity-flag = "0.3" clap-verbosity-flag = "0.3"
itertools = "0.10" itertools = "0.10"
dictgen = { version = "0.1", path = "../../dictgen" } dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }

View file

@ -22,4 +22,4 @@ unicase = "2.5"
itertools = "0.10" itertools = "0.10"
codegenrs = "1.0" codegenrs = "1.0"
structopt = "0.3" structopt = "0.3"
dictgen = { version = "0.1", path = "../../dictgen" } dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }