mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-24 18:10:56 -05:00
refactor(dict): Separate dictgen concerns
This commit is contained in:
parent
97015b3a95
commit
3b43272724
8 changed files with 233 additions and 208 deletions
|
@ -22,4 +22,4 @@ unicase = "2.5"
|
|||
itertools = "0.10"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -1,7 +1,17 @@
|
|||
[package]
|
||||
name = "dictgen"
|
||||
version = "0.1.0"
|
||||
description = "Compile-time case-insensitive map"
|
||||
repository = "https://github.com/crate-ci/typos"
|
||||
categories = ["development-tools", "text-processing"]
|
||||
keywords = ["development", "spelling", "no_std"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
|
||||
[features]
|
||||
default = ["std", "codegen"]
|
||||
std = []
|
||||
codegen = ["std"]
|
||||
|
||||
[dependencies]
|
||||
unicase = "2.5"
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#[cfg(feature = "codegen")]
|
||||
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||
file: &mut W,
|
||||
name: &str,
|
||||
|
@ -46,7 +47,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
|||
pub struct DictTable<V: 'static> {
|
||||
pub keys: &'static [InsensitiveStr],
|
||||
pub values: &'static [V],
|
||||
pub range: std::ops::RangeInclusive<usize>,
|
||||
pub range: core::ops::RangeInclusive<usize>,
|
||||
}
|
||||
|
||||
impl<V> DictTable<V> {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/// # Panics
|
||||
///
|
||||
/// - On duplicate entry
|
||||
#[cfg(feature = "codegen")]
|
||||
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||
file: &mut W,
|
||||
prefix: &str,
|
||||
|
@ -8,6 +9,80 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
|||
data: impl Iterator<Item = (&'d str, V)>,
|
||||
limit: usize,
|
||||
) -> Result<(), std::io::Error> {
|
||||
codegen::generate_trie(file, prefix, value_type, data, limit)
|
||||
}
|
||||
|
||||
pub struct DictTrie<V: 'static> {
|
||||
pub root: &'static DictTrieNode<V>,
|
||||
pub unicode: &'static crate::DictTable<V>,
|
||||
pub range: core::ops::RangeInclusive<usize>,
|
||||
}
|
||||
|
||||
impl<V> DictTrie<V> {
|
||||
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
|
||||
if self.range.contains(&word.len()) {
|
||||
let bytes = word.as_bytes();
|
||||
|
||||
let mut child = &self.root;
|
||||
for i in 0..bytes.len() {
|
||||
match child.children {
|
||||
DictTrieChild::Nested(n) => {
|
||||
let byte = bytes[i];
|
||||
let index = if (b'a'..b'z').contains(&byte) {
|
||||
byte - b'a'
|
||||
} else if (b'A'..b'Z').contains(&byte) {
|
||||
byte - b'A'
|
||||
} else {
|
||||
return self.unicode.find(word);
|
||||
};
|
||||
debug_assert!(index < 26);
|
||||
if let Some(next) = n[index as usize].as_ref() {
|
||||
child = next;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
DictTrieChild::Flat(t) => {
|
||||
let remaining = &bytes[i..bytes.len()];
|
||||
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
||||
// safe.
|
||||
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
|
||||
// Reuse the prior ascii check, rather than doing it again
|
||||
let remaining = if word.is_ascii() {
|
||||
unicase::UniCase::ascii(remaining)
|
||||
} else {
|
||||
unicase::UniCase::unicode(remaining)
|
||||
};
|
||||
return t.find(&remaining);
|
||||
}
|
||||
}
|
||||
}
|
||||
child.value.as_ref()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DictTrieNode<V: 'static> {
|
||||
pub children: DictTrieChild<V>,
|
||||
pub value: Option<V>,
|
||||
}
|
||||
|
||||
pub enum DictTrieChild<V: 'static> {
|
||||
Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
|
||||
Flat(&'static crate::DictTable<V>),
|
||||
}
|
||||
|
||||
#[cfg(feature = "codegen")]
|
||||
mod codegen {
|
||||
pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||
file: &mut W,
|
||||
prefix: &str,
|
||||
value_type: &str,
|
||||
data: impl Iterator<Item = (&'d str, V)>,
|
||||
limit: usize,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut root = DynRoot::new(data);
|
||||
root.burst(limit);
|
||||
|
||||
|
@ -92,9 +167,9 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
|||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_node_name(prefix: &str, start: &str) -> String {
|
||||
fn gen_node_name(prefix: &str, start: &str) -> String {
|
||||
if start.is_empty() {
|
||||
format!("{}_NODE", prefix)
|
||||
} else {
|
||||
|
@ -102,9 +177,9 @@ fn gen_node_name(prefix: &str, start: &str) -> String {
|
|||
start.make_ascii_uppercase();
|
||||
format!("{}_{}_NODE", prefix, start)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_children_name(prefix: &str, start: &str) -> String {
|
||||
fn gen_children_name(prefix: &str, start: &str) -> String {
|
||||
if start.is_empty() {
|
||||
format!("{}_CHILDREN", prefix)
|
||||
} else {
|
||||
|
@ -112,84 +187,22 @@ fn gen_children_name(prefix: &str, start: &str) -> String {
|
|||
start.make_ascii_uppercase();
|
||||
format!("{}_{}_CHILDREN", prefix, start)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
|
||||
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
|
||||
match leaf {
|
||||
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
|
||||
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DictTrie<V: 'static> {
|
||||
pub root: &'static DictTrieNode<V>,
|
||||
pub unicode: &'static crate::DictTable<V>,
|
||||
pub range: std::ops::RangeInclusive<usize>,
|
||||
}
|
||||
|
||||
impl<V> DictTrie<V> {
|
||||
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
|
||||
if self.range.contains(&word.len()) {
|
||||
let bytes = word.as_bytes();
|
||||
|
||||
let mut child = &self.root;
|
||||
for i in 0..bytes.len() {
|
||||
match child.children {
|
||||
DictTrieChild::Nested(n) => {
|
||||
let byte = bytes[i];
|
||||
let index = if (b'a'..b'z').contains(&byte) {
|
||||
byte - b'a'
|
||||
} else if (b'A'..b'Z').contains(&byte) {
|
||||
byte - b'A'
|
||||
} else {
|
||||
return self.unicode.find(word);
|
||||
};
|
||||
debug_assert!(index < 26);
|
||||
if let Some(next) = n[index as usize].as_ref() {
|
||||
child = next;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
DictTrieChild::Flat(t) => {
|
||||
let remaining = &bytes[i..bytes.len()];
|
||||
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
||||
// safe.
|
||||
let remaining = unsafe { std::str::from_utf8_unchecked(remaining) };
|
||||
// Reuse the prior ascii check, rather than doing it again
|
||||
let remaining = if word.is_ascii() {
|
||||
unicase::UniCase::ascii(remaining)
|
||||
} else {
|
||||
unicase::UniCase::unicode(remaining)
|
||||
};
|
||||
return t.find(&remaining);
|
||||
}
|
||||
}
|
||||
}
|
||||
child.value.as_ref()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DictTrieNode<V: 'static> {
|
||||
pub children: DictTrieChild<V>,
|
||||
pub value: Option<V>,
|
||||
}
|
||||
|
||||
pub enum DictTrieChild<V: 'static> {
|
||||
Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
|
||||
Flat(&'static crate::DictTable<V>),
|
||||
}
|
||||
|
||||
struct DynRoot<'s, V> {
|
||||
struct DynRoot<'s, V> {
|
||||
root: DynNode<'s, V>,
|
||||
unicode: Vec<(&'s str, V)>,
|
||||
range: std::ops::RangeInclusive<usize>,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'s, V> DynRoot<'s, V> {
|
||||
impl<'s, V> DynRoot<'s, V> {
|
||||
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
|
||||
let mut overflow = Vec::new();
|
||||
let mut unicode = Vec::default();
|
||||
|
@ -228,25 +241,25 @@ impl<'s, V> DynRoot<'s, V> {
|
|||
fn burst(&mut self, limit: usize) {
|
||||
self.root.burst(limit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct DynNode<'s, V> {
|
||||
struct DynNode<'s, V> {
|
||||
children: DynChild<'s, V>,
|
||||
value: Option<V>,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'s, V> DynNode<'s, V> {
|
||||
impl<'s, V> DynNode<'s, V> {
|
||||
fn burst(&mut self, limit: usize) {
|
||||
self.children.burst(limit)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum DynChild<'s, V> {
|
||||
enum DynChild<'s, V> {
|
||||
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
|
||||
Flat(Vec<(&'s [u8], V)>),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'s, V> DynChild<'s, V> {
|
||||
impl<'s, V> DynChild<'s, V> {
|
||||
fn burst(&mut self, limit: usize) {
|
||||
match self {
|
||||
DynChild::Nested(children) => {
|
||||
|
@ -287,4 +300,5 @@ impl<'s, V> DynChild<'s, V> {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,4 +23,4 @@ itertools = "0.10"
|
|||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
regex = "1"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -23,4 +23,4 @@ itertools = "0.10"
|
|||
unicase = "2.5"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -29,4 +29,4 @@ log = "0.4"
|
|||
env_logger = "0.7"
|
||||
clap-verbosity-flag = "0.3"
|
||||
itertools = "0.10"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
|
@ -22,4 +22,4 @@ unicase = "2.5"
|
|||
itertools = "0.10"
|
||||
codegenrs = "1.0"
|
||||
structopt = "0.3"
|
||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
||||
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||
|
|
Loading…
Reference in a new issue