mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-24 18:10:56 -05:00
refactor(dict): Separate dictgen concerns
This commit is contained in:
parent
97015b3a95
commit
3b43272724
8 changed files with 233 additions and 208 deletions
|
@ -22,4 +22,4 @@ unicase = "2.5"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
codegenrs = "1.0"
|
codegenrs = "1.0"
|
||||||
structopt = "0.3"
|
structopt = "0.3"
|
||||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||||
|
|
|
@ -1,7 +1,17 @@
|
||||||
[package]
|
[package]
|
||||||
name = "dictgen"
|
name = "dictgen"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
description = "Compile-time case-insensitive map"
|
||||||
|
repository = "https://github.com/crate-ci/typos"
|
||||||
|
categories = ["development-tools", "text-processing"]
|
||||||
|
keywords = ["development", "spelling", "no_std"]
|
||||||
|
license = "MIT"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["std", "codegen"]
|
||||||
|
std = []
|
||||||
|
codegen = ["std"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
unicase = "2.5"
|
unicase = "2.5"
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||||
file: &mut W,
|
file: &mut W,
|
||||||
name: &str,
|
name: &str,
|
||||||
|
@ -46,7 +47,7 @@ pub fn generate_table<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||||
pub struct DictTable<V: 'static> {
|
pub struct DictTable<V: 'static> {
|
||||||
pub keys: &'static [InsensitiveStr],
|
pub keys: &'static [InsensitiveStr],
|
||||||
pub values: &'static [V],
|
pub values: &'static [V],
|
||||||
pub range: std::ops::RangeInclusive<usize>,
|
pub range: core::ops::RangeInclusive<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<V> DictTable<V> {
|
impl<V> DictTable<V> {
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
/// # Panics
|
/// # Panics
|
||||||
///
|
///
|
||||||
/// - On duplicate entry
|
/// - On duplicate entry
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||||
file: &mut W,
|
file: &mut W,
|
||||||
prefix: &str,
|
prefix: &str,
|
||||||
|
@ -8,6 +9,80 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||||
data: impl Iterator<Item = (&'d str, V)>,
|
data: impl Iterator<Item = (&'d str, V)>,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
|
codegen::generate_trie(file, prefix, value_type, data, limit)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DictTrie<V: 'static> {
|
||||||
|
pub root: &'static DictTrieNode<V>,
|
||||||
|
pub unicode: &'static crate::DictTable<V>,
|
||||||
|
pub range: core::ops::RangeInclusive<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<V> DictTrie<V> {
|
||||||
|
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
|
||||||
|
if self.range.contains(&word.len()) {
|
||||||
|
let bytes = word.as_bytes();
|
||||||
|
|
||||||
|
let mut child = &self.root;
|
||||||
|
for i in 0..bytes.len() {
|
||||||
|
match child.children {
|
||||||
|
DictTrieChild::Nested(n) => {
|
||||||
|
let byte = bytes[i];
|
||||||
|
let index = if (b'a'..b'z').contains(&byte) {
|
||||||
|
byte - b'a'
|
||||||
|
} else if (b'A'..b'Z').contains(&byte) {
|
||||||
|
byte - b'A'
|
||||||
|
} else {
|
||||||
|
return self.unicode.find(word);
|
||||||
|
};
|
||||||
|
debug_assert!(index < 26);
|
||||||
|
if let Some(next) = n[index as usize].as_ref() {
|
||||||
|
child = next;
|
||||||
|
} else {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DictTrieChild::Flat(t) => {
|
||||||
|
let remaining = &bytes[i..bytes.len()];
|
||||||
|
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
||||||
|
// safe.
|
||||||
|
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) };
|
||||||
|
// Reuse the prior ascii check, rather than doing it again
|
||||||
|
let remaining = if word.is_ascii() {
|
||||||
|
unicase::UniCase::ascii(remaining)
|
||||||
|
} else {
|
||||||
|
unicase::UniCase::unicode(remaining)
|
||||||
|
};
|
||||||
|
return t.find(&remaining);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
child.value.as_ref()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DictTrieNode<V: 'static> {
|
||||||
|
pub children: DictTrieChild<V>,
|
||||||
|
pub value: Option<V>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum DictTrieChild<V: 'static> {
|
||||||
|
Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
|
||||||
|
Flat(&'static crate::DictTable<V>),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "codegen")]
|
||||||
|
mod codegen {
|
||||||
|
pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||||
|
file: &mut W,
|
||||||
|
prefix: &str,
|
||||||
|
value_type: &str,
|
||||||
|
data: impl Iterator<Item = (&'d str, V)>,
|
||||||
|
limit: usize,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
let mut root = DynRoot::new(data);
|
let mut root = DynRoot::new(data);
|
||||||
root.burst(limit);
|
root.burst(limit);
|
||||||
|
|
||||||
|
@ -92,9 +167,9 @@ pub fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>(
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn gen_node_name(prefix: &str, start: &str) -> String {
|
fn gen_node_name(prefix: &str, start: &str) -> String {
|
||||||
if start.is_empty() {
|
if start.is_empty() {
|
||||||
format!("{}_NODE", prefix)
|
format!("{}_NODE", prefix)
|
||||||
} else {
|
} else {
|
||||||
|
@ -102,9 +177,9 @@ fn gen_node_name(prefix: &str, start: &str) -> String {
|
||||||
start.make_ascii_uppercase();
|
start.make_ascii_uppercase();
|
||||||
format!("{}_{}_NODE", prefix, start)
|
format!("{}_{}_NODE", prefix, start)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn gen_children_name(prefix: &str, start: &str) -> String {
|
fn gen_children_name(prefix: &str, start: &str) -> String {
|
||||||
if start.is_empty() {
|
if start.is_empty() {
|
||||||
format!("{}_CHILDREN", prefix)
|
format!("{}_CHILDREN", prefix)
|
||||||
} else {
|
} else {
|
||||||
|
@ -112,84 +187,22 @@ fn gen_children_name(prefix: &str, start: &str) -> String {
|
||||||
start.make_ascii_uppercase();
|
start.make_ascii_uppercase();
|
||||||
format!("{}_{}_CHILDREN", prefix, start)
|
format!("{}_{}_CHILDREN", prefix, start)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
|
fn gen_type_name<V>(leaf: &DynChild<V>) -> &'static str {
|
||||||
match leaf {
|
match leaf {
|
||||||
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
|
DynChild::Nested(_) => "dictgen::DictTrieChild::Nested",
|
||||||
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
|
DynChild::Flat(_) => "dictgen::DictTrieChild::Flat",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct DictTrie<V: 'static> {
|
struct DynRoot<'s, V> {
|
||||||
pub root: &'static DictTrieNode<V>,
|
|
||||||
pub unicode: &'static crate::DictTable<V>,
|
|
||||||
pub range: std::ops::RangeInclusive<usize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<V> DictTrie<V> {
|
|
||||||
pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> {
|
|
||||||
if self.range.contains(&word.len()) {
|
|
||||||
let bytes = word.as_bytes();
|
|
||||||
|
|
||||||
let mut child = &self.root;
|
|
||||||
for i in 0..bytes.len() {
|
|
||||||
match child.children {
|
|
||||||
DictTrieChild::Nested(n) => {
|
|
||||||
let byte = bytes[i];
|
|
||||||
let index = if (b'a'..b'z').contains(&byte) {
|
|
||||||
byte - b'a'
|
|
||||||
} else if (b'A'..b'Z').contains(&byte) {
|
|
||||||
byte - b'A'
|
|
||||||
} else {
|
|
||||||
return self.unicode.find(word);
|
|
||||||
};
|
|
||||||
debug_assert!(index < 26);
|
|
||||||
if let Some(next) = n[index as usize].as_ref() {
|
|
||||||
child = next;
|
|
||||||
} else {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
DictTrieChild::Flat(t) => {
|
|
||||||
let remaining = &bytes[i..bytes.len()];
|
|
||||||
// Unsafe: Everything before has been proven to be ASCII, so this should be
|
|
||||||
// safe.
|
|
||||||
let remaining = unsafe { std::str::from_utf8_unchecked(remaining) };
|
|
||||||
// Reuse the prior ascii check, rather than doing it again
|
|
||||||
let remaining = if word.is_ascii() {
|
|
||||||
unicase::UniCase::ascii(remaining)
|
|
||||||
} else {
|
|
||||||
unicase::UniCase::unicode(remaining)
|
|
||||||
};
|
|
||||||
return t.find(&remaining);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
child.value.as_ref()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct DictTrieNode<V: 'static> {
|
|
||||||
pub children: DictTrieChild<V>,
|
|
||||||
pub value: Option<V>,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum DictTrieChild<V: 'static> {
|
|
||||||
Nested(&'static [Option<&'static DictTrieNode<V>>; 26]),
|
|
||||||
Flat(&'static crate::DictTable<V>),
|
|
||||||
}
|
|
||||||
|
|
||||||
struct DynRoot<'s, V> {
|
|
||||||
root: DynNode<'s, V>,
|
root: DynNode<'s, V>,
|
||||||
unicode: Vec<(&'s str, V)>,
|
unicode: Vec<(&'s str, V)>,
|
||||||
range: std::ops::RangeInclusive<usize>,
|
range: std::ops::RangeInclusive<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'s, V> DynRoot<'s, V> {
|
impl<'s, V> DynRoot<'s, V> {
|
||||||
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
|
fn new(data: impl Iterator<Item = (&'s str, V)>) -> Self {
|
||||||
let mut overflow = Vec::new();
|
let mut overflow = Vec::new();
|
||||||
let mut unicode = Vec::default();
|
let mut unicode = Vec::default();
|
||||||
|
@ -228,25 +241,25 @@ impl<'s, V> DynRoot<'s, V> {
|
||||||
fn burst(&mut self, limit: usize) {
|
fn burst(&mut self, limit: usize) {
|
||||||
self.root.burst(limit);
|
self.root.burst(limit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct DynNode<'s, V> {
|
struct DynNode<'s, V> {
|
||||||
children: DynChild<'s, V>,
|
children: DynChild<'s, V>,
|
||||||
value: Option<V>,
|
value: Option<V>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'s, V> DynNode<'s, V> {
|
impl<'s, V> DynNode<'s, V> {
|
||||||
fn burst(&mut self, limit: usize) {
|
fn burst(&mut self, limit: usize) {
|
||||||
self.children.burst(limit)
|
self.children.burst(limit)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
enum DynChild<'s, V> {
|
enum DynChild<'s, V> {
|
||||||
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
|
Nested(std::collections::BTreeMap<u8, DynNode<'s, V>>),
|
||||||
Flat(Vec<(&'s [u8], V)>),
|
Flat(Vec<(&'s [u8], V)>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'s, V> DynChild<'s, V> {
|
impl<'s, V> DynChild<'s, V> {
|
||||||
fn burst(&mut self, limit: usize) {
|
fn burst(&mut self, limit: usize) {
|
||||||
match self {
|
match self {
|
||||||
DynChild::Nested(children) => {
|
DynChild::Nested(children) => {
|
||||||
|
@ -287,4 +300,5 @@ impl<'s, V> DynChild<'s, V> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,4 +23,4 @@ itertools = "0.10"
|
||||||
codegenrs = "1.0"
|
codegenrs = "1.0"
|
||||||
structopt = "0.3"
|
structopt = "0.3"
|
||||||
regex = "1"
|
regex = "1"
|
||||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||||
|
|
|
@ -23,4 +23,4 @@ itertools = "0.10"
|
||||||
unicase = "2.5"
|
unicase = "2.5"
|
||||||
codegenrs = "1.0"
|
codegenrs = "1.0"
|
||||||
structopt = "0.3"
|
structopt = "0.3"
|
||||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||||
|
|
|
@ -29,4 +29,4 @@ log = "0.4"
|
||||||
env_logger = "0.7"
|
env_logger = "0.7"
|
||||||
clap-verbosity-flag = "0.3"
|
clap-verbosity-flag = "0.3"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||||
|
|
|
@ -22,4 +22,4 @@ unicase = "2.5"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
codegenrs = "1.0"
|
codegenrs = "1.0"
|
||||||
structopt = "0.3"
|
structopt = "0.3"
|
||||||
dictgen = { version = "0.1", path = "../../dictgen" }
|
dictgen = { version = "0.1", path = "../../dictgen", features = ["codegen"] }
|
||||||
|
|
Loading…
Reference in a new issue