mirror of https://github.com/crate-ci/typos.git
synced 2024-11-28 12:01:06 -05:00
refactor(parser): Unify str/bytes code paths
The main goal is to support replacing the parser with `nom`, where I need access to `str`-only functionality. With crates like simdutf8, this might also offer performance gains, since such crates see the biggest benefit when validating large blocks at once.
This commit is contained in:
parent fce11d6c35
commit 819702c82f

1 changed file with 60 additions and 17 deletions
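
On the simdutf8 point: that crate's compat::from_utf8 reports failures through the same valid_up_to()/error_len() error API as std's from_utf8, so it could slot into the chunking loop this commit introduces. A minimal sketch of that swap, assuming simdutf8 were added as a dependency (this commit does not add it):

    fn next_valid_chunk(source: &[u8]) -> (&str, &[u8]) {
        // simdutf8's compat API mirrors std's Utf8Error, so the splitting
        // logic is identical to the std-based Utf8Chunks added below.
        match simdutf8::compat::from_utf8(source) {
            // The whole input is valid: one chunk, nothing left to scan.
            Ok(valid) => (valid, b""),
            Err(error) => {
                let (valid, after_valid) = source.split_at(error.valid_up_to());
                let rest = match error.error_len() {
                    // Skip the invalid sequence and keep scanning after it.
                    Some(len) => &after_valid[len..],
                    // Truncated sequence at end of input: nothing left to scan.
                    None => b"",
                };
                // `valid` was validated by the call above, so this is sound.
                (unsafe { std::str::from_utf8_unchecked(valid) }, rest)
            }
        }
    }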
@@ -50,11 +50,9 @@ impl TokenizerBuilder {
         pattern.push_str(r#"*)\b"#);
 
         let words_str = regex::Regex::new(&pattern).unwrap();
-        let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
 
         Tokenizer {
             words_str,
-            words_bytes,
             // `leading_digits` lets us bypass the regexes since you can't have a decimal or
             // hexadecimal number without a leading digit.
             ignore_numbers: self.leading_digits,
@@ -91,7 +89,6 @@ impl Default for TokenizerBuilder {
 #[derive(Debug, Clone)]
 pub struct Tokenizer {
     words_str: regex::Regex,
-    words_bytes: regex::bytes::Regex,
     ignore_numbers: bool,
     ignore_hex: bool,
 }
@@ -104,21 +101,20 @@ impl Tokenizer {
     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
         self.words_str
             .find_iter(content)
-            .filter(move |m| self.accept(m.as_str().as_bytes()))
+            .filter(move |m| self.accept(m.as_str()))
             .map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start()))
     }
 
     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
-        self.words_bytes
-            .find_iter(content)
-            .filter(move |m| self.accept(m.as_bytes()))
-            .filter_map(|m| {
-                let s = std::str::from_utf8(m.as_bytes()).ok();
-                s.map(|s| Identifier::new_unchecked(s, Case::None, m.start()))
+        Utf8Chunks::new(content).flat_map(move |c| {
+            let chunk_offset = offset(content, c.as_bytes());
+            self.parse_str(c).map(move |i| {
+                Identifier::new_unchecked(i.token(), i.case(), i.offset() + chunk_offset)
             })
+        })
     }
 
-    fn accept(&self, contents: &[u8]) -> bool {
+    fn accept(&self, contents: &str) -> bool {
         if self.ignore_numbers && is_number(contents) {
             return false;
         }
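
With this hunk, parse_bytes is just parse_str mapped over each valid UTF-8 chunk, with every identifier's offset shifted by chunk_offset back into the original byte slice. A rough usage sketch; TokenizerBuilder's Default impl is visible in the hunk headers above, but the build() method is assumed from context:

    let tokenizer = TokenizerBuilder::default().build();

    // Offsets stay relative to the original bytes: "no" is at 0 in the
    // first chunk; "typo" is found in the chunk after the invalid 0xFF
    // byte, and chunk_offset shifts its reported offset to 3.
    let bytes = b"no\xFFtypo";
    for id in tokenizer.parse_bytes(bytes) {
        println!("{} @ {}", id.token(), id.offset());
    }
    // Expected output (sketch):
    //   no @ 0
    //   typo @ 3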
@@ -137,21 +133,68 @@ impl Default for Tokenizer {
     }
 }
 
+fn offset(base: &[u8], needle: &[u8]) -> usize {
+    let base = base.as_ptr() as usize;
+    let needle = needle.as_ptr() as usize;
+    debug_assert!(base <= needle);
+    needle - base
+}
+
+struct Utf8Chunks<'s> {
+    source: &'s [u8],
+}
+
+impl<'s> Utf8Chunks<'s> {
+    fn new(source: &'s [u8]) -> Self {
+        Self { source }
+    }
+}
+
+impl<'s> Iterator for Utf8Chunks<'s> {
+    type Item = &'s str;
+
+    fn next(&mut self) -> Option<&'s str> {
+        loop {
+            if self.source.is_empty() {
+                return None;
+            }
+
+            match std::str::from_utf8(self.source) {
+                Ok(valid) => {
+                    self.source = b"";
+                    return Some(valid);
+                }
+                Err(error) => {
+                    let (valid, after_valid) = self.source.split_at(error.valid_up_to());
+
+                    if let Some(invalid_sequence_length) = error.error_len() {
+                        self.source = &after_valid[invalid_sequence_length..];
+                    } else {
+                        self.source = b"";
+                    }
+
+                    let valid = unsafe { std::str::from_utf8_unchecked(valid) };
+                    return Some(valid);
+                }
+            }
+        }
+    }
+}
+
 // `_`: number literal separator in Rust and other languages
 // `'`: number literal separator in C++
-static DIGITS: once_cell::sync::Lazy<regex::bytes::Regex> =
-    once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^[0-9_']+$"#).unwrap());
+static DIGITS: once_cell::sync::Lazy<regex::Regex> =
+    once_cell::sync::Lazy::new(|| regex::Regex::new(r#"^[0-9_']+$"#).unwrap());
 
-fn is_number(ident: &[u8]) -> bool {
+fn is_number(ident: &str) -> bool {
     DIGITS.is_match(ident)
 }
 
 // `_`: number literal separator in Rust and other languages
 // `'`: number literal separator in C++
-static HEX: once_cell::sync::Lazy<regex::bytes::Regex> =
-    once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap());
+static HEX: once_cell::sync::Lazy<regex::Regex> =
+    once_cell::sync::Lazy::new(|| regex::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap());
 
-fn is_hex(ident: &[u8]) -> bool {
+fn is_hex(ident: &str) -> bool {
     HEX.is_match(ident)
 }
 
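
Utf8Chunks is a lossy splitter: it yields each maximal run of valid UTF-8 in order and drops the invalid bytes between runs. Two adjacent invalid sequences produce an empty run, which parse_str then maps to zero identifiers, so the quirk is harmless. A small sketch of that behavior as a module-level test (the iterator is private, so this assumes it runs inside the same module):

    #[test]
    fn utf8_chunks_drops_invalid_bytes() {
        // 0xFF and 0xFE are each a one-byte invalid sequence; the trailing
        // 0xC3 is a truncated two-byte sequence (error_len() == None).
        let input: &[u8] = b"ab\xFF\xFEcd\xC3";
        let chunks: Vec<&str> = Utf8Chunks::new(input).collect();
        // "ab", then the empty run between 0xFF and 0xFE, then "cd" with
        // the truncated tail discarded.
        assert_eq!(chunks, ["ab", "", "cd"]);
    }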