fix(parser)!: Defer to Unicode XID for identifiers

This saves us from having to expose configuration for every detail. If
people need more control, we can offer it later.

Fixes #225
This commit is contained in:
Ed Page 2021-04-21 20:30:32 -05:00
parent f15cc58f71
commit 9cbc7410a4
6 changed files with 57 additions and 98 deletions

11
Cargo.lock generated
View file

@ -1032,7 +1032,7 @@ version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
dependencies = [
"unicode-xid 0.2.1",
"unicode-xid 0.2.2",
]
[[package]]
@ -1391,7 +1391,7 @@ checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.9",
"unicode-xid 0.2.1",
"unicode-xid 0.2.2",
]
[[package]]
@ -1499,12 +1499,13 @@ dependencies = [
"anyhow",
"itertools 0.10.0",
"log",
"nom",
"once_cell",
"regex",
"serde",
"simdutf8",
"thiserror",
"unicode-segmentation",
"unicode-xid 0.2.2",
]
[[package]]
@ -1638,9 +1639,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
[[package]]
name = "unicode-xid"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "uuid"

View file

@ -17,7 +17,8 @@ codecov = { repository = "crate-ci/typos" }
[dependencies]
anyhow = "1.0"
thiserror = "1.0"
regex = "1.3"
nom = "6.0"
unicode-xid = "0.2.2"
once_cell = "1.2.0"
serde = { version = "1.0", features = ["derive"] }
simdutf8 = "0.1.1"

View file

@ -3,9 +3,6 @@
pub struct TokenizerBuilder {
ignore_hex: bool,
leading_digits: bool,
leading_chars: String,
include_digits: bool,
include_chars: String,
}
impl TokenizerBuilder {
@ -25,60 +22,23 @@ impl TokenizerBuilder {
self
}
/// Extend accepted leading characters for Identifiers.
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}
/// Specify that digits can be included in Identifiers.
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes;
self
}
/// Extend accepted characters for Identifiers.
pub fn include_chars(&mut self, chars: String) -> &mut Self {
self.include_chars = chars;
self
}
pub fn build(&self) -> Tokenizer {
let mut pattern = r#"\b("#.to_owned();
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
pattern.push_str(r#"*)\b"#);
let words_str = regex::Regex::new(&pattern).unwrap();
let TokenizerBuilder {
leading_digits,
ignore_hex,
} = self.clone();
Tokenizer {
words_str,
leading_digits: self.leading_digits,
ignore_hex: self.ignore_hex,
leading_digits,
ignore_hex,
}
}
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
pattern.push_str(r#"(\p{Alphabetic}"#);
if digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push(')');
}
}
impl Default for TokenizerBuilder {
fn default() -> Self {
Self {
ignore_hex: true,
leading_digits: false,
leading_chars: "_".to_owned(),
include_digits: true,
include_chars: "_'".to_owned(),
ignore_hex: true,
}
}
}
@ -86,7 +46,6 @@ impl Default for TokenizerBuilder {
/// Extract Identifiers from a buffer.
#[derive(Debug, Clone)]
pub struct Tokenizer {
words_str: regex::Regex,
leading_digits: bool,
ignore_hex: bool,
}
@ -97,10 +56,15 @@ impl Tokenizer {
}
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
self.words_str
.find_iter(content)
.filter(move |m| self.accept(m.as_str()))
.map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start()))
parser::iter_literals(content).filter_map(move |identifier| {
let case = Case::None;
let offset = offset(content.as_bytes(), identifier.as_bytes());
if self.accept(identifier) {
Some(Identifier::new_unchecked(identifier, case, offset))
} else {
None
}
})
}
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
@ -216,6 +180,39 @@ fn is_hex_digit(chr: u8) -> bool {
chr.is_ascii_hexdigit()
}
mod parser {
use nom::bytes::complete::*;
use nom::sequence::*;
use nom::IResult;
pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) {
Ok((i, o)) => {
input = i;
assert_ne!(o, "");
Some(o)
}
_ => None,
})
}
fn next_literal(input: &str) -> IResult<&str, &str> {
preceded(literal_sep, identifier)(input)
}
fn literal_sep(input: &str) -> IResult<&str, &str> {
take_till(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
}
fn identifier(input: &str) -> IResult<&str, &str> {
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse
take_while1(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
}
}
/// A term composed of Words.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Identifier<'t> {

View file

@ -25,9 +25,6 @@ Configuration is read from the following (in precedence order)
| default.check-file | \- | bool | Verifying spelling in files. |
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the identifier is never valid. When the correction is the key, the identifier is always valid. |
| default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |

View file

@ -234,12 +234,6 @@ pub struct TokenizerConfig {
pub ignore_hex: Option<bool>,
/// Allow identifiers to start with digits, in addition to letters.
pub identifier_leading_digits: Option<bool>,
/// Allow identifiers to start with one of these characters.
pub identifier_leading_chars: Option<kstring::KString>,
/// Allow identifiers to include digits, in addition to letters.
pub identifier_include_digits: Option<bool>,
/// Allow identifiers to include these characters.
pub identifier_include_chars: Option<kstring::KString>,
}
impl TokenizerConfig {
@ -248,13 +242,6 @@ impl TokenizerConfig {
Self {
ignore_hex: Some(empty.ignore_hex()),
identifier_leading_digits: Some(empty.identifier_leading_digits()),
identifier_leading_chars: Some(kstring::KString::from_ref(
empty.identifier_leading_chars(),
)),
identifier_include_digits: Some(empty.identifier_include_digits()),
identifier_include_chars: Some(kstring::KString::from_ref(
empty.identifier_include_chars(),
)),
}
}
@ -265,15 +252,6 @@ impl TokenizerConfig {
if let Some(source) = source.identifier_leading_digits {
self.identifier_leading_digits = Some(source);
}
if let Some(source) = source.identifier_leading_chars.as_ref() {
self.identifier_leading_chars = Some(source.clone());
}
if let Some(source) = source.identifier_include_digits {
self.identifier_include_digits = Some(source);
}
if let Some(source) = source.identifier_include_chars.as_ref() {
self.identifier_include_chars = Some(source.clone());
}
}
pub fn ignore_hex(&self) -> bool {
@ -283,18 +261,6 @@ impl TokenizerConfig {
pub fn identifier_leading_digits(&self) -> bool {
self.identifier_leading_digits.unwrap_or(false)
}
pub fn identifier_leading_chars(&self) -> &str {
self.identifier_leading_chars.as_deref().unwrap_or("_")
}
pub fn identifier_include_digits(&self) -> bool {
self.identifier_include_digits.unwrap_or(true)
}
pub fn identifier_include_chars(&self) -> &str {
self.identifier_include_chars.as_deref().unwrap_or("_'")
}
}
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]

View file

@ -219,9 +219,6 @@ impl<'s> ConfigEngine<'s> {
let tokenizer = typos::tokens::TokenizerBuilder::new()
.ignore_hex(tokenizer_config.ignore_hex())
.leading_digits(tokenizer_config.identifier_leading_digits())
.leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
.include_digits(tokenizer_config.identifier_include_digits())
.include_chars(tokenizer_config.identifier_include_chars().to_owned())
.build();
let dict = crate::dict::BuiltIn::new(dict_config.locale());