mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-24 18:10:56 -05:00
fix(parser)!: Defer to Unicode XID for identifiers
This saves us from having to have configuration for every detail. If people need more control, we can offer it later. Fixes #225
This commit is contained in:
parent
f15cc58f71
commit
9cbc7410a4
6 changed files with 57 additions and 98 deletions
11
Cargo.lock
generated
11
Cargo.lock
generated
|
@ -1032,7 +1032,7 @@ version = "1.0.24"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
|
||||
dependencies = [
|
||||
"unicode-xid 0.2.1",
|
||||
"unicode-xid 0.2.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1391,7 +1391,7 @@ checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
|
|||
dependencies = [
|
||||
"proc-macro2 1.0.24",
|
||||
"quote 1.0.9",
|
||||
"unicode-xid 0.2.1",
|
||||
"unicode-xid 0.2.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1499,12 +1499,13 @@ dependencies = [
|
|||
"anyhow",
|
||||
"itertools 0.10.0",
|
||||
"log",
|
||||
"nom",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
"simdutf8",
|
||||
"thiserror",
|
||||
"unicode-segmentation",
|
||||
"unicode-xid 0.2.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1638,9 +1639,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
|||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
||||
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
|
|
|
@ -17,7 +17,8 @@ codecov = { repository = "crate-ci/typos" }
|
|||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
thiserror = "1.0"
|
||||
regex = "1.3"
|
||||
nom = "6.0"
|
||||
unicode-xid = "0.2.2"
|
||||
once_cell = "1.2.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
simdutf8 = "0.1.1"
|
||||
|
|
|
@ -3,9 +3,6 @@
|
|||
pub struct TokenizerBuilder {
|
||||
ignore_hex: bool,
|
||||
leading_digits: bool,
|
||||
leading_chars: String,
|
||||
include_digits: bool,
|
||||
include_chars: String,
|
||||
}
|
||||
|
||||
impl TokenizerBuilder {
|
||||
|
@ -25,60 +22,23 @@ impl TokenizerBuilder {
|
|||
self
|
||||
}
|
||||
|
||||
/// Extend accepted leading characters for Identifiers.
|
||||
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
|
||||
self.leading_chars = chars;
|
||||
self
|
||||
}
|
||||
|
||||
/// Specify that digits can be included in Identifiers.
|
||||
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
|
||||
self.include_digits = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Extend accepted characters for Identifiers.
|
||||
pub fn include_chars(&mut self, chars: String) -> &mut Self {
|
||||
self.include_chars = chars;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(&self) -> Tokenizer {
|
||||
let mut pattern = r#"\b("#.to_owned();
|
||||
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
|
||||
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
|
||||
pattern.push_str(r#"*)\b"#);
|
||||
|
||||
let words_str = regex::Regex::new(&pattern).unwrap();
|
||||
|
||||
let TokenizerBuilder {
|
||||
leading_digits,
|
||||
ignore_hex,
|
||||
} = self.clone();
|
||||
Tokenizer {
|
||||
words_str,
|
||||
leading_digits: self.leading_digits,
|
||||
ignore_hex: self.ignore_hex,
|
||||
leading_digits,
|
||||
ignore_hex,
|
||||
}
|
||||
}
|
||||
|
||||
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
|
||||
pattern.push_str(r#"(\p{Alphabetic}"#);
|
||||
if digits {
|
||||
pattern.push_str(r#"|\d"#);
|
||||
}
|
||||
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
|
||||
let escaped = regex::escape(&grapheme);
|
||||
pattern.push_str(&format!("|{}", escaped));
|
||||
}
|
||||
pattern.push(')');
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TokenizerBuilder {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
ignore_hex: true,
|
||||
leading_digits: false,
|
||||
leading_chars: "_".to_owned(),
|
||||
include_digits: true,
|
||||
include_chars: "_'".to_owned(),
|
||||
ignore_hex: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -86,7 +46,6 @@ impl Default for TokenizerBuilder {
|
|||
/// Extract Identifiers from a buffer.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Tokenizer {
|
||||
words_str: regex::Regex,
|
||||
leading_digits: bool,
|
||||
ignore_hex: bool,
|
||||
}
|
||||
|
@ -97,10 +56,15 @@ impl Tokenizer {
|
|||
}
|
||||
|
||||
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
||||
self.words_str
|
||||
.find_iter(content)
|
||||
.filter(move |m| self.accept(m.as_str()))
|
||||
.map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start()))
|
||||
parser::iter_literals(content).filter_map(move |identifier| {
|
||||
let case = Case::None;
|
||||
let offset = offset(content.as_bytes(), identifier.as_bytes());
|
||||
if self.accept(identifier) {
|
||||
Some(Identifier::new_unchecked(identifier, case, offset))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
||||
|
@ -216,6 +180,39 @@ fn is_hex_digit(chr: u8) -> bool {
|
|||
chr.is_ascii_hexdigit()
|
||||
}
|
||||
|
||||
// nom-based scanner that extracts runs of Unicode XID_Continue characters from a string.
mod parser {
|
||||
use nom::bytes::complete::*;
|
||||
use nom::sequence::*;
|
||||
use nom::IResult;
|
||||
|
||||
/// Iterate over every literal found in `input`, in the order they appear.
///
/// Each step applies `next_literal` to the not-yet-consumed part of `input`; the iterator
/// ends the first time `next_literal` returns anything other than `Ok`.
pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
|
||||
std::iter::from_fn(move || match next_literal(input) {
|
||||
Ok((i, o)) => {
|
||||
// Continue from the remainder on the next call.
input = i;
|
||||
// `identifier` is built on `take_while1`, so a successful match is never empty.
assert_ne!(o, "");
|
||||
Some(o)
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Skip the separator in front of the next literal, then parse that literal.
///
/// On success the result holds the remaining input and the matched literal.
fn next_literal(input: &str) -> IResult<&str, &str> {
|
||||
preceded(literal_sep, identifier)(input)
|
||||
}
|
||||
|
||||
/// Consume the leading run of characters that are not XID_Continue.
fn literal_sep(input: &str) -> IResult<&str, &str> {
|
||||
take_till(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
|
||||
}
|
||||
|
||||
/// Parse one or more consecutive XID_Continue characters.
fn identifier(input: &str) -> IResult<&str, &str> {
|
||||
// Generally a language would use `{XID_Start}{XID_Continue}*`, but we go with only
|
||||
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and we would rather
|
||||
// catch odd or unexpected cases than strip start characters off a word, since we aren't
|
||||
// doing a proper word-boundary parse.
|
||||
take_while1(|c: char| unicode_xid::UnicodeXID::is_xid_continue(c))(input)
|
||||
}
|
||||
}
|
||||
|
||||
/// A term composed of Words.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct Identifier<'t> {
|
||||
|
|
|
@ -25,9 +25,6 @@ Configuration is read from the following (in precedence order)
|
|||
| default.check-file | \- | bool | Verify spelling in files. |
|
||||
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
|
||||
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
|
||||
| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
|
||||
| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
|
||||
| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
|
||||
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
|
||||
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the identifier is never valid. When the correction is the key, the identifier is always valid. |
|
||||
| default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
|
||||
|
|
|
@ -234,12 +234,6 @@ pub struct TokenizerConfig {
|
|||
pub ignore_hex: Option<bool>,
|
||||
/// Allow identifiers to start with digits, in addition to letters.
|
||||
pub identifier_leading_digits: Option<bool>,
|
||||
/// Allow identifiers to start with one of these characters.
|
||||
pub identifier_leading_chars: Option<kstring::KString>,
|
||||
/// Allow identifiers to include digits, in addition to letters.
|
||||
pub identifier_include_digits: Option<bool>,
|
||||
/// Allow identifiers to include these characters.
|
||||
pub identifier_include_chars: Option<kstring::KString>,
|
||||
}
|
||||
|
||||
impl TokenizerConfig {
|
||||
|
@ -248,13 +242,6 @@ impl TokenizerConfig {
|
|||
Self {
|
||||
ignore_hex: Some(empty.ignore_hex()),
|
||||
identifier_leading_digits: Some(empty.identifier_leading_digits()),
|
||||
identifier_leading_chars: Some(kstring::KString::from_ref(
|
||||
empty.identifier_leading_chars(),
|
||||
)),
|
||||
identifier_include_digits: Some(empty.identifier_include_digits()),
|
||||
identifier_include_chars: Some(kstring::KString::from_ref(
|
||||
empty.identifier_include_chars(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -265,15 +252,6 @@ impl TokenizerConfig {
|
|||
if let Some(source) = source.identifier_leading_digits {
|
||||
self.identifier_leading_digits = Some(source);
|
||||
}
|
||||
if let Some(source) = source.identifier_leading_chars.as_ref() {
|
||||
self.identifier_leading_chars = Some(source.clone());
|
||||
}
|
||||
if let Some(source) = source.identifier_include_digits {
|
||||
self.identifier_include_digits = Some(source);
|
||||
}
|
||||
if let Some(source) = source.identifier_include_chars.as_ref() {
|
||||
self.identifier_include_chars = Some(source.clone());
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ignore_hex(&self) -> bool {
|
||||
|
@ -283,18 +261,6 @@ impl TokenizerConfig {
|
|||
pub fn identifier_leading_digits(&self) -> bool {
|
||||
self.identifier_leading_digits.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub fn identifier_leading_chars(&self) -> &str {
|
||||
self.identifier_leading_chars.as_deref().unwrap_or("_")
|
||||
}
|
||||
|
||||
pub fn identifier_include_digits(&self) -> bool {
|
||||
self.identifier_include_digits.unwrap_or(true)
|
||||
}
|
||||
|
||||
pub fn identifier_include_chars(&self) -> &str {
|
||||
self.identifier_include_chars.as_deref().unwrap_or("_'")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
|
|
|
@ -219,9 +219,6 @@ impl<'s> ConfigEngine<'s> {
|
|||
let tokenizer = typos::tokens::TokenizerBuilder::new()
|
||||
.ignore_hex(tokenizer_config.ignore_hex())
|
||||
.leading_digits(tokenizer_config.identifier_leading_digits())
|
||||
.leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
|
||||
.include_digits(tokenizer_config.identifier_include_digits())
|
||||
.include_chars(tokenizer_config.identifier_include_chars().to_owned())
|
||||
.build();
|
||||
|
||||
let dict = crate::dict::BuiltIn::new(dict_config.locale());
|
||||
|
|
Loading…
Reference in a new issue