mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-21 16:41:01 -05:00
Merge pull request #66 from epage/digits
perf: Use standard identifier rules to avoid doing number checks
This commit is contained in:
commit
15210c928c
4 changed files with 95 additions and 27 deletions
|
@ -52,32 +52,32 @@ fn bench_split_lines(data: &str, b: &mut test::Bencher) {
|
|||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_words_lines_empty(b: &mut test::Bencher) {
|
||||
fn parse_lines_empty(b: &mut test::Bencher) {
|
||||
bench_split_lines(data::EMPTY, b);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_words_lines_no_tokens(b: &mut test::Bencher) {
|
||||
fn parse_lines_no_tokens(b: &mut test::Bencher) {
|
||||
bench_split_lines(data::NO_TOKENS, b);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_words_lines_single_token(b: &mut test::Bencher) {
|
||||
fn parse_lines_single_token(b: &mut test::Bencher) {
|
||||
bench_split_lines(data::SINGLE_TOKEN, b);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_words_lines_sherlock(b: &mut test::Bencher) {
|
||||
fn parse_lines_sherlock(b: &mut test::Bencher) {
|
||||
bench_split_lines(data::SHERLOCK, b);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_words_lines_code(b: &mut test::Bencher) {
|
||||
fn parse_lines_code(b: &mut test::Bencher) {
|
||||
bench_split_lines(data::CODE, b);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_words_lines_corpus(b: &mut test::Bencher) {
|
||||
fn parse_lines_corpus(b: &mut test::Bencher) {
|
||||
bench_split_lines(data::CORPUS, b);
|
||||
}
|
||||
|
||||
|
|
|
@ -53,22 +53,32 @@ pub trait FileSource {
|
|||
None
|
||||
}
|
||||
|
||||
/// Verifying spelling in filess.
|
||||
/// Verifying spelling in files.
|
||||
fn check_file(&self) -> Option<bool> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Do not check identifiers that appear to be hexadecimal values
|
||||
/// Do not check identifiers that appear to be hexadecimal values.
|
||||
fn ignore_hex(&self) -> Option<bool> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Allow identifiers to include digits, in addition to letters
|
||||
/// Allow identifiers to start with digits, in addition to letters.
|
||||
fn identifier_leading_digits(&self) -> Option<bool> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Allow identifiers to start with one of these characters.
|
||||
fn identifier_leading_chars(&self) -> Option<&str> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Allow identifiers to include digits, in addition to letters.
|
||||
fn identifier_include_digits(&self) -> Option<bool> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Specify additional characters to be included in identifiers
|
||||
/// Allow identifiers to include these characters.
|
||||
fn identifier_include_chars(&self) -> Option<&str> {
|
||||
None
|
||||
}
|
||||
|
@ -233,6 +243,8 @@ pub struct FileConfig {
|
|||
pub check_filename: Option<bool>,
|
||||
pub check_file: Option<bool>,
|
||||
pub ignore_hex: Option<bool>,
|
||||
pub identifier_leading_digits: Option<bool>,
|
||||
pub identifier_leading_chars: Option<String>,
|
||||
pub identifier_include_digits: Option<bool>,
|
||||
pub identifier_include_chars: Option<String>,
|
||||
}
|
||||
|
@ -248,6 +260,12 @@ impl FileConfig {
|
|||
if let Some(source) = source.ignore_hex() {
|
||||
self.ignore_hex = Some(source);
|
||||
}
|
||||
if let Some(source) = source.identifier_leading_digits() {
|
||||
self.identifier_leading_digits = Some(source);
|
||||
}
|
||||
if let Some(source) = source.identifier_leading_chars() {
|
||||
self.identifier_leading_chars = Some(source.to_owned());
|
||||
}
|
||||
if let Some(source) = source.identifier_include_digits() {
|
||||
self.identifier_include_digits = Some(source);
|
||||
}
|
||||
|
@ -268,6 +286,17 @@ impl FileConfig {
|
|||
self.ignore_hex.unwrap_or(true)
|
||||
}
|
||||
|
||||
pub fn identifier_leading_digits(&self) -> bool {
|
||||
self.identifier_leading_digits.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub fn identifier_leading_chars(&self) -> &str {
|
||||
self.identifier_leading_chars
|
||||
.as_ref()
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or("_")
|
||||
}
|
||||
|
||||
pub fn identifier_include_digits(&self) -> bool {
|
||||
self.identifier_include_digits.unwrap_or(true)
|
||||
}
|
||||
|
@ -293,6 +322,14 @@ impl FileSource for FileConfig {
|
|||
self.ignore_hex
|
||||
}
|
||||
|
||||
fn identifier_leading_digits(&self) -> Option<bool> {
|
||||
self.identifier_leading_digits
|
||||
}
|
||||
|
||||
fn identifier_leading_chars(&self) -> Option<&str> {
|
||||
self.identifier_leading_chars.as_ref().map(|s| s.as_str())
|
||||
}
|
||||
|
||||
fn identifier_include_digits(&self) -> Option<bool> {
|
||||
self.identifier_include_digits
|
||||
}
|
||||
|
|
|
@ -412,6 +412,8 @@ fn run() -> Result<i32, anyhow::Error> {
|
|||
|
||||
let parser = typos::tokens::ParserBuilder::new()
|
||||
.ignore_hex(config.default.ignore_hex())
|
||||
.leading_digits(config.default.identifier_leading_digits())
|
||||
.leading_chars(config.default.identifier_leading_chars().to_owned())
|
||||
.include_digits(config.default.identifier_include_digits())
|
||||
.include_chars(config.default.identifier_include_chars().to_owned())
|
||||
.build();
|
||||
|
|
|
@ -9,6 +9,8 @@ pub enum Case {
|
|||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct ParserBuilder {
|
||||
ignore_hex: bool,
|
||||
leading_digits: bool,
|
||||
leading_chars: String,
|
||||
include_digits: bool,
|
||||
include_chars: String,
|
||||
}
|
||||
|
@ -23,6 +25,16 @@ impl ParserBuilder {
|
|||
self
|
||||
}
|
||||
|
||||
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
|
||||
self.leading_digits = yes;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
|
||||
self.leading_chars = chars;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
|
||||
self.include_digits = yes;
|
||||
self
|
||||
|
@ -34,31 +46,44 @@ impl ParserBuilder {
|
|||
}
|
||||
|
||||
pub fn build(&self) -> Parser {
|
||||
let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
|
||||
if self.include_digits {
|
||||
pattern.push_str(r#"|\d"#);
|
||||
}
|
||||
for grapheme in
|
||||
unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
|
||||
{
|
||||
let escaped = regex::escape(&grapheme);
|
||||
pattern.push_str(&format!("|{}", escaped));
|
||||
}
|
||||
pattern.push_str(r#")+\b"#);
|
||||
let mut pattern = r#"\b("#.to_owned();
|
||||
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
|
||||
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
|
||||
pattern.push_str(r#"*)\b"#);
|
||||
let pattern = dbg!(pattern);
|
||||
|
||||
let words_str = regex::Regex::new(&pattern).unwrap();
|
||||
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
|
||||
|
||||
Parser {
|
||||
words_str,
|
||||
words_bytes,
|
||||
ignore_hex: self.ignore_hex && self.include_digits,
|
||||
// `leading_digits` lets us bypass the regexes since you can't have a decimal or
|
||||
// hexadecimal number without a leading digit.
|
||||
ignore_numbers: self.leading_digits,
|
||||
ignore_hex: self.ignore_hex && self.leading_digits,
|
||||
}
|
||||
}
|
||||
|
||||
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
|
||||
pattern.push_str(r#"(\p{Alphabetic}"#);
|
||||
if digits {
|
||||
pattern.push_str(r#"|\d"#);
|
||||
}
|
||||
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
|
||||
let escaped = regex::escape(&grapheme);
|
||||
pattern.push_str(&format!("|{}", escaped));
|
||||
}
|
||||
pattern.push_str(r#")"#);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ParserBuilder {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
ignore_hex: true,
|
||||
leading_digits: false,
|
||||
leading_chars: "_".to_owned(),
|
||||
include_digits: true,
|
||||
include_chars: "_'".to_owned(),
|
||||
}
|
||||
|
@ -69,6 +94,7 @@ impl Default for ParserBuilder {
|
|||
pub struct Parser {
|
||||
words_str: regex::Regex,
|
||||
words_bytes: regex::bytes::Regex,
|
||||
ignore_numbers: bool,
|
||||
ignore_hex: bool,
|
||||
}
|
||||
|
||||
|
@ -95,12 +121,12 @@ impl Parser {
|
|||
}
|
||||
|
||||
fn accept(&self, contents: &[u8]) -> bool {
|
||||
if is_number(contents) {
|
||||
if self.ignore_numbers && is_number(contents) {
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
if self.ignore_hex {
|
||||
return !is_hex(contents);
|
||||
if self.ignore_hex && is_hex(contents) {
|
||||
return false;
|
||||
}
|
||||
|
||||
true
|
||||
|
@ -455,7 +481,10 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn tokenize_ignore_hex_disabled() {
|
||||
let parser = ParserBuilder::new().ignore_hex(false).build();
|
||||
let parser = ParserBuilder::new()
|
||||
.ignore_hex(false)
|
||||
.leading_digits(true)
|
||||
.build();
|
||||
|
||||
let input = "Hello 0xDEADBEEF World";
|
||||
let expected: Vec<Identifier> = vec![
|
||||
|
|
Loading…
Reference in a new issue