Merge pull request #66 from epage/digits

perf: Use standard identifier rules to avoid doing number checks
This commit is contained in:
Ed Page 2019-11-02 19:55:34 -06:00 committed by GitHub
commit 15210c928c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 27 deletions

View file

@ -52,32 +52,32 @@ fn bench_split_lines(data: &str, b: &mut test::Bencher) {
} }
#[bench] #[bench]
fn parse_words_lines_empty(b: &mut test::Bencher) { fn parse_lines_empty(b: &mut test::Bencher) {
bench_split_lines(data::EMPTY, b); bench_split_lines(data::EMPTY, b);
} }
#[bench] #[bench]
fn parse_words_lines_no_tokens(b: &mut test::Bencher) { fn parse_lines_no_tokens(b: &mut test::Bencher) {
bench_split_lines(data::NO_TOKENS, b); bench_split_lines(data::NO_TOKENS, b);
} }
#[bench] #[bench]
fn parse_words_lines_single_token(b: &mut test::Bencher) { fn parse_lines_single_token(b: &mut test::Bencher) {
bench_split_lines(data::SINGLE_TOKEN, b); bench_split_lines(data::SINGLE_TOKEN, b);
} }
#[bench] #[bench]
fn parse_words_lines_sherlock(b: &mut test::Bencher) { fn parse_lines_sherlock(b: &mut test::Bencher) {
bench_split_lines(data::SHERLOCK, b); bench_split_lines(data::SHERLOCK, b);
} }
#[bench] #[bench]
fn parse_words_lines_code(b: &mut test::Bencher) { fn parse_lines_code(b: &mut test::Bencher) {
bench_split_lines(data::CODE, b); bench_split_lines(data::CODE, b);
} }
#[bench] #[bench]
fn parse_words_lines_corpus(b: &mut test::Bencher) { fn parse_lines_corpus(b: &mut test::Bencher) {
bench_split_lines(data::CORPUS, b); bench_split_lines(data::CORPUS, b);
} }

View file

@ -53,22 +53,32 @@ pub trait FileSource {
None None
} }
/// Verifying spelling in filess. /// Verifying spelling in files.
fn check_file(&self) -> Option<bool> { fn check_file(&self) -> Option<bool> {
None None
} }
/// Do not check identifiers that appear to be hexadecimal values /// Do not check identifiers that appear to be hexadecimal values.
fn ignore_hex(&self) -> Option<bool> { fn ignore_hex(&self) -> Option<bool> {
None None
} }
/// Allow identifiers to include digits, in addition to letters /// Allow identifiers to start with digits, in addition to letters.
fn identifier_leading_digits(&self) -> Option<bool> {
None
}
/// Allow identifiers to start with one of these characters.
fn identifier_leading_chars(&self) -> Option<&str> {
None
}
/// Allow identifiers to include digits, in addition to letters.
fn identifier_include_digits(&self) -> Option<bool> { fn identifier_include_digits(&self) -> Option<bool> {
None None
} }
/// Specify additional characters to be included in identifiers /// Allow identifiers to include these characters.
fn identifier_include_chars(&self) -> Option<&str> { fn identifier_include_chars(&self) -> Option<&str> {
None None
} }
@ -233,6 +243,8 @@ pub struct FileConfig {
pub check_filename: Option<bool>, pub check_filename: Option<bool>,
pub check_file: Option<bool>, pub check_file: Option<bool>,
pub ignore_hex: Option<bool>, pub ignore_hex: Option<bool>,
pub identifier_leading_digits: Option<bool>,
pub identifier_leading_chars: Option<String>,
pub identifier_include_digits: Option<bool>, pub identifier_include_digits: Option<bool>,
pub identifier_include_chars: Option<String>, pub identifier_include_chars: Option<String>,
} }
@ -248,6 +260,12 @@ impl FileConfig {
if let Some(source) = source.ignore_hex() { if let Some(source) = source.ignore_hex() {
self.ignore_hex = Some(source); self.ignore_hex = Some(source);
} }
if let Some(source) = source.identifier_leading_digits() {
self.identifier_leading_digits = Some(source);
}
if let Some(source) = source.identifier_leading_chars() {
self.identifier_leading_chars = Some(source.to_owned());
}
if let Some(source) = source.identifier_include_digits() { if let Some(source) = source.identifier_include_digits() {
self.identifier_include_digits = Some(source); self.identifier_include_digits = Some(source);
} }
@ -268,6 +286,17 @@ impl FileConfig {
self.ignore_hex.unwrap_or(true) self.ignore_hex.unwrap_or(true)
} }
pub fn identifier_leading_digits(&self) -> bool {
self.identifier_leading_digits.unwrap_or(false)
}
pub fn identifier_leading_chars(&self) -> &str {
self.identifier_leading_chars
.as_ref()
.map(|s| s.as_str())
.unwrap_or("_")
}
pub fn identifier_include_digits(&self) -> bool { pub fn identifier_include_digits(&self) -> bool {
self.identifier_include_digits.unwrap_or(true) self.identifier_include_digits.unwrap_or(true)
} }
@ -293,6 +322,14 @@ impl FileSource for FileConfig {
self.ignore_hex self.ignore_hex
} }
fn identifier_leading_digits(&self) -> Option<bool> {
self.identifier_leading_digits
}
fn identifier_leading_chars(&self) -> Option<&str> {
self.identifier_leading_chars.as_ref().map(|s| s.as_str())
}
fn identifier_include_digits(&self) -> Option<bool> { fn identifier_include_digits(&self) -> Option<bool> {
self.identifier_include_digits self.identifier_include_digits
} }

View file

@ -412,6 +412,8 @@ fn run() -> Result<i32, anyhow::Error> {
let parser = typos::tokens::ParserBuilder::new() let parser = typos::tokens::ParserBuilder::new()
.ignore_hex(config.default.ignore_hex()) .ignore_hex(config.default.ignore_hex())
.leading_digits(config.default.identifier_leading_digits())
.leading_chars(config.default.identifier_leading_chars().to_owned())
.include_digits(config.default.identifier_include_digits()) .include_digits(config.default.identifier_include_digits())
.include_chars(config.default.identifier_include_chars().to_owned()) .include_chars(config.default.identifier_include_chars().to_owned())
.build(); .build();

View file

@ -9,6 +9,8 @@ pub enum Case {
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ParserBuilder { pub struct ParserBuilder {
ignore_hex: bool, ignore_hex: bool,
leading_digits: bool,
leading_chars: String,
include_digits: bool, include_digits: bool,
include_chars: String, include_chars: String,
} }
@ -23,6 +25,16 @@ impl ParserBuilder {
self self
} }
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
self.leading_digits = yes;
self
}
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}
pub fn include_digits(&mut self, yes: bool) -> &mut Self { pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes; self.include_digits = yes;
self self
@ -34,31 +46,44 @@ impl ParserBuilder {
} }
pub fn build(&self) -> Parser { pub fn build(&self) -> Parser {
let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned(); let mut pattern = r#"\b("#.to_owned();
if self.include_digits { Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
pattern.push_str(r#"|\d"#); Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
} pattern.push_str(r#"*)\b"#);
for grapheme in let pattern = dbg!(pattern);
unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
{
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push_str(r#")+\b"#);
let words_str = regex::Regex::new(&pattern).unwrap(); let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap(); let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
Parser { Parser {
words_str, words_str,
words_bytes, words_bytes,
ignore_hex: self.ignore_hex && self.include_digits, // `leading_digits` lets us bypass the regexes since you can't have a decimal or
// hexadecimal number without a leading digit.
ignore_numbers: self.leading_digits,
ignore_hex: self.ignore_hex && self.leading_digits,
} }
} }
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
pattern.push_str(r#"(\p{Alphabetic}"#);
if digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push_str(r#")"#);
}
} }
impl Default for ParserBuilder { impl Default for ParserBuilder {
fn default() -> Self { fn default() -> Self {
Self { Self {
ignore_hex: true, ignore_hex: true,
leading_digits: false,
leading_chars: "_".to_owned(),
include_digits: true, include_digits: true,
include_chars: "_'".to_owned(), include_chars: "_'".to_owned(),
} }
@ -69,6 +94,7 @@ impl Default for ParserBuilder {
pub struct Parser { pub struct Parser {
words_str: regex::Regex, words_str: regex::Regex,
words_bytes: regex::bytes::Regex, words_bytes: regex::bytes::Regex,
ignore_numbers: bool,
ignore_hex: bool, ignore_hex: bool,
} }
@ -95,12 +121,12 @@ impl Parser {
} }
fn accept(&self, contents: &[u8]) -> bool { fn accept(&self, contents: &[u8]) -> bool {
if is_number(contents) { if self.ignore_numbers && is_number(contents) {
return false; return false;
}; }
if self.ignore_hex { if self.ignore_hex && is_hex(contents) {
return !is_hex(contents); return false;
} }
true true
@ -455,7 +481,10 @@ mod test {
#[test] #[test]
fn tokenize_ignore_hex_disabled() { fn tokenize_ignore_hex_disabled() {
let parser = ParserBuilder::new().ignore_hex(false).build(); let parser = ParserBuilder::new()
.ignore_hex(false)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World"; let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![