Merge pull request #66 from epage/digits

perf: Use standard identifier rules to avoid doing number checks
This commit is contained in:
Ed Page 2019-11-02 19:55:34 -06:00 committed by GitHub
commit 15210c928c
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 27 deletions

View file

@ -52,32 +52,32 @@ fn bench_split_lines(data: &str, b: &mut test::Bencher) {
}
#[bench]
fn parse_words_lines_empty(b: &mut test::Bencher) {
fn parse_lines_empty(b: &mut test::Bencher) {
bench_split_lines(data::EMPTY, b);
}
#[bench]
fn parse_words_lines_no_tokens(b: &mut test::Bencher) {
fn parse_lines_no_tokens(b: &mut test::Bencher) {
bench_split_lines(data::NO_TOKENS, b);
}
#[bench]
fn parse_words_lines_single_token(b: &mut test::Bencher) {
fn parse_lines_single_token(b: &mut test::Bencher) {
bench_split_lines(data::SINGLE_TOKEN, b);
}
#[bench]
fn parse_words_lines_sherlock(b: &mut test::Bencher) {
fn parse_lines_sherlock(b: &mut test::Bencher) {
bench_split_lines(data::SHERLOCK, b);
}
#[bench]
fn parse_words_lines_code(b: &mut test::Bencher) {
fn parse_lines_code(b: &mut test::Bencher) {
bench_split_lines(data::CODE, b);
}
#[bench]
fn parse_words_lines_corpus(b: &mut test::Bencher) {
fn parse_lines_corpus(b: &mut test::Bencher) {
bench_split_lines(data::CORPUS, b);
}

View file

@ -53,22 +53,32 @@ pub trait FileSource {
None
}
/// Verifying spelling in filess.
/// Verifying spelling in files.
fn check_file(&self) -> Option<bool> {
None
}
/// Do not check identifiers that appear to be hexadecimal values
/// Do not check identifiers that appear to be hexadecimal values.
fn ignore_hex(&self) -> Option<bool> {
None
}
/// Allow identifiers to include digits, in addition to letters
/// Allow identifiers to start with digits, in addition to letters.
fn identifier_leading_digits(&self) -> Option<bool> {
None
}
/// Allow identifiers to start with one of these characters.
///
/// The default implementation returns `None`, i.e. this source does not set the option.
fn identifier_leading_chars(&self) -> Option<&str> {
None
}
/// Allow identifiers to include digits, in addition to letters.
///
/// The default implementation returns `None`, i.e. this source does not set the option.
fn identifier_include_digits(&self) -> Option<bool> {
None
}
/// Specify additional characters to be included in identifiers
/// Allow identifiers to include these characters.
fn identifier_include_chars(&self) -> Option<&str> {
None
}
@ -233,6 +243,8 @@ pub struct FileConfig {
pub check_filename: Option<bool>,
pub check_file: Option<bool>,
pub ignore_hex: Option<bool>,
pub identifier_leading_digits: Option<bool>,
pub identifier_leading_chars: Option<String>,
pub identifier_include_digits: Option<bool>,
pub identifier_include_chars: Option<String>,
}
@ -248,6 +260,12 @@ impl FileConfig {
if let Some(source) = source.ignore_hex() {
self.ignore_hex = Some(source);
}
if let Some(source) = source.identifier_leading_digits() {
self.identifier_leading_digits = Some(source);
}
if let Some(source) = source.identifier_leading_chars() {
self.identifier_leading_chars = Some(source.to_owned());
}
if let Some(source) = source.identifier_include_digits() {
self.identifier_include_digits = Some(source);
}
@ -268,6 +286,17 @@ impl FileConfig {
self.ignore_hex.unwrap_or(true)
}
/// Whether identifiers may begin with a digit; an unset option counts as `false`.
pub fn identifier_leading_digits(&self) -> bool {
    // `bool::default()` is `false`, which is the fallback for an unset option.
    self.identifier_leading_digits.unwrap_or_default()
}
/// Characters, besides letters, that may begin an identifier; falls back to `"_"` when unset.
pub fn identifier_leading_chars(&self) -> &str {
    match &self.identifier_leading_chars {
        Some(chars) => chars.as_str(),
        None => "_",
    }
}
/// Whether identifiers may contain digits; an unset option counts as `true`.
pub fn identifier_include_digits(&self) -> bool {
    // Only an explicit `Some(false)` turns this off.
    self.identifier_include_digits != Some(false)
}
@ -293,6 +322,14 @@ impl FileSource for FileConfig {
self.ignore_hex
}
// Expose the stored option unchanged; `None` is passed through as-is.
fn identifier_leading_digits(&self) -> Option<bool> {
self.identifier_leading_digits
}
// Borrow the stored leading characters, if any, as a string slice.
fn identifier_leading_chars(&self) -> Option<&str> {
    let configured = self.identifier_leading_chars.as_ref();
    configured.map(String::as_str)
}
// Expose the stored option unchanged; `None` is passed through as-is.
fn identifier_include_digits(&self) -> Option<bool> {
self.identifier_include_digits
}

View file

@ -412,6 +412,8 @@ fn run() -> Result<i32, anyhow::Error> {
let parser = typos::tokens::ParserBuilder::new()
.ignore_hex(config.default.ignore_hex())
.leading_digits(config.default.identifier_leading_digits())
.leading_chars(config.default.identifier_leading_chars().to_owned())
.include_digits(config.default.identifier_include_digits())
.include_chars(config.default.identifier_include_chars().to_owned())
.build();

View file

@ -9,6 +9,8 @@ pub enum Case {
/// Settings from which `build` assembles the identifier regex and a `Parser`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ParserBuilder {
/// Reject tokens recognized by `is_hex`; `build` only honors this when `leading_digits` is set.
ignore_hex: bool,
/// Permit `\d` as the first character of an identifier.
leading_digits: bool,
/// Extra graphemes (besides alphabetic characters) permitted as the first character.
leading_chars: String,
/// Permit `\d` in the characters after the first one.
include_digits: bool,
/// Extra graphemes (besides alphabetic characters) permitted after the first character.
include_chars: String,
}
@ -23,6 +25,16 @@ impl ParserBuilder {
self
}
/// Set whether an identifier may start with a digit.
///
/// `build` also uses this flag to decide whether number and hex filtering is needed at all.
/// Returns `&mut Self` so calls can be chained.
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
self.leading_digits = yes;
self
}
/// Set the extra characters an identifier may start with, replacing any previous value.
///
/// Returns `&mut Self` so calls can be chained.
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes;
self
@ -34,31 +46,44 @@ impl ParserBuilder {
}
pub fn build(&self) -> Parser {
let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
if self.include_digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in
unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
{
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push_str(r#")+\b"#);
let mut pattern = r#"\b("#.to_owned();
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
pattern.push_str(r#"*)\b"#);
let pattern = dbg!(pattern);
let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
Parser {
words_str,
words_bytes,
ignore_hex: self.ignore_hex && self.include_digits,
// When `leading_digits` is off we can bypass the regexes, since you can't have a decimal or
// hexadecimal number without a leading digit.
ignore_numbers: self.leading_digits,
ignore_hex: self.ignore_hex && self.leading_digits,
}
}
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
pattern.push_str(r#"(\p{Alphabetic}"#);
if digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push_str(r#")"#);
}
}
impl Default for ParserBuilder {
fn default() -> Self {
Self {
ignore_hex: true,
leading_digits: false,
leading_chars: "_".to_owned(),
include_digits: true,
include_chars: "_'".to_owned(),
}
@ -69,6 +94,7 @@ impl Default for ParserBuilder {
/// Identifier matcher produced by `ParserBuilder::build`.
pub struct Parser {
/// Identifier pattern compiled for `str` input.
words_str: regex::Regex,
/// The same pattern compiled for byte-slice input.
words_bytes: regex::bytes::Regex,
/// When set, `accept` rejects tokens that `is_number` recognizes.
ignore_numbers: bool,
/// When set, `accept` rejects tokens that `is_hex` recognizes.
ignore_hex: bool,
}
@ -95,12 +121,12 @@ impl Parser {
}
fn accept(&self, contents: &[u8]) -> bool {
if is_number(contents) {
if self.ignore_numbers && is_number(contents) {
return false;
};
}
if self.ignore_hex {
return !is_hex(contents);
if self.ignore_hex && is_hex(contents) {
return false;
}
true
@ -455,7 +481,10 @@ mod test {
#[test]
fn tokenize_ignore_hex_disabled() {
let parser = ParserBuilder::new().ignore_hex(false).build();
let parser = ParserBuilder::new()
.ignore_hex(false)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![