diff --git a/benches/tokenize.rs b/benches/tokenize.rs
index 75b1b9c..dea743a 100644
--- a/benches/tokenize.rs
+++ b/benches/tokenize.rs
@@ -6,60 +6,66 @@ mod data;
 
 #[bench]
 fn symbol_parse_empty(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse_bytes(data::EMPTY.as_bytes()).last());
+    let parser = typos::tokens::Parser::new();
+    b.iter(|| parser.parse_bytes(data::EMPTY.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_no_tokens(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse_bytes(data::NO_TOKENS.as_bytes()).last());
+    let parser = typos::tokens::Parser::new();
+    b.iter(|| parser.parse_bytes(data::NO_TOKENS.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_single_token(b: &mut test::Bencher) {
+    let parser = typos::tokens::Parser::new();
     b.iter(|| {
-        typos::tokens::Identifier::parse_bytes(data::SINGLE_TOKEN.as_bytes()).last();
+        parser.parse_bytes(data::SINGLE_TOKEN.as_bytes()).last();
     });
 }
 
 #[bench]
 fn symbol_parse_sherlock(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse_bytes(data::SHERLOCK.as_bytes()).last());
+    let parser = typos::tokens::Parser::new();
+    b.iter(|| parser.parse_bytes(data::SHERLOCK.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_code(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse_bytes(data::CODE.as_bytes()).last());
+    let parser = typos::tokens::Parser::new();
+    b.iter(|| parser.parse_bytes(data::CODE.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_corpus(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse_bytes(data::CORPUS.as_bytes()).last());
+    let parser = typos::tokens::Parser::new();
+    b.iter(|| parser.parse_bytes(data::CORPUS.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_split_lowercase_short(b: &mut test::Bencher) {
     let input = "abcabcabcabc";
-    let symbol = typos::tokens::Identifier::new(input, 0).unwrap();
+    let symbol = typos::tokens::Identifier::new_unchecked(input, 0);
     b.iter(|| symbol.split().last());
 }
 
 #[bench]
 fn symbol_split_lowercase_long(b: &mut test::Bencher) {
     let input = "abcabcabcabc".repeat(90);
-    let symbol = typos::tokens::Identifier::new(&input, 0).unwrap();
+    let symbol = typos::tokens::Identifier::new_unchecked(&input, 0);
     b.iter(|| symbol.split().last());
 }
 
 #[bench]
 fn symbol_split_mixed_short(b: &mut test::Bencher) {
     let input = "abcABCAbc123";
-    let symbol = typos::tokens::Identifier::new(input, 0).unwrap();
+    let symbol = typos::tokens::Identifier::new_unchecked(input, 0);
    b.iter(|| symbol.split().last());
 }
 
 #[bench]
 fn symbol_split_mixed_long(b: &mut test::Bencher) {
     let input = "abcABCAbc123".repeat(90);
-    let symbol = typos::tokens::Identifier::new(&input, 0).unwrap();
+    let symbol = typos::tokens::Identifier::new_unchecked(&input, 0);
     b.iter(|| symbol.split().last());
 }
diff --git a/src/lib.rs b/src/lib.rs
index 7ef5ea5..cc0a65f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -23,11 +23,12 @@ pub fn process_file(
     binary: bool,
     report: report::Report,
 ) -> Result<bool, failure::Error> {
+    let parser = tokens::Parser::new();
     let mut typos_found = false;
 
     if check_filenames {
         for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
-            for ident in tokens::Identifier::parse(part) {
+            for ident in parser.parse(part) {
                 if !ignore_hex && is_hex(ident.token()) {
                     continue;
                 }
@@ -71,7 +72,7 @@ pub fn process_file(
 
     for (line_idx, line) in buffer.lines().enumerate() {
         let line_num = line_idx + 1;
-        for ident in tokens::Identifier::parse_bytes(line) {
+        for ident in parser.parse_bytes(line) {
             if !ignore_hex && is_hex(ident.token()) {
                 continue;
             }
diff --git a/src/tokens.rs b/src/tokens.rs
index 2d8c09a..46ce93e 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -6,6 +6,37 @@ pub enum Case {
     None,
 }
 
+#[derive(Debug, Clone)]
+pub struct Parser {
+    words_str: regex::Regex,
+    words_bytes: regex::bytes::Regex,
+}
+
+impl Parser {
+    pub fn new() -> Self {
+        let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#;
+        let words_str = regex::Regex::new(pattern).unwrap();
+        let words_bytes = regex::bytes::Regex::new(pattern).unwrap();
+        Self {
+            words_str,
+            words_bytes,
+        }
+    }
+
+    pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
+        self.words_str
+            .find_iter(content)
+            .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
+    }
+
+    pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
+        self.words_bytes.find_iter(content).filter_map(|m| {
+            let s = std::str::from_utf8(m.as_bytes()).ok();
+            s.map(|s| Identifier::new_unchecked(s, m.start()))
+        })
+    }
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
     token: &'t str,
@@ -13,54 +44,10 @@ pub struct Identifier<'t> {
 }
 
 impl<'t> Identifier<'t> {
-    pub fn new(token: &'t str, offset: usize) -> Result<Self, failure::Error> {
-        let mut itr = Self::parse_bytes(token.as_bytes());
-        let mut item = itr
-            .next()
-            .ok_or_else(|| failure::format_err!("Invalid ident (none found): {:?}", token))?;
-        if item.offset != 0 {
-            return Err(failure::format_err!(
-                "Invalid ident (padding found): {:?}",
-                token
-            ));
-        }
-        item.offset += offset;
-        if itr.next().is_some() {
-            return Err(failure::format_err!(
-                "Invalid ident (contains more than one): {:?}",
-                token
-            ));
-        }
-        Ok(item)
-    }
-
-    pub(crate) fn new_unchecked(token: &'t str, offset: usize) -> Self {
+    pub fn new_unchecked(token: &'t str, offset: usize) -> Self {
         Self { token, offset }
     }
 
-    pub fn parse(content: &str) -> impl Iterator<Item = Identifier<'_>> {
-        lazy_static::lazy_static! {
-            // Getting false positives for this lint
-            #[allow(clippy::invalid_regex)]
-            static ref SPLIT: regex::Regex = regex::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap();
-        }
-        SPLIT
-            .find_iter(content)
-            .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
-    }
-
-    pub fn parse_bytes(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
-        lazy_static::lazy_static! {
-            // Getting false positives for this lint
-            #[allow(clippy::invalid_regex)]
-            static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap();
-        }
-        SPLIT.find_iter(content).filter_map(|m| {
-            let s = std::str::from_utf8(m.as_bytes()).ok();
-            s.map(|s| Identifier::new_unchecked(s, m.start()))
-        })
-    }
-
     pub fn token(&self) -> &str {
         self.token
     }
@@ -87,7 +74,6 @@ pub struct Word<'t> {
 
 impl<'t> Word<'t> {
     pub fn new(token: &'t str, offset: usize) -> Result<Self, failure::Error> {
-        Identifier::new(token, offset)?;
         let mut itr = split_ident(token, 0);
         let mut item = itr
             .next()
@@ -108,7 +94,7 @@ impl<'t> Word<'t> {
         Ok(item)
     }
 
-    pub(crate) fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self {
+    pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self {
         Self {
             token,
             case,
@@ -251,70 +237,82 @@ mod test {
 
     #[test]
     fn tokenize_empty_is_empty() {
+        let parser = Parser::new();
+
         let input = "";
         let expected: Vec<Identifier> = vec![];
-        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);
-        let actual: Vec<_> = Identifier::parse(input).collect();
+        let actual: Vec<_> = parser.parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_word_is_word() {
+        let parser = Parser::new();
+
         let input = "word";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
-        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);
-        let actual: Vec<_> = Identifier::parse(input).collect();
+        let actual: Vec<_> = parser.parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_space_separated_words() {
+        let parser = Parser::new();
+
         let input = "A B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 2),
         ];
-        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);
-        let actual: Vec<_> = Identifier::parse(input).collect();
+        let actual: Vec<_> = parser.parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_dot_separated_words() {
+        let parser = Parser::new();
+
         let input = "A.B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 2),
         ];
-        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);
-        let actual: Vec<_> = Identifier::parse(input).collect();
+        let actual: Vec<_> = parser.parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_namespace_separated_words() {
+        let parser = Parser::new();
+
         let input = "A::B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 3),
         ];
-        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);
-        let actual: Vec<_> = Identifier::parse(input).collect();
+        let actual: Vec<_> = parser.parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_underscore_doesnt_separate() {
+        let parser = Parser::new();
+
         let input = "A_B";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
-        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);
-        let actual: Vec<_> = Identifier::parse(input).collect();
+        let actual: Vec<_> = parser.parse(input).collect();
         assert_eq!(expected, actual);
     }
 
@@ -375,7 +373,7 @@ mod test {
             ),
         ];
         for (input, expected) in cases.iter() {
-            let ident = Identifier::new(input, 0).unwrap();
+            let ident = Identifier::new_unchecked(input, 0);
             let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
             assert_eq!(&result, expected);
         }
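Usage sketch (not part of the patch): this change replaces the `lazy_static!` global regexes with a `Parser` value that owns its compiled regexes, so callers construct it once and thread it through. A minimal example of the new API, assuming `Identifier` exposes an `offset()` accessor alongside the `token()` shown above:

```rust
fn main() {
    // Build once; `Parser::new()` compiles both the &str and &[u8] regexes,
    // and the same instance is reused for every path component and line.
    let parser = typos::tokens::Parser::new();

    // `parse` walks a &str; `parse_bytes` walks &[u8] and drops non-UTF-8 matches.
    for ident in parser.parse("A::B snake_case") {
        println!("{:?} at byte {}", ident.token(), ident.offset());
    }
}
```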