From 3cf9d8672c8fabe9b93f4453d4dbfa8cece8b1a3 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Wed, 24 Jul 2019 06:47:50 -0600 Subject: [PATCH] refactor(parser): Move hex handling to parser --- src/lib.rs | 17 +------------- src/tokens.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 21 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index cc0a65f..f357182 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,15 +23,12 @@ pub fn process_file( binary: bool, report: report::Report, ) -> Result { - let parser = tokens::Parser::new(); + let parser = tokens::ParserBuilder::new().ignore_hex(ignore_hex).build(); let mut typos_found = false; if check_filenames { for part in path.components().filter_map(|c| c.as_os_str().to_str()) { for ident in parser.parse(part) { - if !ignore_hex && is_hex(ident.token()) { - continue; - } if let Some(correction) = dictionary.correct_ident(ident) { let msg = report::FilenameCorrection { path, @@ -73,9 +70,6 @@ pub fn process_file( for (line_idx, line) in buffer.lines().enumerate() { let line_num = line_idx + 1; for ident in parser.parse_bytes(line) { - if !ignore_hex && is_hex(ident.token()) { - continue; - } if let Some(correction) = dictionary.correct_ident(ident) { let col_num = ident.offset(); let msg = report::Correction { @@ -112,12 +106,3 @@ pub fn process_file( Ok(typos_found) } - -fn is_hex(ident: &str) -> bool { - lazy_static::lazy_static! 
{ - // `_`: number literal separator in Rust and other languages - // `'`: number literal separator in C++ - static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap(); - } - HEX.is_match(ident) -} diff --git a/src/tokens.rs b/src/tokens.rs index 23d1d1d..c0071ac 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -7,13 +7,20 @@ pub enum Case { } #[derive(Debug, Clone, Default)] -pub struct ParserBuilder {} +pub struct ParserBuilder { + ignore_hex: bool, +} impl ParserBuilder { pub fn new() -> Self { Default::default() } + pub fn ignore_hex(mut self, yes: bool) -> Self { + self.ignore_hex = yes; + self + } + pub fn build(self) -> Parser { let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#; let words_str = regex::Regex::new(pattern).unwrap(); @@ -21,6 +28,7 @@ impl ParserBuilder { Parser { words_str, words_bytes, + ignore_hex: self.ignore_hex, } } } @@ -29,6 +37,7 @@ impl ParserBuilder { pub struct Parser { words_str: regex::Regex, words_bytes: regex::bytes::Regex, + ignore_hex: bool, } impl Parser { @@ -37,16 +46,22 @@ impl Parser { } pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { + let ignore_hex = self.ignore_hex; self.words_str .find_iter(content) + .filter(move |m| !ignore_hex || !is_hex(m.as_str().as_bytes())) .map(|m| Identifier::new_unchecked(m.as_str(), m.start())) } pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { - self.words_bytes.find_iter(content).filter_map(|m| { - let s = std::str::from_utf8(m.as_bytes()).ok(); - s.map(|s| Identifier::new_unchecked(s, m.start())) - }) + let ignore_hex = self.ignore_hex; + self.words_bytes + .find_iter(content) + .filter(move |m| !ignore_hex || !is_hex(m.as_bytes())) + .filter_map(|m| { + let s = std::str::from_utf8(m.as_bytes()).ok(); + s.map(|s| Identifier::new_unchecked(s, m.start())) + }) } } @@ -56,6 +71,15 @@ impl Default for Parser { } } +fn is_hex(ident: &[u8]) -> bool { + lazy_static::lazy_static!
{ + // `_`: number literal separator in Rust and other languages + // `'`: number literal separator in C++ + static ref HEX: regex::bytes::Regex = regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap(); + } + HEX.is_match(ident) +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Identifier<'t> { token: &'t str, @@ -335,6 +359,37 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_ignore_hex_enabled() { + let parser = ParserBuilder::new().ignore_hex(true).build(); + + let input = "Hello 0xDEADBEEF World"; + let expected: Vec<Identifier> = vec![ + Identifier::new_unchecked("Hello", 0), + Identifier::new_unchecked("World", 17), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse(input).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn tokenize_ignore_hex_disabled() { + let parser = ParserBuilder::new().ignore_hex(false).build(); + + let input = "Hello 0xDEADBEEF World"; + let expected: Vec<Identifier> = vec![ + Identifier::new_unchecked("Hello", 0), + Identifier::new_unchecked("0xDEADBEEF", 6), + Identifier::new_unchecked("World", 17), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn split_ident() { let cases = [