From 3cf9d8672c8fabe9b93f4453d4dbfa8cece8b1a3 Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Wed, 24 Jul 2019 06:47:50 -0600
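Subject: [PATCH] refactor(parser): Move hex handling to parser

Hex-literal filtering now happens inside `tokens::Parser`, configured via
`ParserBuilder::ignore_hex`, instead of being repeated at each call site in
`process_file`.

Note: the removed call sites skipped hex tokens when `ignore_hex` was
false (`!ignore_hex && is_hex(..)`), whereas the parser drops them when
`ignore_hex` is true. The effect of a given flag value is therefore
inverted relative to the old code.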
---
 src/lib.rs    | 17 +-------------
 src/tokens.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 61 insertions(+), 21 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index cc0a65f..f357182 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -23,15 +23,12 @@ pub fn process_file(
     binary: bool,
     report: report::Report,
 ) -> Result<bool, failure::Error> {
-    let parser = tokens::Parser::new();
+    let parser = tokens::ParserBuilder::new().ignore_hex(ignore_hex).build();
     let mut typos_found = false;
 
     if check_filenames {
         for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
             for ident in parser.parse(part) {
-                if !ignore_hex && is_hex(ident.token()) {
-                    continue;
-                }
                 if let Some(correction) = dictionary.correct_ident(ident) {
                     let msg = report::FilenameCorrection {
                         path,
@@ -73,9 +70,6 @@ pub fn process_file(
         for (line_idx, line) in buffer.lines().enumerate() {
             let line_num = line_idx + 1;
             for ident in parser.parse_bytes(line) {
-                if !ignore_hex && is_hex(ident.token()) {
-                    continue;
-                }
                 if let Some(correction) = dictionary.correct_ident(ident) {
                     let col_num = ident.offset();
                     let msg = report::Correction {
@@ -112,12 +106,3 @@ pub fn process_file(
 
     Ok(typos_found)
 }
-
-fn is_hex(ident: &str) -> bool {
-    lazy_static::lazy_static! {
-        // `_`: number literal separator in Rust and other languages
-        // `'`: number literal separator in C++
-        static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap();
-    }
-    HEX.is_match(ident)
-}
diff --git a/src/tokens.rs b/src/tokens.rs
index 23d1d1d..c0071ac 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -7,13 +7,20 @@ pub enum Case {
 }
 
 #[derive(Debug, Clone, Default)]
-pub struct ParserBuilder {}
+pub struct ParserBuilder {
+    ignore_hex: bool,
+}
 
 impl ParserBuilder {
     pub fn new() -> Self {
         Default::default()
     }
 
+    pub fn ignore_hex(mut self, yes: bool) -> Self {
+        self.ignore_hex = yes;
+        self
+    }
+
     pub fn build(self) -> Parser {
         let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#;
         let words_str = regex::Regex::new(pattern).unwrap();
@@ -21,6 +28,7 @@ impl ParserBuilder {
         Parser {
             words_str,
             words_bytes,
+            ignore_hex: self.ignore_hex,
         }
     }
 }
@@ -29,6 +37,7 @@ impl ParserBuilder {
 pub struct Parser {
     words_str: regex::Regex,
     words_bytes: regex::bytes::Regex,
+    ignore_hex: bool,
 }
 
 impl Parser {
@@ -37,16 +46,22 @@ impl Parser {
     }
 
     pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
+        let ignore_hex = self.ignore_hex;
         self.words_str
             .find_iter(content)
+            .filter(move |m| !ignore_hex || !is_hex(m.as_str().as_bytes()))
             .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
     }
 
     pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
-        self.words_bytes.find_iter(content).filter_map(|m| {
-            let s = std::str::from_utf8(m.as_bytes()).ok();
-            s.map(|s| Identifier::new_unchecked(s, m.start()))
-        })
+        let ignore_hex = self.ignore_hex;
+        self.words_bytes
+            .find_iter(content)
+            .filter(move |m| !ignore_hex || !is_hex(m.as_bytes()))
+            .filter_map(|m| {
+                let s = std::str::from_utf8(m.as_bytes()).ok();
+                s.map(|s| Identifier::new_unchecked(s, m.start()))
+            })
     }
 }
 
@@ -56,6 +71,15 @@ impl Default for Parser {
     }
 }
 
+fn is_hex(ident: &[u8]) -> bool {
+    lazy_static::lazy_static! {
+        // `_`: number literal separator in Rust and other languages
+        // `'`: number literal separator in C++
+        static ref HEX: regex::bytes::Regex = regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap();
+    }
+    HEX.is_match(ident)
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
     token: &'t str,
@@ -335,6 +359,37 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_ignore_hex_enabled() {
+        let parser = ParserBuilder::new().ignore_hex(true).build();
+
+        let input = "Hello 0xDEADBEEF World";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("Hello", 0),
+            Identifier::new_unchecked("World", 17),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_ignore_hex_disabled() {
+        let parser = ParserBuilder::new().ignore_hex(false).build();
+
+        let input = "Hello 0xDEADBEEF World";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("Hello", 0),
+            Identifier::new_unchecked("0xDEADBEEF", 6),
+            Identifier::new_unchecked("World", 17),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn split_ident() {
         let cases = [