From 2202b7f661889e91cafea56ddd510f2d60f53823 Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Fri, 30 Jul 2021 11:30:05 -0500
Subject: [PATCH] fix(parser): Handle c-escape/printf

Since our goal is 100% confidence in the results, it's better to not
check words than to correct the wrong words.

With that in mind, we'll ignore words after what might be c-escape
sequences (`\nfoo`) or printf substitutions (`%dfoo`).

Fixes #3
---
 CHANGELOG.md               |  4 ++
 crates/typos/src/tokens.rs | 77 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fb59d9..ccd26b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 <!-- next-header -->
 ## [Unreleased] - ReleaseDate
 
+#### Bug Fixes
+
+- Reduce false-positives by ignoring words following possible c-escape sequences or printf patterns.
+
 ## [1.1.2] - 2021-07-30
 
 #### Bug Fixes
diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 107fe75..46865f4 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -175,6 +175,8 @@ mod parser {
         <T as nom::InputIter>::Item: AsChar + Copy,
     {
         take_many0(alt((
+            // CAUTION: An ignorable literal that doesn't start with an `is_xid_continue` character
+            // must also be excluded from `is_ignore_char`, or `sep1` will eat its leading character.
             terminated(uuid_literal, sep1),
             terminated(hash_literal, sep1),
             terminated(hex_literal, sep1),
@@ -182,6 +184,8 @@ mod parser {
             terminated(base64_literal, sep1),
             terminated(email_literal, sep1),
             terminated(url_literal, sep1),
+            terminated(c_escape, sep1),
+            terminated(printf, sep1),
             sep1,
         )))(input)
     }
@@ -191,7 +195,7 @@ mod parser {
         T: nom::InputTakeAtPosition,
         <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
     {
-        take_till1(is_xid_continue)(input)
+        take_while1(is_ignore_char)(input)
     }
 
     fn dec_literal<T>(input: T) -> IResult<T, T>
@@ -355,6 +359,40 @@ mod parser {
         )))(input)
     }
 
+    fn c_escape<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + std::fmt::Debug
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        preceded(char('\\'), take_while1(is_xid_continue))(input)
+    }
+
+    fn printf<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + std::fmt::Debug
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        preceded(char('%'), take_while1(is_xid_continue))(input)
+    }
+
     fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
     where
         I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
@@ -444,6 +482,13 @@ mod parser {
         ('a'..='z').contains(&c) || ('0'..='9').contains(&c) || "+.-".find(c).is_some()
     }
 
+    #[inline]
+    fn is_ignore_char(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        // See c_escape and printf
+        !unicode_xid::UnicodeXID::is_xid_continue(c) && c != '\\' && c != '%'
+    }
+
     #[inline]
     fn is_xid_continue(i: impl AsChar + Copy) -> bool {
         let c = i.as_char();
@@ -940,6 +985,36 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_c_escape() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = "Hello \\Hello World";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("Hello", Case::None, 0),
+            Identifier::new_unchecked("World", Case::None, 13),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_printf() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = "Hello %Hello World";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("Hello", Case::None, 0),
+            Identifier::new_unchecked("World", Case::None, 13),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn split_ident() {
         let cases = [