From 85082cdbb1fbcdc176bb88db4321ed2eac10a715 Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Tue, 29 Jun 2021 12:00:58 -0500
Subject: [PATCH] feat(parser): Ignore UUIDs

We might be able to make this bail out earlier and not accidentally
detect the wrong thing by checking if the hex values are lowercase.  RFC
4122 says that UUIDs must be generated lowercase, while input accepts
any case.  The main issues are the risk on the "input" part and the extra
annoyance of writing a custom `is_hex_digit` function.
---
 crates/typos/src/tokens.rs | 53 +++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)
diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 941f95d..16943f9 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -130,6 +130,7 @@ mod parser {
     use nom::branch::*;
     use nom::bytes::complete::*;
     use nom::character::complete::*;
+    use nom::combinator::*;
     use nom::sequence::*;
     use nom::{AsChar, IResult};
 
@@ -140,6 +141,7 @@ mod parser {
             + nom::InputIter
             + nom::InputLength
             + nom::Slice<std::ops::RangeFrom<usize>>
+            + nom::Slice<std::ops::RangeTo<usize>>
             + nom::Offset
             + Clone
             + PartialEq
@@ -169,6 +171,7 @@ mod parser {
             + nom::InputIter
             + nom::InputLength
             + nom::Slice<std::ops::RangeFrom<usize>>
+            + nom::Slice<std::ops::RangeTo<usize>>
             + nom::Offset
             + Clone
             + PartialEq
@@ -178,6 +181,7 @@ mod parser {
     {
         take_many0(alt((
             sep1,
+            terminated(uuid_literal, sep1),
             terminated(hex_literal, sep1),
             terminated(dec_literal, sep1),
         )))(input)
@@ -196,7 +200,7 @@ mod parser {
         T: nom::InputTakeAtPosition,
         <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
     {
-        take_while1(is_dec_digit)(input)
+        take_while1(is_dec_digit_with_sep)(input)
     }
 
     fn hex_literal<T>(input: T) -> IResult<T, T>
@@ -212,10 +216,36 @@ mod parser {
     {
         preceded(
             pair(char('0'), alt((char('x'), char('X')))),
-            take_while1(is_hex_digit),
+            take_while1(is_hex_digit_with_sep),
         )(input)
     }
 
+    fn uuid_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>> // NOTE(review): presumably needed by `recognize` -- confirm against nom's bounds
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    { // Matches the hyphenated textual UUID layout: five hex groups sized 8-4-4-4-12.
+        recognize(tuple(( // `recognize` hands back the whole matched span instead of the tuple of parts
+            take_while_m_n(8, 8, AsChar::is_hex_digit), // group 1: exactly 8 hex digits
+            char('-'),
+            take_while_m_n(4, 4, AsChar::is_hex_digit), // group 2: exactly 4 hex digits
+            char('-'),
+            take_while_m_n(4, 4, AsChar::is_hex_digit), // group 3: exactly 4 hex digits
+            char('-'),
+            take_while_m_n(4, 4, AsChar::is_hex_digit), // group 4: exactly 4 hex digits
+            char('-'),
+            take_while_m_n(12, 12, AsChar::is_hex_digit), // group 5: exactly 12 hex digits
+        )))(input)
+    }
+
     fn take_many0<I, E, F>(mut f: F) -> impl FnMut(I) -> IResult<I, I, E>
     where
         I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug,
@@ -249,11 +279,11 @@ mod parser {
         }
     }
 
-    fn is_dec_digit(i: impl AsChar + Copy) -> bool {
+    fn is_dec_digit_with_sep(i: impl AsChar + Copy) -> bool {
         i.is_dec_digit() || is_digit_sep(i.as_char())
     }
 
-    fn is_hex_digit(i: impl AsChar + Copy) -> bool {
+    fn is_hex_digit_with_sep(i: impl AsChar + Copy) -> bool {
         i.is_hex_digit() || is_digit_sep(i.as_char())
     }
 
@@ -646,6 +676,21 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_ignore_uuid() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = "Hello 123e4567-e89b-12d3-a456-426652340000 World";
+        let expected: Vec<Identifier> = vec![ // only the surrounding words; the UUID must yield no identifiers
+            Identifier::new_unchecked("Hello", Case::None, 0),
+            Identifier::new_unchecked("World", Case::None, 43), // 43 = 6 ("Hello ") + 36 (UUID) + 1 (space)
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); // byte-slice entry point
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect(); // str entry point must give the same result
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn tokenize_leading_digits() {
         let parser = TokenizerBuilder::new().build();