From 2202b7f661889e91cafea56ddd510f2d60f53823 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Fri, 30 Jul 2021 11:30:05 -0500 Subject: [PATCH] fix(parser): Handle c-escape/printf Since our goal is 100% confidence in the results, it's better to not check words than to correct the wrong words. With that in mind, we'll ignore words after what might be c-escape sequences (`\nfoo`) or printf substitutions (`%dfoo`). Fixes #3 --- CHANGELOG.md | 4 ++ crates/typos/src/tokens.rs | 77 +++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fb59d9..ccd26b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] - ReleaseDate +#### Bug Fixes + +- Reduce false-positives by ignoring words following possible c-escape sequences or printf patterns. + ## [1.1.2] - 2021-07-30 #### Bug Fixes diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 107fe75..46865f4 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -175,6 +175,8 @@ mod parser { ::Item: AsChar + Copy, { take_many0(alt(( + // CAUTION: If adding an ignorable literal, if it doesn't start with `is_xid_continue`, + // then you need to update `is_ignore_char` to make sure `sep1` doesn't eat it all up. 
terminated(uuid_literal, sep1), terminated(hash_literal, sep1), terminated(hex_literal, sep1), @@ -182,6 +184,8 @@ mod parser { terminated(base64_literal, sep1), terminated(email_literal, sep1), terminated(url_literal, sep1), + terminated(c_escape, sep1), + terminated(printf, sep1), sep1, )))(input) } @@ -191,7 +195,7 @@ mod parser { T: nom::InputTakeAtPosition, ::Item: AsChar + Copy, { - take_till1(is_xid_continue)(input) + take_while1(is_ignore_char)(input) } fn dec_literal(input: T) -> IResult @@ -355,6 +359,40 @@ mod parser { )))(input) } + fn c_escape(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + std::fmt::Debug + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + preceded(char('\\'), take_while1(is_xid_continue))(input) + } + + fn printf(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + std::fmt::Debug + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + preceded(char('%'), take_while1(is_xid_continue))(input) + } + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult where I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, @@ -444,6 +482,13 @@ mod parser { ('a'..='z').contains(&c) || ('0'..='9').contains(&c) || "+.-".find(c).is_some() } + #[inline] + fn is_ignore_char(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + // See c_escape and printf + !unicode_xid::UnicodeXID::is_xid_continue(c) && c != '\\' && c != '%' + } + #[inline] fn is_xid_continue(i: impl AsChar + Copy) -> bool { let c = i.as_char(); @@ -940,6 +985,36 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_c_escape() { + let parser = TokenizerBuilder::new().build(); + + let input = "Hello \\Hello World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), 
+ Identifier::new_unchecked("World", Case::None, 13), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn tokenize_printf() { + let parser = TokenizerBuilder::new().build(); + + let input = "Hello %Hello World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 13), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn split_ident() { let cases = [