From 09203fd592a4716bbe6e70a527c333d8b8ee418d Mon Sep 17 00:00:00 2001
From: Ed Page
Date: Mon, 14 Feb 2022 08:21:56 -0600
Subject: [PATCH 1/2] fix(parser): Recognize URLs with passwords

---
 crates/typos/src/tokens.rs | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 46490f5..97e5fae 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -422,7 +422,7 @@ mod parser {
                 tuple((char(':'), char('/'), char('/'))),
             )),
             tuple((
-                opt(terminated(take_while1(is_localport_char), char('@'))),
+                opt(terminated(url_userinfo, char('@'))),
                 take_while1(is_domain_char),
                 opt(preceded(char(':'), take_while1(AsChar::is_dec_digit))),
             )),
@@ -432,6 +432,26 @@ mod parser {
         )))(input)
     }
 
+    fn url_userinfo<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + Clone
+            + std::fmt::Debug,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        recognize(tuple((
+            take_while1(is_localport_char),
+            opt(preceded(char(':'), take_while(is_localport_char))),
+        )))(input)
+    }
+
     fn c_escape<T>(input: T) -> IResult<T, T>
     where
         T: nom::InputTakeAtPosition
@@ -1113,10 +1133,11 @@ mod test {
     fn tokenize_ignore_max_url() {
         let parser = TokenizerBuilder::new().build();
 
-        let input = "Good http://user@example.com:3142/hello?query=value&extra=two#fragment Bye";
+        let input =
+            "Good http://user:password@example.com:3142/hello?query=value&extra=two#fragment Bye";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("Good", Case::None, 0),
-            Identifier::new_unchecked("Bye", Case::None, 71),
+            Identifier::new_unchecked("Bye", Case::None, 80),
         ];
         let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);

From c3bb4adfa1b840e4a79f23924c2a872befecc37d Mon Sep 17 00:00:00 2001
From: Ed Page
Date: Mon, 14 Feb 2022 08:49:53 -0600
Subject: [PATCH 2/2]
fix(parser): Allow commas in urls

Got us closer to https://www.ietf.org/rfc/rfc3986.txt

Fixes #433
---
 crates/typos/src/tokens.rs | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 97e5fae..ba2b5e7 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -428,7 +428,7 @@ mod parser {
             )),
             char('/'),
             // HACK: Too lazy to enumerate
-            take_while(is_localport_char),
+            take_while(is_path_query_fragment),
         )))(input)
     }
 
@@ -584,6 +584,33 @@ mod parser {
             || "-().".find(c).is_some()
     }
 
+    #[inline]
+    fn is_path_query_fragment(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        is_pchar(c) || "/?#".find(c).is_some()
+    }
+
+    #[inline]
+    fn is_pchar(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        is_uri_unreserved(c) || is_uri_sub_delims(c) || "%:@".find(c).is_some()
+    }
+
+    #[inline]
+    fn is_uri_unreserved(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='z').contains(&c)
+            || ('A'..='Z').contains(&c)
+            || ('0'..='9').contains(&c)
+            || "-._~".find(c).is_some()
+    }
+
+    #[inline]
+    fn is_uri_sub_delims(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        "!$&'()*+,;=".find(c).is_some()
+    }
+
     #[inline]
     fn is_scheme_char(i: impl AsChar + Copy) -> bool {
         let c = i.as_char();
@@ -1134,10 +1161,10 @@ mod test {
         let parser = TokenizerBuilder::new().build();
 
         let input =
-            "Good http://user:password@example.com:3142/hello?query=value&extra=two#fragment Bye";
+            "Good http://user:password@example.com:3142/hello?query=value&extra=two#fragment,split Bye";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("Good", Case::None, 0),
-            Identifier::new_unchecked("Bye", Case::None, 80),
+            Identifier::new_unchecked("Bye", Case::None, 86),
         ];
         let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
         assert_eq!(expected, actual);