diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 85d0c31..cdbaea5 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -186,6 +186,7 @@ mod parser {
             terminated(dec_literal, sep1),
             terminated(base64_literal, sep1),
             terminated(email_literal, sep1),
+            terminated(url_literal, sep1),
             sep1,
         )))(input)
     }
@@ -321,9 +322,41 @@ mod parser {
         <T as nom::InputIter>::Item: AsChar + Copy,
     {
         recognize(tuple((
-            take_while1(is_email_localport_char),
+            take_while1(is_localport_char),
             char('@'),
-            take_while1(is_email_domain_char),
+            take_while1(is_domain_char),
+        )))(input)
+    }
+
+    fn url_literal<T>(input: T) -> IResult<T, T>
+    where
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Offset
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + std::fmt::Debug
+            + Clone,
+        <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
+    {
+        recognize(tuple((
+            opt(terminated(
+                take_while1(is_scheme_char),
+                // HACK: Technically you can skip `//` if you don't have a domain but that would
+                // get messy to support.
+                tuple((char(':'), char('/'), char('/'))),
+            )),
+            tuple((
+                opt(terminated(take_while1(is_localport_char), char('@'))),
+                take_while1(is_domain_char),
+                opt(preceded(char(':'), take_while1(AsChar::is_dec_digit))),
+            )),
+            char('/'),
+            // HACK: Too lazy to enumerate
+            take_while(is_localport_char),
         )))(input)
     }
 
@@ -393,7 +426,7 @@ mod parser {
     }
 
     #[inline]
-    fn is_email_localport_char(i: impl AsChar + Copy) -> bool {
+    fn is_localport_char(i: impl AsChar + Copy) -> bool {
         let c = i.as_char();
         ('a'..='z').contains(&c)
             || ('A'..='Z').contains(&c)
@@ -402,7 +435,7 @@ mod parser {
     }
 
     #[inline]
-    fn is_email_domain_char(i: impl AsChar + Copy) -> bool {
+    fn is_domain_char(i: impl AsChar + Copy) -> bool {
         let c = i.as_char();
         ('a'..='z').contains(&c)
             || ('A'..='Z').contains(&c)
@@ -410,6 +443,12 @@ mod parser {
             || "-().".find(c).is_some()
     }
 
+    #[inline]
+    fn is_scheme_char(i: impl AsChar + Copy) -> bool {
+        let c = i.as_char();
+        ('a'..='z').contains(&c) || ('0'..='9').contains(&c) || "+.-".find(c).is_some()
+    }
+
     #[inline]
     fn is_xid_continue(i: impl AsChar + Copy) -> bool {
         let c = i.as_char();
@@ -860,6 +899,36 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_ignore_min_url() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = "Good example.com/hello Bye";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("Good", Case::None, 0),
+            Identifier::new_unchecked("Bye", Case::None, 23),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_ignore_max_url() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = "Good http://user@example.com:3142/hello?query=value&extra=two#fragment Bye";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("Good", Case::None, 0),
+            Identifier::new_unchecked("Bye", Case::None, 71),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn tokenize_leading_digits() {
         let parser = TokenizerBuilder::new().build();