From 8879269b0d6c74ac3aec50c2beed5eb4bb16669a Mon Sep 17 00:00:00 2001
From: Ed Page
Date: Thu, 8 Feb 2024 07:10:28 -0600
Subject: [PATCH] fix(token): Don't crash on parsing unicode

---
 crates/typos/src/tokens.rs | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 796de44..8b3332e 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -159,7 +159,12 @@ mod parser {
         // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
         // or unexpected cases than strip off start characters to a word since we aren't doing a
         // proper word boundary parse
-        trace("identifier", take_while(1.., is_xid_continue)).parse_next(input)
+        trace(
+            "identifier",
+            take_while(1.., is_xid_continue)
+                .verify(|s: &<T as Stream>::Slice| std::str::from_utf8(s.as_bstr()).is_ok()),
+        )
+        .parse_next(input)
     }
 
     fn ignore<T>(input: &mut T) -> PResult<<T as Stream>::Slice, ()>
@@ -1310,6 +1315,18 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_unicode_without_unicode() {
+        let parser = TokenizerBuilder::new().unicode(false).build();
+
+        let input = "appliqués";
+        let expected: Vec<Identifier> = vec![];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn split_ident() {
         let cases = [
@@ -1365,6 +1382,7 @@ mod test {
             (
                 "BFG9000",
                 &[("BFG", Case::Upper, 0), ("9000", Case::None, 3)],
             ),
+            ("appliqués", &[("appliqués", Case::Lower, 0)]),
         ];
         for (input, expected) in cases.iter() {
             let ident = Identifier::new_unchecked(input, Case::None, 0);
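
Note (not part of the patch itself): a minimal usage sketch of the behavior the new
tokenize_unicode_without_unicode test pins down. It relies only on calls that appear in that
test (TokenizerBuilder::new(), unicode(false), build(), parse_str); the typos::tokens import
path is assumed from the file being patched. With unicode disabled, the verify() guard added
to the identifier parser rejects candidate slices that are not valid UTF-8, so non-ASCII input
such as "appliqués" produces no identifiers rather than crashing the parse.

    use typos::tokens::TokenizerBuilder;

    fn main() {
        // ASCII-only tokenizer; per the commit subject, non-ASCII input could
        // previously crash this path.
        let parser = TokenizerBuilder::new().unicode(false).build();

        // With the UTF-8 verify() guard in place, the new test expects this
        // input to yield no identifiers at all.
        let identifiers: Vec<_> = parser.parse_str("appliqués").collect();
        assert!(identifiers.is_empty());
    }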