fix(token): Don't crash on parsing unicode

This commit is contained in:
Ed Page 2024-02-08 07:10:28 -06:00
parent 4c248c85ec
commit 8879269b0d

View file

@ -159,7 +159,12 @@ mod parser {
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a // or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse // proper word boundary parse
trace("identifier", take_while(1.., is_xid_continue)).parse_next(input) trace(
"identifier",
take_while(1.., is_xid_continue)
.verify(|s: &<T as Stream>::Slice| std::str::from_utf8(s.as_bstr()).is_ok()),
)
.parse_next(input)
} }
fn ignore<T>(input: &mut T) -> PResult<<T as Stream>::Slice, ()> fn ignore<T>(input: &mut T) -> PResult<<T as Stream>::Slice, ()>
@ -1310,6 +1315,18 @@ mod test {
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
// A tokenizer built with `unicode(false)` is fed the non-ASCII word
// "appliqués" and must yield no identifiers at all, whether the input is
// handed over as raw bytes (`parse_bytes`) or as a `&str` (`parse_str`).
// A panic in either path would also fail this test.
#[test]
fn tokenize_unicode_without_unicode() {
    let tokenizer = TokenizerBuilder::new().unicode(false).build();
    let text = "appliqués";
    let no_identifiers: Vec<Identifier> = Vec::new();

    // Byte-oriented entry point.
    let from_bytes: Vec<_> = tokenizer.parse_bytes(text.as_bytes()).collect();
    assert_eq!(no_identifiers, from_bytes);

    // String-oriented entry point; same expectation.
    let from_str: Vec<_> = tokenizer.parse_str(text).collect();
    assert_eq!(no_identifiers, from_str);
}
#[test] #[test]
fn split_ident() { fn split_ident() {
let cases = [ let cases = [
@ -1365,6 +1382,7 @@ mod test {
"BFG9000", "BFG9000",
&[("BFG", Case::Upper, 0), ("9000", Case::None, 3)], &[("BFG", Case::Upper, 0), ("9000", Case::None, 3)],
), ),
("appliqués", &[("appliqués", Case::Lower, 0)]),
]; ];
for (input, expected) in cases.iter() { for (input, expected) in cases.iter() {
let ident = Identifier::new_unchecked(input, Case::None, 0); let ident = Identifier::new_unchecked(input, Case::None, 0);