Merge pull request #930 from epage/fix

fix(token): Don't crash on parsing unicode
This commit is contained in:
Ed Page 2024-02-08 07:22:58 -06:00 committed by GitHub
commit bf9ee1f3b1
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194

View file

@ -159,7 +159,12 @@ mod parser {
// Match `{XID_Continue}+` because XID_Continue is a superset of XID_Start, and we would rather
// catch odd or unexpected cases than strip start characters off a word, since we aren't doing a
// proper word-boundary parse.
trace("identifier", take_while(1.., is_xid_continue)).parse_next(input)
trace(
"identifier",
take_while(1.., is_xid_continue)
.verify(|s: &<T as Stream>::Slice| std::str::from_utf8(s.as_bstr()).is_ok()),
)
.parse_next(input)
}
fn ignore<T>(input: &mut T) -> PResult<<T as Stream>::Slice, ()>
@ -1310,6 +1315,18 @@ mod test {
assert_eq!(expected, actual);
}
#[test]
fn tokenize_unicode_without_unicode() {
    // A word containing a non-ASCII character, fed to a tokenizer built with
    // Unicode support switched off.
    let input = "appliqués";
    let parser = TokenizerBuilder::new().unicode(false).build();

    // Neither entry point is expected to yield any identifier for this input.
    let expected: Vec<Identifier> = Vec::new();

    // Byte-oriented entry point.
    let from_bytes: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
    assert_eq!(expected, from_bytes);

    // String-oriented entry point.
    let from_str: Vec<_> = parser.parse_str(input).collect();
    assert_eq!(expected, from_str);
}
#[test]
fn split_ident() {
let cases = [
@ -1365,6 +1382,7 @@ mod test {
"BFG9000",
&[("BFG", Case::Upper, 0), ("9000", Case::None, 3)],
),
("appliqués", &[("appliqués", Case::Lower, 0)]),
];
for (input, expected) in cases.iter() {
let ident = Identifier::new_unchecked(input, Case::None, 0);