From d8ca9f9d5a39ef2dc7495d1d69dc7a5f04613791 Mon Sep 17 00:00:00 2001
From: Ed Page
Date: Wed, 23 Jan 2019 07:45:51 -0700
Subject: [PATCH] fix: Limit words to just identifiers

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index b04d263..19408e8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -23,7 +23,7 @@ impl<'t> Token<'t> {
 
 pub fn tokenize(content: &[u8]) -> impl Iterator {
     lazy_static::lazy_static! {
-        static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b\w+\b"#).unwrap();
+        static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
     }
     SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
 }