Merge pull request #65 from epage/digits

fix: Ignore numbers as identifiers
This commit is contained in:
Ed Page 2019-11-01 20:13:21 -06:00 committed by GitHub
commit c05ab4f9dc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -50,7 +50,7 @@ impl ParserBuilder {
Parser { Parser {
words_str, words_str,
words_bytes, words_bytes,
ignore_hex: self.ignore_hex, ignore_hex: self.ignore_hex && self.include_digits,
} }
} }
} }
@ -78,23 +78,33 @@ impl Parser {
} }
pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let ignore_hex = self.ignore_hex;
self.words_str self.words_str
.find_iter(content) .find_iter(content)
.filter(move |m| !ignore_hex || !is_hex(m.as_str().as_bytes())) .filter(move |m| self.accept(m.as_str().as_bytes()))
.map(|m| Identifier::new_unchecked(m.as_str(), m.start())) .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
} }
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
let ignore_hex = self.ignore_hex;
self.words_bytes self.words_bytes
.find_iter(content) .find_iter(content)
.filter(move |m| !ignore_hex || !is_hex(m.as_bytes())) .filter(move |m| self.accept(m.as_bytes()))
.filter_map(|m| { .filter_map(|m| {
let s = std::str::from_utf8(m.as_bytes()).ok(); let s = std::str::from_utf8(m.as_bytes()).ok();
s.map(|s| Identifier::new_unchecked(s, m.start())) s.map(|s| Identifier::new_unchecked(s, m.start()))
}) })
} }
/// Decides whether a matched token should be yielded by the parser.
///
/// Returns `false` when `is_number` reports the token as a number, or when
/// `self.ignore_hex` is set and `is_hex` reports it as hex; returns `true` otherwise.
fn accept(&self, contents: &[u8]) -> bool {
    // Short-circuit order matters for cost only: the number check runs first,
    // and `is_hex` is consulted solely when hex filtering is enabled.
    let rejected = is_number(contents) || (self.ignore_hex && is_hex(contents));
    !rejected
}
} }
impl Default for Parser { impl Default for Parser {
@ -103,6 +113,15 @@ impl Default for Parser {
} }
} }
/// Returns `true` when `ident` is non-empty and consists solely of ASCII digits
/// and number-literal separators.
///
/// Recognised separators:
/// - `_`: number literal separator in Rust and other languages
/// - `'`: number literal separator in C++
///
/// This accepts exactly the inputs matched by the fully anchored pattern
/// `^[0-9_']+$`, but as a plain byte scan so no regex has to be compiled or
/// lazily initialised for such a small character class.
fn is_number(ident: &[u8]) -> bool {
    // The `+` quantifier in the equivalent pattern requires at least one byte;
    // `all` alone would wrongly accept the empty slice.
    !ident.is_empty()
        && ident
            .iter()
            .all(|&b| b.is_ascii_digit() || b == b'_' || b == b'\'')
}
fn is_hex(ident: &[u8]) -> bool { fn is_hex(ident: &[u8]) -> bool {
lazy_static::lazy_static! { lazy_static::lazy_static! {
// `_`: number literal separator in Rust and other languages // `_`: number literal separator in Rust and other languages