refactor(parser): Move hex handling to parser

This commit is contained in:
Ed Page 2019-07-24 06:47:50 -06:00
parent d0b9979c36
commit 3cf9d8672c
2 changed files with 61 additions and 21 deletions

View file

@ -23,15 +23,12 @@ pub fn process_file(
binary: bool, binary: bool,
report: report::Report, report: report::Report,
) -> Result<bool, failure::Error> { ) -> Result<bool, failure::Error> {
let parser = tokens::Parser::new(); let parser = tokens::ParserBuilder::new().ignore_hex(ignore_hex).build();
let mut typos_found = false; let mut typos_found = false;
if check_filenames { if check_filenames {
for part in path.components().filter_map(|c| c.as_os_str().to_str()) { for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
for ident in parser.parse(part) { for ident in parser.parse(part) {
if !ignore_hex && is_hex(ident.token()) {
continue;
}
if let Some(correction) = dictionary.correct_ident(ident) { if let Some(correction) = dictionary.correct_ident(ident) {
let msg = report::FilenameCorrection { let msg = report::FilenameCorrection {
path, path,
@ -73,9 +70,6 @@ pub fn process_file(
for (line_idx, line) in buffer.lines().enumerate() { for (line_idx, line) in buffer.lines().enumerate() {
let line_num = line_idx + 1; let line_num = line_idx + 1;
for ident in parser.parse_bytes(line) { for ident in parser.parse_bytes(line) {
if !ignore_hex && is_hex(ident.token()) {
continue;
}
if let Some(correction) = dictionary.correct_ident(ident) { if let Some(correction) = dictionary.correct_ident(ident) {
let col_num = ident.offset(); let col_num = ident.offset();
let msg = report::Correction { let msg = report::Correction {
@ -112,12 +106,3 @@ pub fn process_file(
Ok(typos_found) Ok(typos_found)
} }
/// Returns `true` when `ident` is, in its entirety, a hexadecimal literal.
///
/// A hexadecimal literal is a `0x` or `0X` prefix followed by one or more
/// characters drawn from the hex digits and the digit separators:
///
/// - `_`: number literal separator in Rust and other languages
/// - `'`: number literal separator in C++
///
/// The prefix alone (`"0x"`) is not a literal, and nothing may precede the
/// prefix or follow the digits.
fn is_hex(ident: &str) -> bool {
    // The check is a fixed ASCII prefix plus a per-byte class test, so a plain
    // scan is enough; no lazily-initialised global regex is required.
    if !(ident.starts_with("0x") || ident.starts_with("0X")) {
        return false;
    }
    // The prefix is two ASCII bytes, so byte offset 2 is where the digits start.
    let digits = &ident.as_bytes()[2..];
    !digits.is_empty()
        && digits
            .iter()
            .all(|&b| b.is_ascii_hexdigit() || b == b'_' || b == b'\'')
}

View file

@ -7,13 +7,20 @@ pub enum Case {
} }
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
pub struct ParserBuilder {} pub struct ParserBuilder {
ignore_hex: bool,
}
impl ParserBuilder { impl ParserBuilder {
pub fn new() -> Self { pub fn new() -> Self {
Default::default() Default::default()
} }
/// Sets whether the built `Parser` drops hexadecimal literals from its output.
///
/// Takes the builder by value and hands it back so calls can be chained.
pub fn ignore_hex(self, yes: bool) -> Self {
    let mut builder = self;
    builder.ignore_hex = yes;
    builder
}
pub fn build(self) -> Parser { pub fn build(self) -> Parser {
let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#; let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#;
let words_str = regex::Regex::new(pattern).unwrap(); let words_str = regex::Regex::new(pattern).unwrap();
@ -21,6 +28,7 @@ impl ParserBuilder {
Parser { Parser {
words_str, words_str,
words_bytes, words_bytes,
ignore_hex: self.ignore_hex,
} }
} }
} }
@ -29,6 +37,7 @@ impl ParserBuilder {
pub struct Parser { pub struct Parser {
words_str: regex::Regex, words_str: regex::Regex,
words_bytes: regex::bytes::Regex, words_bytes: regex::bytes::Regex,
ignore_hex: bool,
} }
impl Parser { impl Parser {
@ -37,13 +46,19 @@ impl Parser {
} }
pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
let ignore_hex = self.ignore_hex;
self.words_str self.words_str
.find_iter(content) .find_iter(content)
.filter(move |m| !ignore_hex || !is_hex(m.as_str().as_bytes()))
.map(|m| Identifier::new_unchecked(m.as_str(), m.start())) .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
} }
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
self.words_bytes.find_iter(content).filter_map(|m| { let ignore_hex = self.ignore_hex;
self.words_bytes
.find_iter(content)
.filter(move |m| !ignore_hex || !is_hex(m.as_bytes()))
.filter_map(|m| {
let s = std::str::from_utf8(m.as_bytes()).ok(); let s = std::str::from_utf8(m.as_bytes()).ok();
s.map(|s| Identifier::new_unchecked(s, m.start())) s.map(|s| Identifier::new_unchecked(s, m.start()))
}) })
@ -56,6 +71,15 @@ impl Default for Parser {
} }
} }
/// Returns `true` when `ident` is, in its entirety, a hexadecimal literal.
///
/// A hexadecimal literal is a `0x` or `0X` prefix followed by one or more
/// bytes drawn from the hex digits and the digit separators:
///
/// - `_`: number literal separator in Rust and other languages
/// - `'`: number literal separator in C++
///
/// The prefix alone (`b"0x"`) is not a literal, and nothing may precede the
/// prefix or follow the digits.
fn is_hex(ident: &[u8]) -> bool {
    // The check is a fixed ASCII prefix plus a per-byte class test, so a plain
    // scan is enough; no lazily-initialised global regex is required.
    if !(ident.starts_with(b"0x") || ident.starts_with(b"0X")) {
        return false;
    }
    let digits = &ident[2..];
    !digits.is_empty()
        && digits
            .iter()
            .all(|&b| b.is_ascii_hexdigit() || b == b'_' || b == b'\'')
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Identifier<'t> { pub struct Identifier<'t> {
token: &'t str, token: &'t str,
@ -335,6 +359,37 @@ mod test {
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test]
fn tokenize_ignore_hex_enabled() {
    // With hex-ignoring switched on, the literal in the middle must not be
    // reported; only the two surrounding words come back.
    let text = "Hello 0xDEADBEEF World";
    let want = vec![
        Identifier::new_unchecked("Hello", 0),
        Identifier::new_unchecked("World", 17),
    ];
    let hex_skipping = ParserBuilder::new().ignore_hex(true).build();

    // Both entry points must agree: raw bytes first, then `&str`.
    let from_bytes: Vec<Identifier> = hex_skipping.parse_bytes(text.as_bytes()).collect();
    assert_eq!(want, from_bytes);

    let from_str: Vec<Identifier> = hex_skipping.parse(text).collect();
    assert_eq!(want, from_str);
}
#[test]
fn tokenize_ignore_hex_disabled() {
    // With hex-ignoring switched off, the literal is reported like any other
    // token, at its own offset between the two words.
    let text = "Hello 0xDEADBEEF World";
    let want = vec![
        Identifier::new_unchecked("Hello", 0),
        Identifier::new_unchecked("0xDEADBEEF", 6),
        Identifier::new_unchecked("World", 17),
    ];
    let hex_keeping = ParserBuilder::new().ignore_hex(false).build();

    // Both entry points must agree: raw bytes first, then `&str`.
    let from_bytes: Vec<Identifier> = hex_keeping.parse_bytes(text.as_bytes()).collect();
    assert_eq!(want, from_bytes);

    let from_str: Vec<Identifier> = hex_keeping.parse(text).collect();
    assert_eq!(want, from_str);
}
#[test] #[test]
fn split_ident() { fn split_ident() {
let cases = [ let cases = [