fix: Improve the quality of symbols being reported

Ed Page 2019-06-14 15:57:41 -06:00
parent c7ca904401
commit d78713dba1
2 changed files with 25 additions and 26 deletions

@@ -21,17 +21,16 @@ pub fn process_file(
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokens::Symbol::parse(line) {
-            if let Ok(word) = std::str::from_utf8(token.token) {
+        for symbol in tokens::Symbol::parse(line) {
                 // Correct tokens as-is
-                if let Some(correction) = dictionary.correct_str(word) {
-                    let col_num = token.offset;
+                if let Some(correction) = dictionary.correct_str(symbol.token) {
+                    let col_num = symbol.offset;
                     let msg = report::Message {
                         path,
                         line,
                         line_num,
                         col_num,
-                        word,
+                        word: symbol.token,
                         correction,
                         non_exhaustive: (),
                     };
@@ -39,7 +38,6 @@ pub fn process_file(
                 }
             }
         }
-    }
 
     Ok(())
 }
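Since `Symbol::token` is now a `&str` (see the second file below), the call site no longer needs its own `std::str::from_utf8` check or the extra level of nesting around the dictionary lookup. A minimal sketch of the new loop shape, using a stand-in `Dictionary` and a hard-coded symbol list instead of the crate's real types:

#[derive(Debug, Clone, PartialEq, Eq)]
struct Symbol<'t> {
    token: &'t str,
    offset: usize,
}

struct Dictionary;

impl Dictionary {
    // Hypothetical stand-in for the crate's dictionary lookup.
    fn correct_str(&self, word: &str) -> Option<&'static str> {
        if word == "teh" {
            Some("the")
        } else {
            None
        }
    }
}

fn main() {
    let dictionary = Dictionary;
    let symbols = vec![
        Symbol { token: "teh", offset: 0 },
        Symbol { token: "word", offset: 4 },
    ];
    for symbol in symbols {
        // No std::str::from_utf8 and no extra `if let Ok(..)` nesting:
        // the token is already a &str.
        if let Some(correction) = dictionary.correct_str(symbol.token) {
            println!("{}: `{}` -> `{}`", symbol.offset, symbol.token, correction);
        }
    }
}

The sketch only shows the control flow; the real `process_file` builds a `report::Message` from the symbol, as in the hunk above.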

@@ -1,11 +1,11 @@
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Symbol<'t> {
-    pub token: &'t [u8],
+    pub token: &'t str,
     pub offset: usize,
 }
 
 impl<'t> Symbol<'t> {
-    pub fn new(token: &'t [u8], offset: usize) -> Self {
+    pub fn new(token: &'t str, offset: usize) -> Self {
         Self { token, offset }
     }
 
@@ -15,9 +15,10 @@ impl<'t> Symbol<'t> {
             #[allow(clippy::invalid_regex)]
             static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
         }
-        SPLIT
-            .find_iter(content)
-            .map(|m| Symbol::new(m.as_bytes(), m.start()))
+        SPLIT.find_iter(content).filter_map(|m| {
+            let s = std::str::from_utf8(m.as_bytes()).ok();
+            s.map(|s| Symbol::new(s, m.start()))
+        })
     }
 }
 
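This is where the UTF-8 handling moved: `parse` still scans with the same byte-oriented regex, but each match now goes through `std::str::from_utf8(..).ok()` inside `filter_map`, so any match that is not valid UTF-8 is dropped and the caller only ever sees `&str` tokens. A self-contained sketch of the same logic, assuming the `regex` crate as a dependency; the free function and the `Vec` return are stand-ins for the crate's `Symbol::parse`, which keeps the regex in a `lazy_static` and returns an iterator:

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Symbol<'t> {
    pub token: &'t str,
    pub offset: usize,
}

// Stand-in for Symbol::parse(), using the regex from the diff above.
pub fn parse(content: &[u8]) -> Vec<Symbol<'_>> {
    let split = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
    split
        .find_iter(content)
        .filter_map(|m| {
            // Matches that are not valid UTF-8 are dropped here instead of
            // being handed to the caller as raw bytes.
            std::str::from_utf8(m.as_bytes())
                .ok()
                .map(|s| Symbol { token: s, offset: m.start() })
        })
        .collect()
}

fn main() {
    for symbol in parse(b"A::B_C.d1") {
        // Offsets are byte offsets from regex::bytes::Match::start().
        println!("{} @ {}", symbol.token, symbol.offset);
    }
}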
@@ -36,7 +37,7 @@ mod test {
     #[test]
     fn tokenize_word_is_word() {
         let input = b"word";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("word", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -44,7 +45,7 @@ mod test {
     #[test]
     fn tokenize_space_separated_words() {
         let input = b"A B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -52,7 +53,7 @@ mod test {
     #[test]
     fn tokenize_dot_separated_words() {
         let input = b"A.B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -60,7 +61,7 @@ mod test {
     #[test]
     fn tokenize_namespace_separated_words() {
         let input = b"A::B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 3)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -68,7 +69,7 @@ mod test {
     #[test]
     fn tokenize_underscore_doesnt_separate() {
         let input = b"A_B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A_B", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
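Not part of this commit, but one way to see what the `&str` tokens buy: non-ASCII alphabetic input now comes back ready to use, and offsets remain byte offsets (they come from the byte regex's match start). A hypothetical extra test in the same style as the ones above; the test name and input are invented, and `wörld` starts at offset 7 because `é` takes two bytes:

    #[test]
    fn tokenize_utf8_words() {
        // Hypothetical test, not in the commit: "é" and "ö" are
        // \p{Alphabetic}, and offsets count bytes, not chars.
        let input = "héllo wörld".as_bytes();
        let expected: Vec<Symbol> = vec![Symbol::new("héllo", 0), Symbol::new("wörld", 7)];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }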