fix: Improve the quality of symbols being reported

Ed Page 2019-06-14 15:57:41 -06:00
parent c7ca904401
commit d78713dba1
2 changed files with 25 additions and 26 deletions

@@ -21,22 +21,20 @@ pub fn process_file(
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokens::Symbol::parse(line) {
-            if let Ok(word) = std::str::from_utf8(token.token) {
-                // Correct tokens as-is
-                if let Some(correction) = dictionary.correct_str(word) {
-                    let col_num = token.offset;
-                    let msg = report::Message {
-                        path,
-                        line,
-                        line_num,
-                        col_num,
-                        word,
-                        correction,
-                        non_exhaustive: (),
-                    };
-                    report(msg);
-                }
-            }
+        for symbol in tokens::Symbol::parse(line) {
+            // Correct tokens as-is
+            if let Some(correction) = dictionary.correct_str(symbol.token) {
+                let col_num = symbol.offset;
+                let msg = report::Message {
+                    path,
+                    line,
+                    line_num,
+                    col_num,
+                    word: symbol.token,
+                    correction,
+                    non_exhaustive: (),
+                };
+                report(msg);
+            }
         }
     }

@@ -1,11 +1,11 @@
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Symbol<'t> {
-    pub token: &'t [u8],
+    pub token: &'t str,
     pub offset: usize,
 }

 impl<'t> Symbol<'t> {
-    pub fn new(token: &'t [u8], offset: usize) -> Self {
+    pub fn new(token: &'t str, offset: usize) -> Self {
         Self { token, offset }
     }
@@ -15,9 +15,10 @@ impl<'t> Symbol<'t> {
             #[allow(clippy::invalid_regex)]
             static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
         }
-        SPLIT
-            .find_iter(content)
-            .map(|m| Symbol::new(m.as_bytes(), m.start()))
+        SPLIT.find_iter(content).filter_map(|m| {
+            let s = std::str::from_utf8(m.as_bytes()).ok();
+            s.map(|s| Symbol::new(s, m.start()))
+        })
     }
 }
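
With this change the UTF-8 check lives inside Symbol::parse, so callers receive &str tokens directly. A minimal sketch of the caller side under the new signature; the helper name collect_words is hypothetical and not part of this commit:

// Hypothetical helper, assuming the new parse() above: tokens arrive as
// &str, so no std::str::from_utf8 is needed at the call site.
fn collect_words(line: &[u8]) -> Vec<&str> {
    Symbol::parse(line).map(|symbol| symbol.token).collect()
}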
@@ -36,7 +37,7 @@ mod test {
     #[test]
     fn tokenize_word_is_word() {
         let input = b"word";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("word", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -44,7 +45,7 @@ mod test {
     #[test]
     fn tokenize_space_separated_words() {
         let input = b"A B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -52,7 +53,7 @@ mod test {
     #[test]
     fn tokenize_dot_separated_words() {
         let input = b"A.B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -60,7 +61,7 @@ mod test {
     #[test]
     fn tokenize_namespace_separated_words() {
         let input = b"A::B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 3)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -68,7 +69,7 @@ mod test {
     #[test]
     fn tokenize_underscore_doesnt_separate() {
         let input = b"A_B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A_B", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
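
The filter_map path above implies one case these tests do not cover: a byte sequence that is not valid UTF-8 should simply produce no Symbol. A hedged sketch of such a test, not part of this commit, assuming the regex skips the invalid byte so the surrounding words still match:

#[test]
fn tokenize_ignores_invalid_utf8_bytes() {
    // 0xFF is not valid UTF-8; only the surrounding words should be reported.
    let input = b"A \xFF B";
    let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 4)];
    let actual: Vec<_> = Symbol::parse(input).collect();
    assert_eq!(expected, actual);
}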