diff --git a/src/lib.rs b/src/lib.rs
index 3d91820..6226f80 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,22 +21,20 @@ pub fn process_file(
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokens::Symbol::parse(line) {
-            if let Ok(word) = std::str::from_utf8(token.token) {
-                // Correct tokens as-is
-                if let Some(correction) = dictionary.correct_str(word) {
-                    let col_num = token.offset;
-                    let msg = report::Message {
-                        path,
-                        line,
-                        line_num,
-                        col_num,
-                        word,
-                        correction,
-                        non_exhaustive: (),
-                    };
-                    report(msg);
-                }
+        for symbol in tokens::Symbol::parse(line) {
+            // Correct tokens as-is
+            if let Some(correction) = dictionary.correct_str(symbol.token) {
+                let col_num = symbol.offset;
+                let msg = report::Message {
+                    path,
+                    line,
+                    line_num,
+                    col_num,
+                    word: symbol.token,
+                    correction,
+                    non_exhaustive: (),
+                };
+                report(msg);
             }
         }
     }
diff --git a/src/tokens.rs b/src/tokens.rs
index e3621f0..dedff69 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -1,11 +1,11 @@
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Symbol<'t> {
-    pub token: &'t [u8],
+    pub token: &'t str,
     pub offset: usize,
 }

 impl<'t> Symbol<'t> {
-    pub fn new(token: &'t [u8], offset: usize) -> Self {
+    pub fn new(token: &'t str, offset: usize) -> Self {
         Self { token, offset }
     }

@@ -15,9 +15,10 @@ impl<'t> Symbol<'t> {
         #[allow(clippy::invalid_regex)]
         static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
     }
-        SPLIT
-            .find_iter(content)
-            .map(|m| Symbol::new(m.as_bytes(), m.start()))
+        SPLIT.find_iter(content).filter_map(|m| {
+            let s = std::str::from_utf8(m.as_bytes()).ok();
+            s.map(|s| Symbol::new(s, m.start()))
+        })
     }
 }

@@ -36,7 +37,7 @@ mod test {
     #[test]
     fn tokenize_word_is_word() {
         let input = b"word";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("word", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -44,7 +45,7 @@ mod test {
     #[test]
     fn tokenize_space_separated_words() {
         let input = b"A B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -52,7 +53,7 @@ mod test {
     #[test]
     fn tokenize_dot_separated_words() {
         let input = b"A.B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -60,7 +61,7 @@ mod test {
     #[test]
     fn tokenize_namespace_separated_words() {
         let input = b"A::B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 3)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -68,7 +69,7 @@ mod test {
     #[test]
     fn tokenize_underscore_doesnt_separate() {
         let input = b"A_B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A_B", 0)];
        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
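
For illustration only, not part of the patch: a minimal sketch of a hypothetical caller under the new API, assuming the `tokens` module above is in scope. The UTF-8 check now lives inside `Symbol::parse` (via `filter_map`), so callers receive `&str` tokens directly and no longer need their own `std::str::from_utf8` step, as the `process_file` hunk shows.

fn list_symbols(line: &[u8]) {
    // `symbol.token` is already a `&str`; any match whose bytes failed
    // UTF-8 validation was dropped inside `parse`.
    for symbol in tokens::Symbol::parse(line) {
        println!("{:>3}: {}", symbol.offset, symbol.token);
    }
}

fn main() {
    // Per the `tokenize_namespace_separated_words` test, this prints:
    //   0: A
    //   3: B
    list_symbols(b"A::B");
}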