mirror of https://github.com/crate-ci/typos.git
synced 2024-11-24 18:10:56 -05:00
fix: Improve the quality of symbols being reported
commit d78713dba1
parent c7ca904401
2 changed files with 25 additions and 26 deletions:

src/lib.rs    (30 lines changed)
src/tokens.rs (21 lines changed)
src/lib.rs:

@@ -21,22 +21,20 @@ pub fn process_file(
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokens::Symbol::parse(line) {
-            if let Ok(word) = std::str::from_utf8(token.token) {
-                // Correct tokens as-is
-                if let Some(correction) = dictionary.correct_str(word) {
-                    let col_num = token.offset;
-                    let msg = report::Message {
-                        path,
-                        line,
-                        line_num,
-                        col_num,
-                        word,
-                        correction,
-                        non_exhaustive: (),
-                    };
-                    report(msg);
-                }
+        for symbol in tokens::Symbol::parse(line) {
+            // Correct tokens as-is
+            if let Some(correction) = dictionary.correct_str(symbol.token) {
+                let col_num = symbol.offset;
+                let msg = report::Message {
+                    path,
+                    line,
+                    line_num,
+                    col_num,
+                    word: symbol.token,
+                    correction,
+                    non_exhaustive: (),
+                };
+                report(msg);
             }
         }
     }
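Note: this hunk moves UTF-8 validation out of process_file and into the tokenizer, so the call site consumes &str tokens directly instead of unpacking &[u8]. A minimal, self-contained sketch of the new call-site shape; the Dictionary type and its toy correct_str rule are hypothetical stand-ins, and only Symbol's shape mirrors this commit:

// Sketch only: `Dictionary` and `correct_str` are hypothetical stand-ins
// for the real dictionary in this crate.
struct Symbol<'t> {
    token: &'t str, // was &'t [u8] before this commit
    offset: usize,
}

struct Dictionary;

impl Dictionary {
    // Toy rule: correct "teh" to "the", otherwise no correction.
    fn correct_str(&self, word: &str) -> Option<&'static str> {
        if word == "teh" {
            Some("the")
        } else {
            None
        }
    }
}

fn main() {
    let dictionary = Dictionary;
    let symbols = [
        Symbol { token: "teh", offset: 0 },
        Symbol { token: "quick", offset: 4 },
    ];
    for symbol in symbols {
        // No str::from_utf8 at the call site any more: token is already &str.
        if let Some(correction) = dictionary.correct_str(symbol.token) {
            println!("{}: {} -> {}", symbol.offset, symbol.token, correction);
        }
    }
}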
src/tokens.rs:

@@ -1,11 +1,11 @@
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Symbol<'t> {
-    pub token: &'t [u8],
+    pub token: &'t str,
     pub offset: usize,
 }
 
 impl<'t> Symbol<'t> {
-    pub fn new(token: &'t [u8], offset: usize) -> Self {
+    pub fn new(token: &'t str, offset: usize) -> Self {
         Self { token, offset }
     }
 
@@ -15,9 +15,10 @@ impl<'t> Symbol<'t> {
             #[allow(clippy::invalid_regex)]
             static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
         }
-        SPLIT
-            .find_iter(content)
-            .map(|m| Symbol::new(m.as_bytes(), m.start()))
+        SPLIT.find_iter(content).filter_map(|m| {
+            let s = std::str::from_utf8(m.as_bytes()).ok();
+            s.map(|s| Symbol::new(s, m.start()))
+        })
     }
 }
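Note: the parse change swaps map for filter_map, so any regex match that fails UTF-8 validation is silently dropped instead of reaching callers as raw bytes. A standalone sketch of the same pattern, assuming only the regex crate; the lazy_static caching from the commit is omitted for brevity:

// Standalone sketch of the filter_map pattern above; requires the `regex`
// crate. The pattern string matches the one in this commit.
fn parse(content: &[u8]) -> Vec<(&str, usize)> {
    let split = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
    split
        .find_iter(content)
        .filter_map(|m| {
            // A match that is not valid UTF-8 yields None and is skipped.
            let s = std::str::from_utf8(m.as_bytes()).ok();
            s.map(|s| (s, m.start()))
        })
        .collect()
}

fn main() {
    // Prints [("A", 0), ("B", 3)].
    println!("{:?}", parse(b"A::B"));
}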
@@ -36,7 +37,7 @@ mod test {
     #[test]
     fn tokenize_word_is_word() {
         let input = b"word";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("word", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -44,7 +45,7 @@ mod test {
     #[test]
     fn tokenize_space_separated_words() {
         let input = b"A B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -52,7 +53,7 @@ mod test {
     #[test]
     fn tokenize_dot_separated_words() {
         let input = b"A.B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -60,7 +61,7 @@ mod test {
     #[test]
     fn tokenize_namespace_separated_words() {
         let input = b"A::B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 3)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
@@ -68,7 +69,7 @@ mod test {
     #[test]
     fn tokenize_underscore_doesnt_separate() {
         let input = b"A_B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A_B", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
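Note: the test changes above are purely mechanical, swapping b"..." literals for "..." now that Symbol::new takes &str. A hypothetical usage sketch, not part of this commit, assuming the Symbol type from this diff is in scope:

fn main() {
    // Prints "0: hello" then "7: world"; no lossy byte-to-string
    // conversion is needed since token is already &str.
    for symbol in Symbol::parse(b"hello::world") {
        println!("{}: {}", symbol.offset, symbol.token);
    }
}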