typos/src/checks.rs

419 lines
12 KiB
Rust
Raw Normal View History

2020-12-30 20:41:08 -05:00
use bstr::ByteSlice;
use crate::report;
use typos::tokens;
use typos::Dictionary;
pub trait Check: Send + Sync {
2020-12-30 22:17:28 -05:00
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
parser: &tokens::Tokenizer,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
2020-12-31 18:41:32 -05:00
) -> Result<(), std::io::Error>;
2020-12-30 20:41:08 -05:00
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TyposSettings {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl TyposSettings {
pub fn new() -> Self {
Default::default()
}
pub fn check_filenames(&mut self, yes: bool) -> &mut Self {
self.check_filenames = yes;
self
}
pub fn check_files(&mut self, yes: bool) -> &mut Self {
self.check_files = yes;
self
}
pub fn binary(&mut self, yes: bool) -> &mut Self {
self.binary = yes;
self
}
pub fn build_typos(&self) -> Typos {
Typos {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_identifier_parser(&self) -> Identifiers {
Identifiers {
2020-12-30 20:41:08 -05:00
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_word_parser(&self) -> Words {
Words {
2020-12-30 20:41:08 -05:00
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_files(&self) -> FoundFiles {
FoundFiles {
binary: self.binary,
}
2020-12-30 20:41:08 -05:00
}
}
impl Default for TyposSettings {
fn default() -> Self {
Self {
check_filenames: true,
check_files: true,
binary: false,
}
}
}
#[derive(Debug, Clone)]
pub struct Typos {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl Check for Typos {
2020-12-31 18:41:32 -05:00
fn check_file(
2020-12-30 20:41:08 -05:00
&self,
2020-12-31 18:41:32 -05:00
path: &std::path::Path,
explicit: bool,
2020-12-30 20:41:08 -05:00
tokenizer: &tokens::Tokenizer,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let parser = typos::ParserBuilder::new()
.tokenizer(tokenizer)
.dictionary(dictionary)
.typos();
2020-12-31 18:41:32 -05:00
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
for typo in parser.parse_str(file_name) {
let msg = report::Typo {
context: Some(report::PathContext { path }.into()),
buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()),
byte_offset: typo.byte_offset,
typo: typo.typo.as_ref(),
2020-12-31 18:41:32 -05:00
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
2020-12-30 20:41:08 -05:00
}
2020-12-31 18:41:32 -05:00
if self.check_files {
let buffer = read_file(path, reporter)?;
let (buffer, content_type) = massage_data(buffer)?;
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
let mut accum_line_num = AccumulateLineNum::new();
for typo in parser.parse_bytes(&buffer) {
let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
let msg = report::Typo {
context: Some(report::FileContext { path, line_num }.into()),
buffer: std::borrow::Cow::Borrowed(line),
byte_offset: line_offset,
typo: typo.typo.as_ref(),
2020-12-31 18:41:32 -05:00
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
}
2020-12-30 20:41:08 -05:00
2020-12-31 18:41:32 -05:00
Ok(())
2020-12-30 20:41:08 -05:00
}
}
#[derive(Debug, Clone)]
pub struct Identifiers {
2020-12-30 20:41:08 -05:00
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl Check for Identifiers {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
2020-12-30 20:41:08 -05:00
tokenizer: &tokens::Tokenizer,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let parser = typos::ParserBuilder::new()
.tokenizer(tokenizer)
.identifiers();
2020-12-31 18:41:32 -05:00
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
for word in parser.parse_str(file_name) {
let msg = report::Parse {
context: Some(report::PathContext { path }.into()),
kind: report::ParseKind::Identifier,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
}
2020-12-31 18:41:32 -05:00
if self.check_files {
let buffer = read_file(path, reporter)?;
let (buffer, content_type) = massage_data(buffer)?;
2020-12-31 18:41:32 -05:00
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
for word in parser.parse_bytes(&buffer) {
2020-12-31 18:41:32 -05:00
// HACK: Don't look up the line_num per entry to better match the performance
// of Typos for comparison purposes. We don't really get much out of it
// anyway.
let line_num = 0;
let msg = report::Parse {
2020-12-31 18:41:32 -05:00
context: Some(report::FileContext { path, line_num }.into()),
kind: report::ParseKind::Identifier,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
2020-12-30 20:41:08 -05:00
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct Words {
2020-12-30 20:41:08 -05:00
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl Check for Words {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
2020-12-30 20:41:08 -05:00
tokenizer: &tokens::Tokenizer,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let parser = typos::ParserBuilder::new().tokenizer(tokenizer).words();
2020-12-31 18:41:32 -05:00
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
for word in parser.parse_str(file_name) {
let msg = report::Parse {
context: Some(report::PathContext { path }.into()),
kind: report::ParseKind::Word,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
}
2020-12-31 18:41:32 -05:00
if self.check_files {
let buffer = read_file(path, reporter)?;
let (buffer, content_type) = massage_data(buffer)?;
2020-12-31 18:41:32 -05:00
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
for word in parser.parse_bytes(&buffer) {
2020-12-31 18:41:32 -05:00
// HACK: Don't look up the line_num per entry to better match the performance
// of Typos for comparison purposes. We don't really get much out of it
// anyway.
let line_num = 0;
let msg = report::Parse {
2020-12-31 18:41:32 -05:00
context: Some(report::FileContext { path, line_num }.into()),
kind: report::ParseKind::Word,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
2020-12-30 20:41:08 -05:00
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct FoundFiles {
binary: bool,
}
2020-12-30 20:41:08 -05:00
impl Check for FoundFiles {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
_parser: &tokens::Tokenizer,
_dictionary: &dyn Dictionary,
2020-12-30 20:41:08 -05:00
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
// Check `self.binary` first so we can easily check performance of walking vs reading
if self.binary {
let msg = report::File::new(path);
reporter.report(msg.into())?;
} else {
let buffer = read_file(path, reporter)?;
let (_buffer, content_type) = massage_data(buffer)?;
if !explicit && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
let msg = report::File::new(path);
reporter.report(msg.into())?;
}
}
2020-12-30 20:41:08 -05:00
Ok(())
}
}
fn read_file(
path: &std::path::Path,
reporter: &dyn report::Report,
) -> Result<Vec<u8>, std::io::Error> {
let buffer = match std::fs::read(path) {
Ok(buffer) => buffer,
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
Vec::new()
}
};
Ok(buffer)
}
fn massage_data(
buffer: Vec<u8>,
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
let mut content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8
{
content_type = content_inspector::ContentType::BINARY;
}
Ok((buffer, content_type))
}
2020-12-31 18:41:32 -05:00
struct AccumulateLineNum {
line_num: usize,
last_offset: usize,
}
impl AccumulateLineNum {
fn new() -> Self {
Self {
// 1-indexed
line_num: 1,
last_offset: 0,
}
}
fn line_num(&mut self, buffer: &[u8], byte_offset: usize) -> usize {
assert!(self.last_offset <= byte_offset);
let slice = &buffer[self.last_offset..byte_offset];
let newlines = slice.lines().count();
let line_num = self.line_num + newlines;
self.line_num = line_num;
self.last_offset = byte_offset;
line_num
}
}
fn extract_line(buffer: &[u8], byte_offset: usize) -> (&[u8], usize) {
let line_start = buffer[0..byte_offset]
.rfind_byte(b'\n')
// Skip the newline
.map(|s| s + 1)
.unwrap_or(0);
let line = buffer[line_start..]
.lines()
.next()
.expect("should always be at least a line");
let line_offset = byte_offset - line_start;
(line, line_offset)
}
2020-12-30 20:41:08 -05:00
pub fn check_path(
2020-03-23 21:37:06 -04:00
walk: ignore::Walk,
2020-12-30 20:41:08 -05:00
checks: &dyn Check,
parser: &typos::tokens::Tokenizer,
2020-03-23 21:37:06 -04:00
dictionary: &dyn typos::Dictionary,
2020-12-30 20:41:08 -05:00
reporter: &dyn report::Report,
2020-11-23 11:08:38 -05:00
) -> Result<(), ignore::Error> {
2020-03-23 21:37:06 -04:00
for entry in walk {
2020-11-16 21:02:10 -05:00
check_entry(entry, checks, parser, dictionary, reporter)?;
2020-03-23 21:37:06 -04:00
}
2020-11-16 21:02:10 -05:00
Ok(())
2020-03-23 21:37:06 -04:00
}
2020-12-30 20:41:08 -05:00
pub fn check_path_parallel(
2020-03-23 21:37:06 -04:00
walk: ignore::WalkParallel,
2020-12-30 20:41:08 -05:00
checks: &dyn Check,
parser: &typos::tokens::Tokenizer,
2020-03-23 21:37:06 -04:00
dictionary: &dyn typos::Dictionary,
2020-12-30 20:41:08 -05:00
reporter: &dyn report::Report,
2020-11-23 11:08:38 -05:00
) -> Result<(), ignore::Error> {
let error: std::sync::Mutex<Result<(), ignore::Error>> = std::sync::Mutex::new(Ok(()));
2020-03-23 21:37:06 -04:00
walk.run(|| {
Box::new(|entry: Result<ignore::DirEntry, ignore::Error>| {
match check_entry(entry, checks, parser, dictionary, reporter) {
2020-11-16 21:02:10 -05:00
Ok(()) => ignore::WalkState::Continue,
2020-03-23 21:37:06 -04:00
Err(err) => {
2020-11-16 21:02:10 -05:00
*error.lock().unwrap() = Err(err);
ignore::WalkState::Quit
2020-03-23 21:37:06 -04:00
}
}
})
});
2020-11-16 21:02:10 -05:00
error.into_inner().unwrap()
2020-03-23 21:37:06 -04:00
}
fn check_entry(
entry: Result<ignore::DirEntry, ignore::Error>,
2020-12-30 20:41:08 -05:00
checks: &dyn Check,
parser: &typos::tokens::Tokenizer,
2020-03-23 21:37:06 -04:00
dictionary: &dyn typos::Dictionary,
2020-12-30 20:41:08 -05:00
reporter: &dyn report::Report,
2020-11-23 11:08:38 -05:00
) -> Result<(), ignore::Error> {
2020-03-23 21:37:06 -04:00
let entry = entry?;
if entry.file_type().map(|t| t.is_file()).unwrap_or(true) {
let explicit = entry.depth() == 0;
2020-12-30 22:17:28 -05:00
checks.check_file(entry.path(), explicit, parser, dictionary, reporter)?;
2020-03-23 21:37:06 -04:00
}
2020-11-16 21:02:10 -05:00
Ok(())
2020-03-23 21:37:06 -04:00
}