use bstr::ByteSlice; use crate::report; use typos::tokens; use typos::Dictionary; pub trait Check: Send + Sync { fn check_file( &self, path: &std::path::Path, explicit: bool, parser: &tokens::Tokenizer, dictionary: &dyn Dictionary, reporter: &dyn report::Report, ) -> Result<(), std::io::Error>; } #[derive(Debug, Clone, PartialEq, Eq)] pub struct TyposSettings { check_filenames: bool, check_files: bool, binary: bool, } impl TyposSettings { pub fn new() -> Self { Default::default() } pub fn check_filenames(&mut self, yes: bool) -> &mut Self { self.check_filenames = yes; self } pub fn check_files(&mut self, yes: bool) -> &mut Self { self.check_files = yes; self } pub fn binary(&mut self, yes: bool) -> &mut Self { self.binary = yes; self } pub fn build_typos(&self) -> Typos { Typos { check_filenames: self.check_filenames, check_files: self.check_files, binary: self.binary, } } pub fn build_identifier_parser(&self) -> Identifiers { Identifiers { check_filenames: self.check_filenames, check_files: self.check_files, binary: self.binary, } } pub fn build_word_parser(&self) -> Words { Words { check_filenames: self.check_filenames, check_files: self.check_files, binary: self.binary, } } pub fn build_files(&self) -> FoundFiles { FoundFiles { binary: self.binary, } } } impl Default for TyposSettings { fn default() -> Self { Self { check_filenames: true, check_files: true, binary: false, } } } #[derive(Debug, Clone)] pub struct Typos { check_filenames: bool, check_files: bool, binary: bool, } impl Check for Typos { fn check_file( &self, path: &std::path::Path, explicit: bool, tokenizer: &tokens::Tokenizer, dictionary: &dyn Dictionary, reporter: &dyn report::Report, ) -> Result<(), std::io::Error> { let parser = typos::ParserBuilder::new() .tokenizer(tokenizer) .dictionary(dictionary) .typos(); if self.check_filenames { if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { for typo in parser.parse_str(file_name) { let msg = report::Typo { context: Some(report::PathContext { path }.into()), buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()), byte_offset: typo.byte_offset, typo: typo.typo, corrections: typo.corrections, }; reporter.report(msg.into())?; } } } if self.check_files { let buffer = read_file(path, reporter)?; let (buffer, content_type) = massage_data(buffer)?; if !explicit && !self.binary && content_type.is_binary() { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { let mut accum_line_num = AccumulateLineNum::new(); for typo in parser.parse_bytes(&buffer) { let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); let (line, line_offset) = extract_line(&buffer, typo.byte_offset); let msg = report::Typo { context: Some(report::FileContext { path, line_num }.into()), buffer: std::borrow::Cow::Borrowed(line), byte_offset: line_offset, typo: typo.typo, corrections: typo.corrections, }; reporter.report(msg.into())?; } } } Ok(()) } } #[derive(Debug, Clone)] pub struct Identifiers { check_filenames: bool, check_files: bool, binary: bool, } impl Check for Identifiers { fn check_file( &self, path: &std::path::Path, explicit: bool, tokenizer: &tokens::Tokenizer, _dictionary: &dyn Dictionary, reporter: &dyn report::Report, ) -> Result<(), std::io::Error> { let parser = typos::ParserBuilder::new() .tokenizer(tokenizer) .identifiers(); if self.check_filenames { if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { for word in parser.parse_str(file_name) { let msg = report::Parse { context: Some(report::PathContext { path }.into()), kind: report::ParseKind::Identifier, data: word.token(), }; reporter.report(msg.into())?; } } } if self.check_files { let buffer = read_file(path, reporter)?; let (buffer, content_type) = massage_data(buffer)?; if !explicit && !self.binary && content_type.is_binary() { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { for word in parser.parse_bytes(&buffer) { // HACK: Don't look up the line_num per entry to better match the performance // of Typos for comparison purposes. We don't really get much out of it // anyway. let line_num = 0; let msg = report::Parse { context: Some(report::FileContext { path, line_num }.into()), kind: report::ParseKind::Identifier, data: word.token(), }; reporter.report(msg.into())?; } } } Ok(()) } } #[derive(Debug, Clone)] pub struct Words { check_filenames: bool, check_files: bool, binary: bool, } impl Check for Words { fn check_file( &self, path: &std::path::Path, explicit: bool, tokenizer: &tokens::Tokenizer, _dictionary: &dyn Dictionary, reporter: &dyn report::Report, ) -> Result<(), std::io::Error> { let parser = typos::ParserBuilder::new().tokenizer(tokenizer).words(); if self.check_filenames { if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { for word in parser.parse_str(file_name) { let msg = report::Parse { context: Some(report::PathContext { path }.into()), kind: report::ParseKind::Word, data: word.token(), }; reporter.report(msg.into())?; } } } if self.check_files { let buffer = read_file(path, reporter)?; let (buffer, content_type) = massage_data(buffer)?; if !explicit && !self.binary && content_type.is_binary() { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { for word in parser.parse_bytes(&buffer) { // HACK: Don't look up the line_num per entry to better match the performance // of Typos for comparison purposes. We don't really get much out of it // anyway. let line_num = 0; let msg = report::Parse { context: Some(report::FileContext { path, line_num }.into()), kind: report::ParseKind::Word, data: word.token(), }; reporter.report(msg.into())?; } } } Ok(()) } } #[derive(Debug, Clone)] pub struct FoundFiles { binary: bool, } impl Check for FoundFiles { fn check_file( &self, path: &std::path::Path, explicit: bool, _parser: &tokens::Tokenizer, _dictionary: &dyn Dictionary, reporter: &dyn report::Report, ) -> Result<(), std::io::Error> { // Check `self.binary` first so we can easily check performance of walking vs reading if self.binary { let msg = report::File::new(path); reporter.report(msg.into())?; } else { let buffer = read_file(path, reporter)?; let (_buffer, content_type) = massage_data(buffer)?; if !explicit && content_type.is_binary() { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { let msg = report::File::new(path); reporter.report(msg.into())?; } } Ok(()) } } fn read_file( path: &std::path::Path, reporter: &dyn report::Report, ) -> Result, std::io::Error> { let buffer = match std::fs::read(path) { Ok(buffer) => buffer, Err(err) => { let msg = report::Error::new(err.to_string()); reporter.report(msg.into())?; Vec::new() } }; Ok(buffer) } fn massage_data( buffer: Vec, ) -> Result<(Vec, content_inspector::ContentType), std::io::Error> { let mut content_type = content_inspector::inspect(&buffer); // HACK: We only support UTF-8 at the moment if content_type != content_inspector::ContentType::UTF_8_BOM && content_type != content_inspector::ContentType::UTF_8 { content_type = content_inspector::ContentType::BINARY; } Ok((buffer, content_type)) } struct AccumulateLineNum { line_num: usize, last_offset: usize, } impl AccumulateLineNum { fn new() -> Self { Self { // 1-indexed line_num: 1, last_offset: 0, } } fn line_num(&mut self, buffer: &[u8], byte_offset: usize) -> usize { assert!(self.last_offset <= byte_offset); let slice = &buffer[self.last_offset..byte_offset]; let newlines = slice.lines().count(); let line_num = self.line_num + newlines; self.line_num = line_num; self.last_offset = byte_offset; line_num } } fn extract_line(buffer: &[u8], byte_offset: usize) -> (&[u8], usize) { let line_start = buffer[0..byte_offset] .rfind_byte(b'\n') // Skip the newline .map(|s| s + 1) .unwrap_or(0); let line = buffer[line_start..] .lines() .next() .expect("should always be at least a line"); let line_offset = byte_offset - line_start; (line, line_offset) } pub fn check_path( walk: ignore::Walk, checks: &dyn Check, parser: &typos::tokens::Tokenizer, dictionary: &dyn typos::Dictionary, reporter: &dyn report::Report, ) -> Result<(), ignore::Error> { for entry in walk { check_entry(entry, checks, parser, dictionary, reporter)?; } Ok(()) } pub fn check_path_parallel( walk: ignore::WalkParallel, checks: &dyn Check, parser: &typos::tokens::Tokenizer, dictionary: &dyn typos::Dictionary, reporter: &dyn report::Report, ) -> Result<(), ignore::Error> { let error: std::sync::Mutex> = std::sync::Mutex::new(Ok(())); walk.run(|| { Box::new(|entry: Result| { match check_entry(entry, checks, parser, dictionary, reporter) { Ok(()) => ignore::WalkState::Continue, Err(err) => { *error.lock().unwrap() = Err(err); ignore::WalkState::Quit } } }) }); error.into_inner().unwrap() } fn check_entry( entry: Result, checks: &dyn Check, parser: &typos::tokens::Tokenizer, dictionary: &dyn typos::Dictionary, reporter: &dyn report::Report, ) -> Result<(), ignore::Error> { let entry = entry?; if entry.file_type().map(|t| t.is_file()).unwrap_or(true) { let explicit = entry.depth() == 0; checks.check_file(entry.path(), explicit, parser, dictionary, reporter)?; } Ok(()) }