diff --git a/benches/file.rs b/benches/file.rs
index 6656701..5f69285 100644
--- a/benches/file.rs
+++ b/benches/file.rs
@@ -18,6 +18,8 @@ fn process_empty(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -38,6 +40,8 @@ fn process_no_tokens(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -58,6 +62,8 @@ fn process_single_token(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -78,6 +84,8 @@ fn process_sherlock(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -98,6 +106,8 @@ fn process_code(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -118,6 +128,8 @@ fn process_corpus(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
+            true,
             false,
             typos::report::print_silent,
         )
diff --git a/benches/tokenize.rs b/benches/tokenize.rs
index f316581..75b1b9c 100644
--- a/benches/tokenize.rs
+++ b/benches/tokenize.rs
@@ -6,34 +6,34 @@ mod data;
 
 #[bench]
 fn symbol_parse_empty(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::EMPTY.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::EMPTY.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_no_tokens(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::NO_TOKENS.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::NO_TOKENS.as_bytes()).last());
 }
 
 #[bench]
fn symbol_parse_single_token(b: &mut test::Bencher) {
     b.iter(|| {
-        typos::tokens::Identifier::parse(data::SINGLE_TOKEN.as_bytes()).last();
+        typos::tokens::Identifier::parse_bytes(data::SINGLE_TOKEN.as_bytes()).last();
     });
 }
 
 #[bench]
 fn symbol_parse_sherlock(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::SHERLOCK.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::SHERLOCK.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_code(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::CODE.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::CODE.as_bytes()).last());
 }
 
 #[bench]
 fn symbol_parse_corpus(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::CORPUS.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::CORPUS.as_bytes()).last());
 }
 
 #[bench]
diff --git a/docs/about.md b/docs/about.md
index d06ae8a..88340ea 100644
--- a/docs/about.md
+++ b/docs/about.md
@@ -46,7 +46,7 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
 | Whole-project | Yes | Yes | Yes | Yes | No |
 | Ignores hidden | Yes | Yes | ? | Yes | No |
 | Respect gitignore | Yes | Yes | ? | No | No |
-| Checks filenames | No ([#24][def-24]) | No | ? | Yes | No |
+| Checks filenames | Yes | No | ? | Yes | No |
 | API | Rust / [JSON Lines] | Rust | ? | Python | None |
 | License | MIT or Apache | AGPL | MIT | GPLv2 | GPLv2 |
 
@@ -59,5 +59,4 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
 [def-14]: https://github.com/epage/typos/issues/14
 [def-17]: https://github.com/epage/typos/issues/17
 [def-18]: https://github.com/epage/typos/issues/18
-[def-24]: https://github.com/epage/typos/issues/24
 [def-3]: https://github.com/epage/typos/issues/3
diff --git a/src/lib.rs b/src/lib.rs
index f46af98..0463a33 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,48 +17,87 @@ use bstr::ByteSlice;
 pub fn process_file(
     path: &std::path::Path,
     dictionary: &Dictionary,
+    check_filenames: bool,
+    check_files: bool,
     ignore_hex: bool,
     binary: bool,
     report: report::Report,
 ) -> Result<(), failure::Error> {
-    let mut buffer = Vec::new();
-    File::open(path)?.read_to_end(&mut buffer)?;
-    if !binary && buffer.find_byte(b'\0').is_some() {
-        return Ok(());
+    if check_filenames {
+        for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
+            for ident in tokens::Identifier::parse(part) {
+                if !ignore_hex && is_hex(ident.token()) {
+                    continue;
+                }
+                if let Some(correction) = dictionary.correct_ident(ident) {
+                    let msg = report::FilenameCorrection {
+                        path,
+                        typo: ident.token(),
+                        correction,
+                        non_exhaustive: (),
+                    };
+                    report(msg.into());
+                }
+                for word in ident.split() {
+                    if let Some(correction) = dictionary.correct_word(word) {
+                        let msg = report::FilenameCorrection {
+                            path,
+                            typo: word.token(),
+                            correction,
+                            non_exhaustive: (),
+                        };
+                        report(msg.into());
+                    }
+                }
+            }
+        }
     }
-    for (line_idx, line) in buffer.lines().enumerate() {
-        let line_num = line_idx + 1;
-        for ident in tokens::Identifier::parse(line) {
-            if !ignore_hex && is_hex(ident.token()) {
-                continue;
-            }
-            if let Some(correction) = dictionary.correct_ident(ident) {
-                let col_num = ident.offset();
-                let msg = report::Message {
-                    path,
-                    line,
-                    line_num,
-                    col_num,
-                    typo: ident.token(),
-                    correction,
-                    non_exhaustive: (),
-                };
-                report(msg);
-            }
-            for word in ident.split() {
-                if let Some(correction) = dictionary.correct_word(word) {
-                    let col_num = word.offset();
-                    let msg = report::Message {
+
+    if check_files {
+        let mut buffer = Vec::new();
+        File::open(path)?.read_to_end(&mut buffer)?;
+        if !binary && buffer.find_byte(b'\0').is_some() {
+            let msg = report::BinaryFile {
+                path,
+                non_exhaustive: (),
+            };
+            report(msg.into());
+            return Ok(());
+        }
+
+        for (line_idx, line) in buffer.lines().enumerate() {
+            let line_num = line_idx + 1;
+            for ident in tokens::Identifier::parse_bytes(line) {
+                if !ignore_hex && is_hex(ident.token()) {
+                    continue;
+                }
+                if let Some(correction) = dictionary.correct_ident(ident) {
+                    let col_num = ident.offset();
+                    let msg = report::Correction {
                         path,
                         line,
                         line_num,
                         col_num,
-                    typo: word.token(),
+                        typo: ident.token(),
                         correction,
                         non_exhaustive: (),
                     };
-                    report(msg);
+                    report(msg.into());
+                }
+                for word in ident.split() {
+                    if let Some(correction) = dictionary.correct_word(word) {
+                        let col_num = word.offset();
+                        let msg = report::Correction {
+                            path,
+                            line,
+                            line_num,
+                            col_num,
+                            typo: word.token(),
+                            correction,
+                            non_exhaustive: (),
+                        };
+                        report(msg.into());
+                    }
                 }
             }
         }
     }
diff --git a/src/main.rs b/src/main.rs
index 38f025a..90decf3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -38,6 +38,26 @@ struct Options {
     /// Paths to check
     path: Vec<std::path::PathBuf>,
 
+    #[structopt(long, raw(overrides_with = r#""check-filenames""#))]
+    /// Skip verifying spelling in file names.
+    no_check_filenames: bool,
+    #[structopt(
+        long,
+        raw(overrides_with = r#""no-check-filenames""#),
+        raw(hidden = "true")
+    )]
+    check_filenames: bool,
+
+    #[structopt(long, raw(overrides_with = r#""check-files""#))]
+    /// Skip verifying spelling in files.
+    no_check_files: bool,
+    #[structopt(
+        long,
+        raw(overrides_with = r#""no-check-files""#),
+        raw(hidden = "true")
+    )]
+    check_files: bool,
+
     #[structopt(long, raw(overrides_with = r#""hex""#))]
     /// Don't try to detect that an identifier looks like hex
     no_hex: bool,
@@ -115,6 +135,24 @@ impl Options {
         self
     }
 
+    pub fn check_files(&self) -> Option<bool> {
+        match (self.check_files, self.no_check_files) {
+            (true, false) => Some(true),
+            (false, true) => Some(false),
+            (false, false) => None,
+            (_, _) => unreachable!("StructOpt should make this impossible"),
+        }
+    }
+
+    pub fn check_filenames(&self) -> Option<bool> {
+        match (self.check_filenames, self.no_check_filenames) {
+            (true, false) => Some(true),
+            (false, true) => Some(false),
+            (false, false) => None,
+            (_, _) => unreachable!("StructOpt should make this impossible"),
+        }
+    }
+
     pub fn ignore_hex(&self) -> Option<bool> {
         match (self.no_hex, self.hex) {
             (true, false) => Some(false),
@@ -197,6 +235,8 @@ fn run() -> Result<(), failure::Error> {
     let options = Options::from_args().infer();
     let dictionary = typos::Dictionary::new();
 
+    let check_filenames = options.check_filenames().unwrap_or(true);
+    let check_files = options.check_files().unwrap_or(true);
     let ignore_hex = options.ignore_hex().unwrap_or(true);
     let binary = options.binary().unwrap_or(false);
 
@@ -222,6 +262,8 @@ fn run() -> Result<(), failure::Error> {
             typos::process_file(
                 entry.path(),
                 &dictionary,
+                check_filenames,
+                check_files,
                 ignore_hex,
                 binary,
                 options.format.report(),
diff --git a/src/report.rs b/src/report.rs
index 129755a..23b5c47 100644
--- a/src/report.rs
+++ b/src/report.rs
@@ -2,7 +2,41 @@ use std::borrow::Cow;
 use std::io::{self, Write};
 
 #[derive(Clone, Debug, Serialize)]
-pub struct Message<'m> {
+#[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
+pub enum Message<'m> {
+    BinaryFile(BinaryFile<'m>),
+    Correction(Correction<'m>),
+    FilenameCorrection(FilenameCorrection<'m>),
+}
+
+impl<'m> From<BinaryFile<'m>> for Message<'m> {
+    fn from(msg: BinaryFile<'m>) -> Self {
+        Message::BinaryFile(msg)
+    }
+}
+
+impl<'m> From<Correction<'m>> for Message<'m> {
+    fn from(msg: Correction<'m>) -> Self {
+        Message::Correction(msg)
+    }
+}
+
+impl<'m> From<FilenameCorrection<'m>> for Message<'m> {
+    fn from(msg: FilenameCorrection<'m>) -> Self {
+        Message::FilenameCorrection(msg)
+    }
+}
+
+#[derive(Clone, Debug, Serialize)]
+pub struct BinaryFile<'m> {
+    pub path: &'m std::path::Path,
+    #[serde(skip)]
+    pub(crate) non_exhaustive: (),
+}
+
+#[derive(Clone, Debug, Serialize)]
+pub struct Correction<'m> {
     pub path: &'m std::path::Path,
     #[serde(skip)]
     pub line: &'m [u8],
@@ -14,22 +48,58 @@ pub struct Message<'m> {
     pub(crate) non_exhaustive: (),
 }
 
+#[derive(Clone, Debug, Serialize)]
+pub struct FilenameCorrection<'m> {
+    pub path: &'m std::path::Path,
+    pub typo: &'m str,
+    pub correction: Cow<'m, str>,
+    #[serde(skip)]
+    pub(crate) non_exhaustive: (),
+}
+
 pub type Report = fn(msg: Message);
 
 pub fn print_silent(_: Message) {}
 
 pub fn print_brief(msg: Message) {
-    println!(
-        "{}:{}:{}: {} -> {}",
-        msg.path.display(),
-        msg.line_num,
-        msg.col_num,
-        msg.typo,
-        msg.correction
-    );
+    match msg {
+        Message::BinaryFile(msg) => {
+            println!("Skipping binary file {}", msg.path.display());
+        }
+        Message::Correction(msg) => {
+            println!(
+                "{}:{}:{}: {} -> {}",
+                msg.path.display(),
+                msg.line_num,
+                msg.col_num,
+                msg.typo,
+                msg.correction
+            );
+        }
+        Message::FilenameCorrection(msg) => {
+            println!("{}: {} -> {}", msg.path.display(), msg.typo, msg.correction);
+        }
+    }
 }
 
 pub fn print_long(msg: Message) {
+    match msg {
+        Message::BinaryFile(msg) => {
+            println!("Skipping binary file {}", msg.path.display());
+        }
+        Message::Correction(msg) => print_long_correction(msg),
+        Message::FilenameCorrection(msg) => {
+            println!(
+                "{}: error: `{}` should be `{}`",
+                msg.path.display(),
+                msg.typo,
+                msg.correction
+            );
+        }
+    }
+}
+
+fn print_long_correction(msg: Correction) {
     let line_num = msg.line_num.to_string();
     let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();
diff --git a/src/tokens.rs b/src/tokens.rs
index 9d3727a..2d8c09a 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -14,7 +14,7 @@ pub struct Identifier<'t> {
 
 impl<'t> Identifier<'t> {
     pub fn new(token: &'t str, offset: usize) -> Result<Self, failure::Error> {
-        let mut itr = Self::parse(token.as_bytes());
+        let mut itr = Self::parse_bytes(token.as_bytes());
         let mut item = itr
             .next()
             .ok_or_else(|| failure::format_err!("Invalid ident (none found): {:?}", token))?;
@@ -38,7 +38,18 @@ impl<'t> Identifier<'t> {
         Self { token, offset }
     }
 
-    pub fn parse(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
+    pub fn parse(content: &str) -> impl Iterator<Item = Identifier<'_>> {
+        lazy_static::lazy_static! {
+            // Getting false positives for this lint
+            #[allow(clippy::invalid_regex)]
+            static ref SPLIT: regex::Regex = regex::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap();
+        }
+        SPLIT
+            .find_iter(content)
+            .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
+    }
+
+    pub fn parse_bytes(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
         lazy_static::lazy_static! {
             // Getting false positives for this lint
             #[allow(clippy::invalid_regex)]
@@ -240,57 +251,69 @@ mod test {
 
     #[test]
     fn tokenize_empty_is_empty() {
-        let input = b"";
+        let input = "";
         let expected: Vec<Identifier> = vec![];
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
         let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_word_is_word() {
-        let input = b"word";
+        let input = "word";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
         let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_space_separated_words() {
-        let input = b"A B";
+        let input = "A B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 2),
         ];
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
         let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_dot_separated_words() {
-        let input = b"A.B";
+        let input = "A.B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 2),
         ];
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
         let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_namespace_separated_words() {
-        let input = b"A::B";
+        let input = "A::B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 3),
         ];
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
         let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_underscore_doesnt_separate() {
-        let input = b"A_B";
+        let input = "A_B";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
         let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }
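
A quick sketch of how a caller could hook into the reworked report API above. The `Message` enum, its variants, the `Report` fn-pointer alias, and the `process_file` signature all come from the diff; only the `print_paths_only` reporter itself is hypothetical:

    // Hypothetical custom reporter; `Report` is `fn(msg: Message)`, so any
    // plain fn with this signature can be passed where the built-in
    // `print_silent` / `print_brief` / `print_long` reporters are used.
    fn print_paths_only(msg: typos::report::Message) {
        match msg {
            typos::report::Message::BinaryFile(msg) => {
                eprintln!("skipped binary file {}", msg.path.display());
            }
            typos::report::Message::Correction(msg) => {
                println!("{}: {} -> {}", msg.path.display(), msg.typo, msg.correction);
            }
            typos::report::Message::FilenameCorrection(msg) => {
                println!("{} (file name): {} -> {}", msg.path.display(), msg.typo, msg.correction);
            }
        }
    }

    // Argument order per the new signature:
    // check_filenames, check_files, ignore_hex, binary, report.
    typos::process_file(path, &dictionary, true, true, true, false, print_paths_only)?;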