From ec307dffddf39d46428c8bdfc0af1f1e45dd06ef Mon Sep 17 00:00:00 2001 From: Ed Page Date: Thu, 18 Jul 2019 20:20:45 -0600 Subject: [PATCH] feat: Check file names Fixes #24 --- benches/file.rs | 6 ++++++ docs/about.md | 3 +-- src/lib.rs | 31 +++++++++++++++++++++++++++++++ src/main.rs | 21 +++++++++++++++++++++ src/report.rs | 27 +++++++++++++++++++++++++++ src/tokens.rs | 47 +++++++++++++++++++++++++++++++++++------------ 6 files changed, 121 insertions(+), 14 deletions(-) diff --git a/benches/file.rs b/benches/file.rs index 6656701..b937547 100644 --- a/benches/file.rs +++ b/benches/file.rs @@ -18,6 +18,7 @@ fn process_empty(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + true, false, typos::report::print_silent, ) @@ -38,6 +39,7 @@ fn process_no_tokens(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + true, false, typos::report::print_silent, ) @@ -58,6 +60,7 @@ fn process_single_token(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + true, false, typos::report::print_silent, ) @@ -78,6 +81,7 @@ fn process_sherlock(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + true, false, typos::report::print_silent, ) @@ -98,6 +102,7 @@ fn process_code(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + true, false, typos::report::print_silent, ) @@ -118,6 +123,7 @@ fn process_corpus(b: &mut test::Bencher) { sample_path.path(), &corrections, true, + true, false, typos::report::print_silent, ) diff --git a/docs/about.md b/docs/about.md index d06ae8a..88340ea 100644 --- a/docs/about.md +++ b/docs/about.md @@ -46,7 +46,7 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh | Whole-project | Yes | Yes | Yes | Yes | No | | Ignores hidden | Yes | Yes | ? | Yes | No | | Respect gitignore | Yes | Yes | ? | No | No | -| Checks filenames | No ([#24][def-24]) | No | ? | Yes | No | +| Checks filenames | Yes | No | ? | Yes | No | | API | Rust / [JSON Lines] | Rust | ? 
| Python | None | | License | MIT or Apache | AGPL | MIT | GPLv2 | GPLv2 | @@ -59,5 +59,4 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh [def-14]: https://github.com/epage/typos/issues/14 [def-17]: https://github.com/epage/typos/issues/17 [def-18]: https://github.com/epage/typos/issues/18 -[def-24]: https://github.com/epage/typos/issues/24 [def-3]: https://github.com/epage/typos/issues/3 diff --git a/src/lib.rs b/src/lib.rs index f2d9b99..8121bfb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,10 +17,41 @@ use bstr::ByteSlice; pub fn process_file( path: &std::path::Path, dictionary: &Dictionary, + check_filenames: bool, ignore_hex: bool, binary: bool, report: report::Report, ) -> Result<(), failure::Error> { + if check_filenames { + for part in path.components().filter_map(|c| c.as_os_str().to_str()) { + for ident in tokens::Identifier::parse(part) { + if !ignore_hex && is_hex(ident.token()) { + continue; + } + if let Some(correction) = dictionary.correct_ident(ident) { + let msg = report::FilenameCorrection { + path, + typo: ident.token(), + correction, + non_exhaustive: (), + }; + report(msg.into()); + } + for word in ident.split() { + if let Some(correction) = dictionary.correct_word(word) { + let msg = report::FilenameCorrection { + path, + typo: word.token(), + correction, + non_exhaustive: (), + }; + report(msg.into()); + } + } + } + } + } + let mut buffer = Vec::new(); File::open(path)?.read_to_end(&mut buffer)?; if !binary && buffer.find_byte(b'\0').is_some() { diff --git a/src/main.rs b/src/main.rs index 38f025a..2d999e4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,6 +38,16 @@ struct Options { /// Paths to check path: Vec, + #[structopt(long, raw(overrides_with = r#""check-filenames""#))] + /// Skip verifying spelling in file names. 
+ no_check_filenames: bool, + #[structopt( + long, + raw(overrides_with = r#""no-check-filenames""#), + raw(hidden = "true") + )] + check_filenames: bool, + #[structopt(long, raw(overrides_with = r#""hex""#))] /// Don't try to detect that an identifier looks like hex no_hex: bool, @@ -115,6 +125,15 @@ impl Options { self } + pub fn check_filenames(&self) -> Option<bool> { + match (self.check_filenames, self.no_check_filenames) { + (true, false) => Some(true), + (false, true) => Some(false), + (false, false) => None, + (_, _) => unreachable!("StructOpt should make this impossible"), + } + } + pub fn ignore_hex(&self) -> Option<bool> { match (self.no_hex, self.hex) { (true, false) => Some(false), @@ -197,6 +216,7 @@ fn run() -> Result<(), failure::Error> { let options = Options::from_args().infer(); let dictionary = typos::Dictionary::new(); + let check_filenames = options.check_filenames().unwrap_or(true); let ignore_hex = options.ignore_hex().unwrap_or(true); let binary = options.binary().unwrap_or(false); @@ -222,6 +242,7 @@ fn run() -> Result<(), failure::Error> { typos::process_file( entry.path(), &dictionary, + check_filenames, ignore_hex, binary, options.format.report(), diff --git a/src/report.rs b/src/report.rs index 6247264..23b5c47 100644 --- a/src/report.rs +++ b/src/report.rs @@ -7,6 +7,7 @@ use std::io::{self, Write}; pub enum Message<'m> { BinaryFile(BinaryFile<'m>), Correction(Correction<'m>), + FilenameCorrection(FilenameCorrection<'m>), } impl<'m> From<BinaryFile<'m>> for Message<'m> { @@ -21,6 +22,12 @@ impl<'m> From<Correction<'m>> for Message<'m> { } } +impl<'m> From<FilenameCorrection<'m>> for Message<'m> { + fn from(msg: FilenameCorrection<'m>) -> Self { + Message::FilenameCorrection(msg) + } +} + #[derive(Clone, Debug, Serialize)] pub struct BinaryFile<'m> { pub path: &'m std::path::Path, @@ -41,6 +48,15 @@ pub struct Correction<'m> { pub(crate) non_exhaustive: (), } +#[derive(Clone, Debug, Serialize)] +pub struct FilenameCorrection<'m> { + pub path: &'m std::path::Path, + pub typo: &'m str, + pub
correction: Cow<'m, str>, + #[serde(skip)] + pub(crate) non_exhaustive: (), +} + pub type Report = fn(msg: Message); pub fn print_silent(_: Message) {} @@ -60,6 +76,9 @@ pub fn print_brief(msg: Message) { msg.correction ); } + Message::FilenameCorrection(msg) => { + println!("{}: {} -> {}", msg.path.display(), msg.typo, msg.correction); + } } } @@ -69,6 +88,14 @@ pub fn print_long(msg: Message) { println!("Skipping binary file {}", msg.path.display(),); } Message::Correction(msg) => print_long_correction(msg), + Message::FilenameCorrection(msg) => { + println!( + "{}: error: `{}` should be `{}`", + msg.path.display(), + msg.typo, + msg.correction + ); + } } } diff --git a/src/tokens.rs b/src/tokens.rs index 1543385..2d8c09a 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -38,6 +38,17 @@ impl<'t> Identifier<'t> { Self { token, offset } } + pub fn parse(content: &str) -> impl Iterator<Item = Identifier<'_>> { + lazy_static::lazy_static! { + // Getting false positives for this lint + #[allow(clippy::invalid_regex)] + static ref SPLIT: regex::Regex = regex::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap(); + } + SPLIT + .find_iter(content) + .map(|m| Identifier::new_unchecked(m.as_str(), m.start())) + } + pub fn parse_bytes(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> { lazy_static::lazy_static!
{ // Getting false positives for this lint @@ -240,58 +251,70 @@ mod test { #[test] fn tokenize_empty_is_empty() { - let input = b""; + let input = ""; let expected: Vec<Identifier> = vec![]; - let actual: Vec<_> = Identifier::parse_bytes(input).collect(); + let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = Identifier::parse(input).collect(); assert_eq!(expected, actual); } #[test] fn tokenize_word_is_word() { - let input = b"word"; + let input = "word"; let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)]; - let actual: Vec<_> = Identifier::parse_bytes(input).collect(); + let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = Identifier::parse(input).collect(); assert_eq!(expected, actual); } #[test] fn tokenize_space_separated_words() { - let input = b"A B"; + let input = "A B"; let expected: Vec<Identifier> = vec![ Identifier::new_unchecked("A", 0), Identifier::new_unchecked("B", 2), ]; - let actual: Vec<_> = Identifier::parse_bytes(input).collect(); + let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = Identifier::parse(input).collect(); assert_eq!(expected, actual); } #[test] fn tokenize_dot_separated_words() { - let input = b"A.B"; + let input = "A.B"; let expected: Vec<Identifier> = vec![ Identifier::new_unchecked("A", 0), Identifier::new_unchecked("B", 2), ]; - let actual: Vec<_> = Identifier::parse_bytes(input).collect(); + let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = Identifier::parse(input).collect(); assert_eq!(expected, actual); } #[test] fn tokenize_namespace_separated_words() { - let input = b"A::B"; + let input = "A::B"; let expected: Vec<Identifier> = vec![ Identifier::new_unchecked("A", 0), Identifier::new_unchecked("B", 3), ]; - let actual: Vec<_> =
Identifier::parse_bytes(input).collect(); + let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = Identifier::parse(input).collect(); assert_eq!(expected, actual); } #[test] fn tokenize_underscore_doesnt_separate() { - let input = b"A_B"; + let input = "A_B"; let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)]; - let actual: Vec<_> = Identifier::parse_bytes(input).collect(); + let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = Identifier::parse(input).collect(); assert_eq!(expected, actual); }