From ec307dffddf39d46428c8bdfc0af1f1e45dd06ef Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Thu, 18 Jul 2019 20:20:45 -0600
Subject: [PATCH] feat: Check file names

Fixes #24
---
 benches/file.rs |  6 ++++++
 docs/about.md   |  3 +--
 src/lib.rs      | 31 +++++++++++++++++++++++++++++++
 src/main.rs     | 21 +++++++++++++++++++++
 src/report.rs   | 27 +++++++++++++++++++++++++++
 src/tokens.rs   | 47 +++++++++++++++++++++++++++++++++++------------
 6 files changed, 121 insertions(+), 14 deletions(-)

diff --git a/benches/file.rs b/benches/file.rs
index 6656701..b937547 100644
--- a/benches/file.rs
+++ b/benches/file.rs
@@ -18,6 +18,7 @@ fn process_empty(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -38,6 +39,7 @@ fn process_no_tokens(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -58,6 +60,7 @@ fn process_single_token(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -78,6 +81,7 @@ fn process_sherlock(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -98,6 +102,7 @@ fn process_code(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
             false,
             typos::report::print_silent,
         )
@@ -118,6 +123,7 @@ fn process_corpus(b: &mut test::Bencher) {
             sample_path.path(),
             &corrections,
             true,
+            true,
             false,
             typos::report::print_silent,
         )
diff --git a/docs/about.md b/docs/about.md
index d06ae8a..88340ea 100644
--- a/docs/about.md
+++ b/docs/about.md
@@ -46,7 +46,7 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
 | Whole-project  | Yes                   | Yes                             | Yes                             | Yes         | No          |
 | Ignores hidden | Yes                   | Yes                             | ?                               | Yes         | No          |
 | Respect gitignore | Yes                | Yes                             | ?                               | No          | No          |
-| Checks filenames | No ([#24][def-24])  | No                              | ?                               | Yes         | No          |
+| Checks filenames | Yes                 | No                              | ?                               | Yes         | No          |
 | API            | Rust / [JSON Lines]   | Rust                            | ?                               | Python      | None        |
 | License        | MIT or Apache         | AGPL                            | MIT                             | GPLv2       | GPLv2       |
 
@@ -59,5 +59,4 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
 [def-14]: https://github.com/epage/typos/issues/14
 [def-17]: https://github.com/epage/typos/issues/17
 [def-18]: https://github.com/epage/typos/issues/18
-[def-24]: https://github.com/epage/typos/issues/24
 [def-3]: https://github.com/epage/typos/issues/3
diff --git a/src/lib.rs b/src/lib.rs
index f2d9b99..8121bfb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,10 +17,48 @@ use bstr::ByteSlice;
 pub fn process_file(
     path: &std::path::Path,
     dictionary: &Dictionary,
+    check_filenames: bool,
     ignore_hex: bool,
     binary: bool,
     report: report::Report,
 ) -> Result<(), failure::Error> {
+    // Spell-check the path itself, one component at a time. Components that
+    // are not valid UTF-8 are skipped because the tokenizer works on `&str`.
+    if check_filenames {
+        for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
+            for ident in tokens::Identifier::parse(part) {
+                // `ignore_hex` asks for hex-looking identifiers to be left
+                // alone, so they are skipped only when the flag is set.
+                // NOTE(review): keep any other `is_hex` check in this function in sync.
+                if ignore_hex && is_hex(ident.token()) {
+                    continue;
+                }
+                // Try the identifier as a whole first...
+                if let Some(correction) = dictionary.correct_ident(ident) {
+                    let msg = report::FilenameCorrection {
+                        path,
+                        typo: ident.token(),
+                        correction,
+                        non_exhaustive: (),
+                    };
+                    report(msg.into());
+                }
+                // ...then each word it splits into.
+                for word in ident.split() {
+                    if let Some(correction) = dictionary.correct_word(word) {
+                        let msg = report::FilenameCorrection {
+                            path,
+                            typo: word.token(),
+                            correction,
+                            non_exhaustive: (),
+                        };
+                        report(msg.into());
+                    }
+                }
+            }
+        }
+    }
+
     let mut buffer = Vec::new();
     File::open(path)?.read_to_end(&mut buffer)?;
     if !binary && buffer.find_byte(b'\0').is_some() {
diff --git a/src/main.rs b/src/main.rs
index 38f025a..2d999e4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -38,6 +38,16 @@ struct Options {
     /// Paths to check
     path: Vec<std::path::PathBuf>,
 
+    #[structopt(long, raw(overrides_with = r#""check-filenames""#))]
+    /// Skip verifying spelling in file names.
+    no_check_filenames: bool,
+    #[structopt(
+        long,
+        raw(overrides_with = r#""no-check-filenames""#),
+        raw(hidden = "true")
+    )]
+    check_filenames: bool,
+
     #[structopt(long, raw(overrides_with = r#""hex""#))]
     /// Don't try to detect that an identifier looks like hex
     no_hex: bool,
@@ -115,6 +125,15 @@ impl Options {
         self
     }
 
+    /// Collapses the `check_filenames` / `no_check_filenames` flags; `None` if neither is set.
+    pub fn check_filenames(&self) -> Option<bool> {
+        if self.check_filenames && self.no_check_filenames {
+            unreachable!("StructOpt should make this impossible");
+        }
+        let explicit = self.check_filenames || self.no_check_filenames;
+        Some(self.check_filenames).filter(|_| explicit)
+    }
+
     pub fn ignore_hex(&self) -> Option<bool> {
         match (self.no_hex, self.hex) {
             (true, false) => Some(false),
@@ -197,6 +216,7 @@ fn run() -> Result<(), failure::Error> {
     let options = Options::from_args().infer();
 
     let dictionary = typos::Dictionary::new();
+    let check_filenames = options.check_filenames().unwrap_or(true);
     let ignore_hex = options.ignore_hex().unwrap_or(true);
     let binary = options.binary().unwrap_or(false);
 
@@ -222,6 +242,7 @@ fn run() -> Result<(), failure::Error> {
             typos::process_file(
                 entry.path(),
                 &dictionary,
+                check_filenames,
                 ignore_hex,
                 binary,
                 options.format.report(),
diff --git a/src/report.rs b/src/report.rs
index 6247264..23b5c47 100644
--- a/src/report.rs
+++ b/src/report.rs
@@ -7,6 +7,7 @@ use std::io::{self, Write};
 pub enum Message<'m> {
     BinaryFile(BinaryFile<'m>),
     Correction(Correction<'m>),
+    FilenameCorrection(FilenameCorrection<'m>),
 }
 
 impl<'m> From<BinaryFile<'m>> for Message<'m> {
@@ -21,6 +22,12 @@ impl<'m> From<Correction<'m>> for Message<'m> {
     }
 }
 
+impl<'m> From<FilenameCorrection<'m>> for Message<'m> {
+    fn from(msg: FilenameCorrection<'m>) -> Self {
+        Message::FilenameCorrection(msg) // wrap in the matching variant so `msg.into()` works
+    }
+}
+
 #[derive(Clone, Debug, Serialize)]
 pub struct BinaryFile<'m> {
     pub path: &'m std::path::Path,
@@ -41,6 +48,15 @@ pub struct Correction<'m> {
     pub(crate) non_exhaustive: (),
 }
 
+#[derive(Clone, Debug, Serialize)]
+pub struct FilenameCorrection<'m> {
+    pub path: &'m std::path::Path, // full path that was checked
+    pub typo: &'m str,             // the misspelled token as it appears in the path
+    pub correction: Cow<'m, str>,  // replacement suggested by the dictionary
+    #[serde(skip)]
+    pub(crate) non_exhaustive: (), // crate-private, so other crates cannot build this literally
+}
+
 pub type Report = fn(msg: Message);
 
 pub fn print_silent(_: Message) {}
@@ -60,6 +76,9 @@ pub fn print_brief(msg: Message) {
                 msg.correction
             );
         }
+        Message::FilenameCorrection(msg) => {
+            println!("{}: {} -> {}", msg.path.display(), msg.typo, msg.correction);
+        }
     }
 }
 
@@ -69,6 +88,14 @@ pub fn print_long(msg: Message) {
             println!("Skipping binary file {}", msg.path.display(),);
         }
         Message::Correction(msg) => print_long_correction(msg),
+        Message::FilenameCorrection(msg) => {
+            println!(
+                "{}: error: `{}` should be `{}`",
+                msg.path.display(),
+                msg.typo,
+                msg.correction
+            );
+        }
     }
 }
 
diff --git a/src/tokens.rs b/src/tokens.rs
index 1543385..2d8c09a 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -38,6 +38,17 @@ impl<'t> Identifier<'t> {
         Self { token, offset }
     }
 
+    /// Yields every identifier found in `content`, each tagged with its byte offset.
+    pub fn parse(content: &str) -> impl Iterator<Item = Identifier<'_>> {
+        lazy_static::lazy_static! {
+            // Getting false positives for this lint
+            #[allow(clippy::invalid_regex)]
+            static ref SPLIT: regex::Regex = regex::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap();
+        }
+        let matches = SPLIT.find_iter(content);
+        matches.map(|hit| Identifier::new_unchecked(hit.as_str(), hit.start()))
+    }
+
     pub fn parse_bytes(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
         lazy_static::lazy_static! {
             // Getting false positives for this lint
@@ -240,58 +251,70 @@ mod test {
 
     #[test]
     fn tokenize_empty_is_empty() {
-        let input = b"";
+        let input = "";
         let expected: Vec<Identifier> = vec![];
-        let actual: Vec<_> = Identifier::parse_bytes(input).collect();
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual); // byte-slice parser
+        let actual: Vec<_> = Identifier::parse(input).collect(); // `&str` parser must agree
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_word_is_word() {
-        let input = b"word";
+        let input = "word";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
-        let actual: Vec<_> = Identifier::parse_bytes(input).collect();
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual); // byte-slice parser
+        let actual: Vec<_> = Identifier::parse(input).collect(); // `&str` parser must agree
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_space_separated_words() {
-        let input = b"A B";
+        let input = "A B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 2),
         ];
-        let actual: Vec<_> = Identifier::parse_bytes(input).collect();
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual); // byte-slice parser
+        let actual: Vec<_> = Identifier::parse(input).collect(); // `&str` parser must agree
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_dot_separated_words() {
-        let input = b"A.B";
+        let input = "A.B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 2),
         ];
-        let actual: Vec<_> = Identifier::parse_bytes(input).collect();
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual); // byte-slice parser
+        let actual: Vec<_> = Identifier::parse(input).collect(); // `&str` parser must agree
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_namespace_separated_words() {
-        let input = b"A::B";
+        let input = "A::B";
         let expected: Vec<Identifier> = vec![
             Identifier::new_unchecked("A", 0),
             Identifier::new_unchecked("B", 3),
         ];
-        let actual: Vec<_> = Identifier::parse_bytes(input).collect();
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual); // byte-slice parser
+        let actual: Vec<_> = Identifier::parse(input).collect(); // `&str` parser must agree
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_underscore_doesnt_separate() {
-        let input = b"A_B";
+        let input = "A_B";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
-        let actual: Vec<_> = Identifier::parse_bytes(input).collect();
+        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = Identifier::parse(input).collect();
         assert_eq!(expected, actual);
     }