Merge pull request #40 from epage/name

feat: Check file names
2024-11-25 10:31:02 -05:00 · 2019-07-19 21:12:17 -06:00 · 2019-07-19 21:12:17 -06:00 · 2c7dc5505c
commit 2c7dc5505c
parent 807a4a8a2f 95c0aea484
7 changed files with 239 additions and 54 deletions
--- a/benches/file.rs
+++ b/benches/file.rs
@ -18,6 +18,8 @@ fn process_empty(b: &mut test::Bencher) {
            sample_path.path(),
            &corrections,
            true,
            true,
            true,
            false,
            typos::report::print_silent,
        )
@ -38,6 +40,8 @@ fn process_no_tokens(b: &mut test::Bencher) {
            sample_path.path(),
            &corrections,
            true,
            true,
            true,
            false,
            typos::report::print_silent,
        )
@ -58,6 +62,8 @@ fn process_single_token(b: &mut test::Bencher) {
            sample_path.path(),
            &corrections,
            true,
            true,
            true,
            false,
            typos::report::print_silent,
        )
@ -78,6 +84,8 @@ fn process_sherlock(b: &mut test::Bencher) {
            sample_path.path(),
            &corrections,
            true,
            true,
            true,
            false,
            typos::report::print_silent,
        )
@ -98,6 +106,8 @@ fn process_code(b: &mut test::Bencher) {
            sample_path.path(),
            &corrections,
            true,
            true,
            true,
            false,
            typos::report::print_silent,
        )
@ -118,6 +128,8 @@ fn process_corpus(b: &mut test::Bencher) {
            sample_path.path(),
            &corrections,
            true,
            true,
            true,
            false,
            typos::report::print_silent,
        )
--- a/benches/tokenize.rs
+++ b/benches/tokenize.rs
@ -6,34 +6,34 @@ mod data;
 #[bench]
 fn symbol_parse_empty(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::EMPTY.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::EMPTY.as_bytes()).last());
 }
 #[bench]
 fn symbol_parse_no_tokens(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::NO_TOKENS.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::NO_TOKENS.as_bytes()).last());
 }
 #[bench]
 fn symbol_parse_single_token(b: &mut test::Bencher) {
    b.iter(|| {
-        typos::tokens::Identifier::parse(data::SINGLE_TOKEN.as_bytes()).last();
+        typos::tokens::Identifier::parse_bytes(data::SINGLE_TOKEN.as_bytes()).last();
    });
 }
 #[bench]
 fn symbol_parse_sherlock(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::SHERLOCK.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::SHERLOCK.as_bytes()).last());
 }
 #[bench]
 fn symbol_parse_code(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::CODE.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::CODE.as_bytes()).last());
 }
 #[bench]
 fn symbol_parse_corpus(b: &mut test::Bencher) {
-    b.iter(|| typos::tokens::Identifier::parse(data::CORPUS.as_bytes()).last());
+    b.iter(|| typos::tokens::Identifier::parse_bytes(data::CORPUS.as_bytes()).last());
 }
 #[bench]
--- a/docs/about.md
+++ b/docs/about.md
@ -46,7 +46,7 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
 | Whole-project  | Yes                   | Yes                             | Yes                             | Yes         | No          |
 | Ignores hidden | Yes                   | Yes                             | ?                               | Yes         | No          |
 | Respect gitignore | Yes                | Yes                             | ?                               | No          | No          |
-| Checks filenames | No ([#24][def-24])  | No                              | ?                               | Yes         | No          |
+| Checks filenames | Yes                 | No                              | ?                               | Yes         | No          |
 | API            | Rust / [JSON Lines]   | Rust                            | ?                               | Python      | None        |
 | License        | MIT or Apache         | AGPL                            | MIT                             | GPLv2       | GPLv2       |
@ -59,5 +59,4 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
 [def-14]: https://github.com/epage/typos/issues/14
 [def-17]: https://github.com/epage/typos/issues/17
 [def-18]: https://github.com/epage/typos/issues/18
 [def-24]: https://github.com/epage/typos/issues/24
 [def-3]: https://github.com/epage/typos/issues/3
--- a/src/lib.rs
+++ b/src/lib.rs
@ -17,48 +17,87 @@ use bstr::ByteSlice;
 pub fn process_file(
    path: &std::path::Path,
    dictionary: &Dictionary,
    check_filenames: bool,
    check_files: bool,
    ignore_hex: bool,
    binary: bool,
    report: report::Report,
 ) -> Result<(), failure::Error> {
-    let mut buffer = Vec::new();
+    if check_filenames {
-    File::open(path)?.read_to_end(&mut buffer)?;
+        for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
-    if !binary && buffer.find_byte(b'\0').is_some() {
+            for ident in tokens::Identifier::parse(part) {
-        return Ok(());
+                if !ignore_hex && is_hex(ident.token()) {
                    continue;
                }
                if let Some(correction) = dictionary.correct_ident(ident) {
                    let msg = report::FilenameCorrection {
                        path,
                        typo: ident.token(),
                        correction,
                        non_exhaustive: (),
                    };
                    report(msg.into());
                }
                for word in ident.split() {
                    if let Some(correction) = dictionary.correct_word(word) {
                        let msg = report::FilenameCorrection {
                            path,
                            typo: word.token(),
                            correction,
                            non_exhaustive: (),
                        };
                        report(msg.into());
                    }
                }
            }
        }
    }
-    for (line_idx, line) in buffer.lines().enumerate() {
+    if check_files {
-        let line_num = line_idx + 1;
+        let mut buffer = Vec::new();
-        for ident in tokens::Identifier::parse(line) {
+        File::open(path)?.read_to_end(&mut buffer)?;
-            if !ignore_hex && is_hex(ident.token()) {
+        if !binary && buffer.find_byte(b'\0').is_some() {
-                continue;
+            let msg = report::BinaryFile {
-            }
+                path,
-            if let Some(correction) = dictionary.correct_ident(ident) {
+                non_exhaustive: (),
-                let col_num = ident.offset();
+            };
-                let msg = report::Message {
+            report(msg.into());
-                    path,
+            return Ok(());
-                    line,
+        }
-                    line_num,
+
-                    col_num,
+        for (line_idx, line) in buffer.lines().enumerate() {
-                    typo: ident.token(),
+            let line_num = line_idx + 1;
-                    correction,
+            for ident in tokens::Identifier::parse_bytes(line) {
-                    non_exhaustive: (),
+                if !ignore_hex && is_hex(ident.token()) {
-                };
+                    continue;
-                report(msg);
+                }
-            }
+                if let Some(correction) = dictionary.correct_ident(ident) {
-            for word in ident.split() {
+                    let col_num = ident.offset();
-                if let Some(correction) = dictionary.correct_word(word) {
+                    let msg = report::Correction {
                    let col_num = word.offset();
                    let msg = report::Message {
                        path,
                        line,
                        line_num,
                        col_num,
-                        typo: word.token(),
+                        typo: ident.token(),
                        correction,
                        non_exhaustive: (),
                    };
-                    report(msg);
+                    report(msg.into());
                }
                for word in ident.split() {
                    if let Some(correction) = dictionary.correct_word(word) {
                        let col_num = word.offset();
                        let msg = report::Correction {
                            path,
                            line,
                            line_num,
                            col_num,
                            typo: word.token(),
                            correction,
                            non_exhaustive: (),
                        };
                        report(msg.into());
                    }
                }
            }
        }
--- a/src/main.rs
+++ b/src/main.rs
@ -38,6 +38,26 @@ struct Options {
    /// Paths to check
    path: Vec<std::path::PathBuf>,
    #[structopt(long, raw(overrides_with = r#""check-filenames""#))]
    /// Skip verifying spelling in file names.
    no_check_filenames: bool,
    #[structopt(
        long,
        raw(overrides_with = r#""no-check-filenames""#),
        raw(hidden = "true")
    )]
    check_filenames: bool,
    #[structopt(long, raw(overrides_with = r#""check-files""#))]
    /// Skip verifying spelling in files.
    no_check_files: bool,
    #[structopt(
        long,
        raw(overrides_with = r#""no-check-files""#),
        raw(hidden = "true")
    )]
    check_files: bool,
    #[structopt(long, raw(overrides_with = r#""hex""#))]
    /// Don't try to detect that an identifier looks like hex
    no_hex: bool,
@ -115,6 +135,24 @@ impl Options {
        self
    }
    /// Collapses the `--check-files` / `--no-check-files` flag pair into one value.
    ///
    /// Returns `Some(true)` when only `check_files` is set, `Some(false)` when only
    /// `no_check_files` is set, and `None` when neither flag was given.
    ///
    /// # Panics
    ///
    /// Panics if both flags are set at the same time.
    pub fn check_files(&self) -> Option<bool> {
        let enabled = self.check_files;
        let disabled = self.no_check_files;
        if enabled && disabled {
            unreachable!("StructOpt should make this impossible");
        }
        if enabled {
            Some(true)
        } else if disabled {
            Some(false)
        } else {
            None
        }
    }
    /// Collapses the `--check-filenames` / `--no-check-filenames` flag pair into one value.
    ///
    /// Returns `Some(true)` when only `check_filenames` is set, `Some(false)` when only
    /// `no_check_filenames` is set, and `None` when neither flag was given.
    ///
    /// # Panics
    ///
    /// Panics if both flags are set at the same time.
    pub fn check_filenames(&self) -> Option<bool> {
        let enabled = self.check_filenames;
        let disabled = self.no_check_filenames;
        if enabled && disabled {
            unreachable!("StructOpt should make this impossible");
        }
        if enabled {
            Some(true)
        } else if disabled {
            Some(false)
        } else {
            None
        }
    }
    pub fn ignore_hex(&self) -> Option<bool> {
        match (self.no_hex, self.hex) {
            (true, false) => Some(false),
@ -197,6 +235,8 @@ fn run() -> Result<(), failure::Error> {
    let options = Options::from_args().infer();
    let dictionary = typos::Dictionary::new();
    let check_filenames = options.check_filenames().unwrap_or(true);
    let check_files = options.check_files().unwrap_or(true);
    let ignore_hex = options.ignore_hex().unwrap_or(true);
    let binary = options.binary().unwrap_or(false);
@ -222,6 +262,8 @@ fn run() -> Result<(), failure::Error> {
            typos::process_file(
                entry.path(),
                &dictionary,
                check_filenames,
                check_files,
                ignore_hex,
                binary,
                options.format.report(),
--- a/src/report.rs
+++ b/src/report.rs
@ -2,7 +2,41 @@ use std::borrow::Cow;
 use std::io::{self, Write};
 #[derive(Clone, Debug, Serialize)]
-pub struct Message<'m> {
+#[serde(rename_all = "snake_case")]
 #[serde(tag = "type")]
 pub enum Message<'m> {
    BinaryFile(BinaryFile<'m>),
    Correction(Correction<'m>),
    FilenameCorrection(FilenameCorrection<'m>),
 }
 /// Lets a `BinaryFile` report be turned into a `Message` with `.into()`.
 impl<'m> From<BinaryFile<'m>> for Message<'m> {
    fn from(binary_file: BinaryFile<'m>) -> Self {
        Message::BinaryFile(binary_file)
    }
 }
 /// Lets a `Correction` report be turned into a `Message` with `.into()`.
 impl<'m> From<Correction<'m>> for Message<'m> {
    fn from(correction: Correction<'m>) -> Self {
        Message::Correction(correction)
    }
 }
 /// Lets a `FilenameCorrection` report be turned into a `Message` with `.into()`.
 impl<'m> From<FilenameCorrection<'m>> for Message<'m> {
    fn from(filename_correction: FilenameCorrection<'m>) -> Self {
        Message::FilenameCorrection(filename_correction)
    }
 }
 /// Report payload for a file whose content was skipped because it looks binary.
 #[derive(Clone, Debug, Serialize)]
 pub struct BinaryFile<'m> {
    /// Path of the skipped file.
    pub path: &'m std::path::Path,
    // Crate-private unit field: code outside this crate cannot build the struct
    // with a literal. Excluded from serialized output by `serde(skip)`.
    #[serde(skip)]
    pub(crate) non_exhaustive: (),
 }
 #[derive(Clone, Debug, Serialize)]
 pub struct Correction<'m> {
    pub path: &'m std::path::Path,
    #[serde(skip)]
    pub line: &'m [u8],
@ -14,22 +48,58 @@ pub struct Message<'m> {
    pub(crate) non_exhaustive: (),
 }
 /// Report payload for a suspected misspelling found in a component of a path.
 #[derive(Clone, Debug, Serialize)]
 pub struct FilenameCorrection<'m> {
    /// Path whose components were checked.
    pub path: &'m std::path::Path,
    /// The identifier or word that was flagged.
    pub typo: &'m str,
    /// The replacement suggested by the dictionary.
    pub correction: Cow<'m, str>,
    // Crate-private unit field: code outside this crate cannot build the struct
    // with a literal. Excluded from serialized output by `serde(skip)`.
    #[serde(skip)]
    pub(crate) non_exhaustive: (),
 }
 /// Signature of the callback that receives each `Message` produced while checking.
 pub type Report = fn(msg: Message);
 /// `Report` callback that discards every message without printing anything.
 pub fn print_silent(_: Message) {}
 pub fn print_brief(msg: Message) {
-    println!(
+    match msg {
-        "{}:{}:{}: {} -> {}",
+        Message::BinaryFile(msg) => {
-        msg.path.display(),
+            println!("Skipping binary file {}", msg.path.display(),);
-        msg.line_num,
+        }
-        msg.col_num,
+        Message::Correction(msg) => {
-        msg.typo,
+            println!(
-        msg.correction
+                "{}:{}:{}: {} -> {}",
-    );
+                msg.path.display(),
                msg.line_num,
                msg.col_num,
                msg.typo,
                msg.correction
            );
        }
        Message::FilenameCorrection(msg) => {
            println!("{}: {} -> {}", msg.path.display(), msg.typo, msg.correction);
        }
    }
 }
 /// `Report` callback that prints each message to stdout in the long format.
 ///
 /// Content corrections are handed to `print_long_correction`; binary-file
 /// notices and file-name corrections are each printed as a single line.
 pub fn print_long(msg: Message) {
    match msg {
        Message::Correction(correction) => print_long_correction(correction),
        Message::BinaryFile(binary) => {
            let path = binary.path.display();
            println!("Skipping binary file {}", path);
        }
        Message::FilenameCorrection(filename) => {
            let path = filename.path.display();
            println!(
                "{}: error: `{}` should be `{}`",
                path, filename.typo, filename.correction
            );
        }
    }
 }
 fn print_long_correction(msg: Correction) {
    let line_num = msg.line_num.to_string();
    let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();
--- a/src/tokens.rs
+++ b/src/tokens.rs
@ -14,7 +14,7 @@ pub struct Identifier<'t> {
 impl<'t> Identifier<'t> {
    pub fn new(token: &'t str, offset: usize) -> Result<Self, failure::Error> {
-        let mut itr = Self::parse(token.as_bytes());
+        let mut itr = Self::parse_bytes(token.as_bytes());
        let mut item = itr
            .next()
            .ok_or_else(|| failure::format_err!("Invalid ident (none found): {:?}", token))?;
@ -38,7 +38,18 @@ impl<'t> Identifier<'t> {
        Self { token, offset }
    }
-    pub fn parse(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
+    pub fn parse(content: &str) -> impl Iterator<Item = Identifier<'_>> {
        lazy_static::lazy_static! {
            // Getting false positives for this lint
            #[allow(clippy::invalid_regex)]
            static ref SPLIT: regex::Regex = regex::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap();
        }
        SPLIT
            .find_iter(content)
            .map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
    }
    pub fn parse_bytes(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
        lazy_static::lazy_static! {
            // Getting false positives for this lint
            #[allow(clippy::invalid_regex)]
@ -240,57 +251,69 @@ mod test {
    #[test]
    fn tokenize_empty_is_empty() {
-        let input = b"";
+        let input = "";
        let expected: Vec<Identifier> = vec![];
        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = Identifier::parse(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_word_is_word() {
-        let input = b"word";
+        let input = "word";
        let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = Identifier::parse(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_space_separated_words() {
-        let input = b"A B";
+        let input = "A B";
        let expected: Vec<Identifier> = vec![
            Identifier::new_unchecked("A", 0),
            Identifier::new_unchecked("B", 2),
        ];
        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = Identifier::parse(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_dot_separated_words() {
-        let input = b"A.B";
+        let input = "A.B";
        let expected: Vec<Identifier> = vec![
            Identifier::new_unchecked("A", 0),
            Identifier::new_unchecked("B", 2),
        ];
        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = Identifier::parse(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_namespace_separated_words() {
-        let input = b"A::B";
+        let input = "A::B";
        let expected: Vec<Identifier> = vec![
            Identifier::new_unchecked("A", 0),
            Identifier::new_unchecked("B", 3),
        ];
        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = Identifier::parse(input).collect();
        assert_eq!(expected, actual);
    }
    #[test]
    fn tokenize_underscore_doesnt_separate() {
-        let input = b"A_B";
+        let input = "A_B";
        let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
        let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
        assert_eq!(expected, actual);
        let actual: Vec<_> = Identifier::parse(input).collect();
        assert_eq!(expected, actual);
    }