Merge pull request #40 from epage/name

feat: Check file names
This commit is contained in:
Ed Page 2019-07-19 21:12:17 -06:00 committed by GitHub
commit 2c7dc5505c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 239 additions and 54 deletions

View file

@ -18,6 +18,8 @@ fn process_empty(b: &mut test::Bencher) {
sample_path.path(), sample_path.path(),
&corrections, &corrections,
true, true,
true,
true,
false, false,
typos::report::print_silent, typos::report::print_silent,
) )
@ -38,6 +40,8 @@ fn process_no_tokens(b: &mut test::Bencher) {
sample_path.path(), sample_path.path(),
&corrections, &corrections,
true, true,
true,
true,
false, false,
typos::report::print_silent, typos::report::print_silent,
) )
@ -58,6 +62,8 @@ fn process_single_token(b: &mut test::Bencher) {
sample_path.path(), sample_path.path(),
&corrections, &corrections,
true, true,
true,
true,
false, false,
typos::report::print_silent, typos::report::print_silent,
) )
@ -78,6 +84,8 @@ fn process_sherlock(b: &mut test::Bencher) {
sample_path.path(), sample_path.path(),
&corrections, &corrections,
true, true,
true,
true,
false, false,
typos::report::print_silent, typos::report::print_silent,
) )
@ -98,6 +106,8 @@ fn process_code(b: &mut test::Bencher) {
sample_path.path(), sample_path.path(),
&corrections, &corrections,
true, true,
true,
true,
false, false,
typos::report::print_silent, typos::report::print_silent,
) )
@ -118,6 +128,8 @@ fn process_corpus(b: &mut test::Bencher) {
sample_path.path(), sample_path.path(),
&corrections, &corrections,
true, true,
true,
true,
false, false,
typos::report::print_silent, typos::report::print_silent,
) )

View file

@ -6,34 +6,34 @@ mod data;
#[bench] #[bench]
fn symbol_parse_empty(b: &mut test::Bencher) { fn symbol_parse_empty(b: &mut test::Bencher) {
b.iter(|| typos::tokens::Identifier::parse(data::EMPTY.as_bytes()).last()); b.iter(|| typos::tokens::Identifier::parse_bytes(data::EMPTY.as_bytes()).last());
} }
#[bench] #[bench]
fn symbol_parse_no_tokens(b: &mut test::Bencher) { fn symbol_parse_no_tokens(b: &mut test::Bencher) {
b.iter(|| typos::tokens::Identifier::parse(data::NO_TOKENS.as_bytes()).last()); b.iter(|| typos::tokens::Identifier::parse_bytes(data::NO_TOKENS.as_bytes()).last());
} }
#[bench] #[bench]
fn symbol_parse_single_token(b: &mut test::Bencher) { fn symbol_parse_single_token(b: &mut test::Bencher) {
b.iter(|| { b.iter(|| {
typos::tokens::Identifier::parse(data::SINGLE_TOKEN.as_bytes()).last(); typos::tokens::Identifier::parse_bytes(data::SINGLE_TOKEN.as_bytes()).last();
}); });
} }
#[bench] #[bench]
fn symbol_parse_sherlock(b: &mut test::Bencher) { fn symbol_parse_sherlock(b: &mut test::Bencher) {
b.iter(|| typos::tokens::Identifier::parse(data::SHERLOCK.as_bytes()).last()); b.iter(|| typos::tokens::Identifier::parse_bytes(data::SHERLOCK.as_bytes()).last());
} }
#[bench] #[bench]
fn symbol_parse_code(b: &mut test::Bencher) { fn symbol_parse_code(b: &mut test::Bencher) {
b.iter(|| typos::tokens::Identifier::parse(data::CODE.as_bytes()).last()); b.iter(|| typos::tokens::Identifier::parse_bytes(data::CODE.as_bytes()).last());
} }
#[bench] #[bench]
fn symbol_parse_corpus(b: &mut test::Bencher) { fn symbol_parse_corpus(b: &mut test::Bencher) {
b.iter(|| typos::tokens::Identifier::parse(data::CORPUS.as_bytes()).last()); b.iter(|| typos::tokens::Identifier::parse_bytes(data::CORPUS.as_bytes()).last());
} }
#[bench] #[bench]

View file

@ -46,7 +46,7 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
| Whole-project | Yes | Yes | Yes | Yes | No | | Whole-project | Yes | Yes | Yes | Yes | No |
| Ignores hidden | Yes | Yes | ? | Yes | No | | Ignores hidden | Yes | Yes | ? | Yes | No |
| Respect gitignore | Yes | Yes | ? | No | No | | Respect gitignore | Yes | Yes | ? | No | No |
| Checks filenames | No ([#24][def-24]) | No | ? | Yes | No | | Checks filenames | Yes | No | ? | Yes | No |
| API | Rust / [JSON Lines] | Rust | ? | Python | None | | API | Rust / [JSON Lines] | Rust | ? | Python | None |
| License | MIT or Apache | AGPL | MIT | GPLv2 | GPLv2 | | License | MIT or Apache | AGPL | MIT | GPLv2 | GPLv2 |
@ -59,5 +59,4 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh
[def-14]: https://github.com/epage/typos/issues/14 [def-14]: https://github.com/epage/typos/issues/14
[def-17]: https://github.com/epage/typos/issues/17 [def-17]: https://github.com/epage/typos/issues/17
[def-18]: https://github.com/epage/typos/issues/18 [def-18]: https://github.com/epage/typos/issues/18
[def-24]: https://github.com/epage/typos/issues/24
[def-3]: https://github.com/epage/typos/issues/3 [def-3]: https://github.com/epage/typos/issues/3

View file

@ -17,48 +17,87 @@ use bstr::ByteSlice;
pub fn process_file( pub fn process_file(
path: &std::path::Path, path: &std::path::Path,
dictionary: &Dictionary, dictionary: &Dictionary,
check_filenames: bool,
check_files: bool,
ignore_hex: bool, ignore_hex: bool,
binary: bool, binary: bool,
report: report::Report, report: report::Report,
) -> Result<(), failure::Error> { ) -> Result<(), failure::Error> {
let mut buffer = Vec::new(); if check_filenames {
File::open(path)?.read_to_end(&mut buffer)?; for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
if !binary && buffer.find_byte(b'\0').is_some() { for ident in tokens::Identifier::parse(part) {
return Ok(()); if !ignore_hex && is_hex(ident.token()) {
continue;
}
if let Some(correction) = dictionary.correct_ident(ident) {
let msg = report::FilenameCorrection {
path,
typo: ident.token(),
correction,
non_exhaustive: (),
};
report(msg.into());
}
for word in ident.split() {
if let Some(correction) = dictionary.correct_word(word) {
let msg = report::FilenameCorrection {
path,
typo: word.token(),
correction,
non_exhaustive: (),
};
report(msg.into());
}
}
}
}
} }
for (line_idx, line) in buffer.lines().enumerate() { if check_files {
let line_num = line_idx + 1; let mut buffer = Vec::new();
for ident in tokens::Identifier::parse(line) { File::open(path)?.read_to_end(&mut buffer)?;
if !ignore_hex && is_hex(ident.token()) { if !binary && buffer.find_byte(b'\0').is_some() {
continue; let msg = report::BinaryFile {
} path,
if let Some(correction) = dictionary.correct_ident(ident) { non_exhaustive: (),
let col_num = ident.offset(); };
let msg = report::Message { report(msg.into());
path, return Ok(());
line, }
line_num,
col_num, for (line_idx, line) in buffer.lines().enumerate() {
typo: ident.token(), let line_num = line_idx + 1;
correction, for ident in tokens::Identifier::parse_bytes(line) {
non_exhaustive: (), if !ignore_hex && is_hex(ident.token()) {
}; continue;
report(msg); }
} if let Some(correction) = dictionary.correct_ident(ident) {
for word in ident.split() { let col_num = ident.offset();
if let Some(correction) = dictionary.correct_word(word) { let msg = report::Correction {
let col_num = word.offset();
let msg = report::Message {
path, path,
line, line,
line_num, line_num,
col_num, col_num,
typo: word.token(), typo: ident.token(),
correction, correction,
non_exhaustive: (), non_exhaustive: (),
}; };
report(msg); report(msg.into());
}
for word in ident.split() {
if let Some(correction) = dictionary.correct_word(word) {
let col_num = word.offset();
let msg = report::Correction {
path,
line,
line_num,
col_num,
typo: word.token(),
correction,
non_exhaustive: (),
};
report(msg.into());
}
} }
} }
} }

View file

@ -38,6 +38,26 @@ struct Options {
/// Paths to check /// Paths to check
path: Vec<std::path::PathBuf>, path: Vec<std::path::PathBuf>,
#[structopt(long, raw(overrides_with = r#""check-filenames""#))]
/// Skip verifying spelling in file names.
no_check_filenames: bool,
#[structopt(
long,
raw(overrides_with = r#""no-check-filenames""#),
raw(hidden = "true")
)]
check_filenames: bool,
#[structopt(long, raw(overrides_with = r#""check-files""#))]
/// Skip verifying spelling in files.
no_check_files: bool,
#[structopt(
long,
raw(overrides_with = r#""no-check-files""#),
raw(hidden = "true")
)]
check_files: bool,
#[structopt(long, raw(overrides_with = r#""hex""#))] #[structopt(long, raw(overrides_with = r#""hex""#))]
/// Don't try to detect that an identifier looks like hex /// Don't try to detect that an identifier looks like hex
no_hex: bool, no_hex: bool,
@ -115,6 +135,24 @@ impl Options {
self self
} }
pub fn check_files(&self) -> Option<bool> {
match (self.check_files, self.no_check_files) {
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
}
pub fn check_filenames(&self) -> Option<bool> {
match (self.check_filenames, self.no_check_filenames) {
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
}
pub fn ignore_hex(&self) -> Option<bool> { pub fn ignore_hex(&self) -> Option<bool> {
match (self.no_hex, self.hex) { match (self.no_hex, self.hex) {
(true, false) => Some(false), (true, false) => Some(false),
@ -197,6 +235,8 @@ fn run() -> Result<(), failure::Error> {
let options = Options::from_args().infer(); let options = Options::from_args().infer();
let dictionary = typos::Dictionary::new(); let dictionary = typos::Dictionary::new();
let check_filenames = options.check_filenames().unwrap_or(true);
let check_files = options.check_files().unwrap_or(true);
let ignore_hex = options.ignore_hex().unwrap_or(true); let ignore_hex = options.ignore_hex().unwrap_or(true);
let binary = options.binary().unwrap_or(false); let binary = options.binary().unwrap_or(false);
@ -222,6 +262,8 @@ fn run() -> Result<(), failure::Error> {
typos::process_file( typos::process_file(
entry.path(), entry.path(),
&dictionary, &dictionary,
check_filenames,
check_files,
ignore_hex, ignore_hex,
binary, binary,
options.format.report(), options.format.report(),

View file

@ -2,7 +2,41 @@ use std::borrow::Cow;
use std::io::{self, Write}; use std::io::{self, Write};
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
pub struct Message<'m> { #[serde(rename_all = "snake_case")]
#[serde(tag = "type")]
pub enum Message<'m> {
BinaryFile(BinaryFile<'m>),
Correction(Correction<'m>),
FilenameCorrection(FilenameCorrection<'m>),
}
impl<'m> From<BinaryFile<'m>> for Message<'m> {
fn from(msg: BinaryFile<'m>) -> Self {
Message::BinaryFile(msg)
}
}
impl<'m> From<Correction<'m>> for Message<'m> {
fn from(msg: Correction<'m>) -> Self {
Message::Correction(msg)
}
}
impl<'m> From<FilenameCorrection<'m>> for Message<'m> {
fn from(msg: FilenameCorrection<'m>) -> Self {
Message::FilenameCorrection(msg)
}
}
#[derive(Clone, Debug, Serialize)]
pub struct BinaryFile<'m> {
pub path: &'m std::path::Path,
#[serde(skip)]
pub(crate) non_exhaustive: (),
}
#[derive(Clone, Debug, Serialize)]
pub struct Correction<'m> {
pub path: &'m std::path::Path, pub path: &'m std::path::Path,
#[serde(skip)] #[serde(skip)]
pub line: &'m [u8], pub line: &'m [u8],
@ -14,22 +48,58 @@ pub struct Message<'m> {
pub(crate) non_exhaustive: (), pub(crate) non_exhaustive: (),
} }
#[derive(Clone, Debug, Serialize)]
pub struct FilenameCorrection<'m> {
pub path: &'m std::path::Path,
pub typo: &'m str,
pub correction: Cow<'m, str>,
#[serde(skip)]
pub(crate) non_exhaustive: (),
}
pub type Report = fn(msg: Message); pub type Report = fn(msg: Message);
pub fn print_silent(_: Message) {} pub fn print_silent(_: Message) {}
pub fn print_brief(msg: Message) { pub fn print_brief(msg: Message) {
println!( match msg {
"{}:{}:{}: {} -> {}", Message::BinaryFile(msg) => {
msg.path.display(), println!("Skipping binary file {}", msg.path.display(),);
msg.line_num, }
msg.col_num, Message::Correction(msg) => {
msg.typo, println!(
msg.correction "{}:{}:{}: {} -> {}",
); msg.path.display(),
msg.line_num,
msg.col_num,
msg.typo,
msg.correction
);
}
Message::FilenameCorrection(msg) => {
println!("{}: {} -> {}", msg.path.display(), msg.typo, msg.correction);
}
}
} }
pub fn print_long(msg: Message) { pub fn print_long(msg: Message) {
match msg {
Message::BinaryFile(msg) => {
println!("Skipping binary file {}", msg.path.display(),);
}
Message::Correction(msg) => print_long_correction(msg),
Message::FilenameCorrection(msg) => {
println!(
"{}: error: `{}` should be `{}`",
msg.path.display(),
msg.typo,
msg.correction
);
}
}
}
fn print_long_correction(msg: Correction) {
let line_num = msg.line_num.to_string(); let line_num = msg.line_num.to_string();
let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect(); let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();

View file

@ -14,7 +14,7 @@ pub struct Identifier<'t> {
impl<'t> Identifier<'t> { impl<'t> Identifier<'t> {
pub fn new(token: &'t str, offset: usize) -> Result<Self, failure::Error> { pub fn new(token: &'t str, offset: usize) -> Result<Self, failure::Error> {
let mut itr = Self::parse(token.as_bytes()); let mut itr = Self::parse_bytes(token.as_bytes());
let mut item = itr let mut item = itr
.next() .next()
.ok_or_else(|| failure::format_err!("Invalid ident (none found): {:?}", token))?; .ok_or_else(|| failure::format_err!("Invalid ident (none found): {:?}", token))?;
@ -38,7 +38,18 @@ impl<'t> Identifier<'t> {
Self { token, offset } Self { token, offset }
} }
pub fn parse(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> { pub fn parse(content: &str) -> impl Iterator<Item = Identifier<'_>> {
lazy_static::lazy_static! {
// Getting false positives for this lint
#[allow(clippy::invalid_regex)]
static ref SPLIT: regex::Regex = regex::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap();
}
SPLIT
.find_iter(content)
.map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
}
pub fn parse_bytes(content: &[u8]) -> impl Iterator<Item = Identifier<'_>> {
lazy_static::lazy_static! { lazy_static::lazy_static! {
// Getting false positives for this lint // Getting false positives for this lint
#[allow(clippy::invalid_regex)] #[allow(clippy::invalid_regex)]
@ -240,57 +251,69 @@ mod test {
#[test] #[test]
fn tokenize_empty_is_empty() { fn tokenize_empty_is_empty() {
let input = b""; let input = "";
let expected: Vec<Identifier> = vec![]; let expected: Vec<Identifier> = vec![];
let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = Identifier::parse(input).collect(); let actual: Vec<_> = Identifier::parse(input).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test] #[test]
fn tokenize_word_is_word() { fn tokenize_word_is_word() {
let input = b"word"; let input = "word";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)]; let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = Identifier::parse(input).collect(); let actual: Vec<_> = Identifier::parse(input).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test] #[test]
fn tokenize_space_separated_words() { fn tokenize_space_separated_words() {
let input = b"A B"; let input = "A B";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0), Identifier::new_unchecked("A", 0),
Identifier::new_unchecked("B", 2), Identifier::new_unchecked("B", 2),
]; ];
let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = Identifier::parse(input).collect(); let actual: Vec<_> = Identifier::parse(input).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test] #[test]
fn tokenize_dot_separated_words() { fn tokenize_dot_separated_words() {
let input = b"A.B"; let input = "A.B";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0), Identifier::new_unchecked("A", 0),
Identifier::new_unchecked("B", 2), Identifier::new_unchecked("B", 2),
]; ];
let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = Identifier::parse(input).collect(); let actual: Vec<_> = Identifier::parse(input).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test] #[test]
fn tokenize_namespace_separated_words() { fn tokenize_namespace_separated_words() {
let input = b"A::B"; let input = "A::B";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0), Identifier::new_unchecked("A", 0),
Identifier::new_unchecked("B", 3), Identifier::new_unchecked("B", 3),
]; ];
let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = Identifier::parse(input).collect(); let actual: Vec<_> = Identifier::parse(input).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }
#[test] #[test]
fn tokenize_underscore_doesnt_separate() { fn tokenize_underscore_doesnt_separate() {
let input = b"A_B"; let input = "A_B";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)]; let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
let actual: Vec<_> = Identifier::parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = Identifier::parse(input).collect(); let actual: Vec<_> = Identifier::parse(input).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
} }