From 006204e66a7cd17954010003e03a87e6e8ea933a Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 13 Jul 2019 19:24:27 -0600 Subject: [PATCH 1/3] feat(parser): Ignore hex literals Trying to avoid accidentally correcting something that looks like a word inside a hex number, like `0xBEAF`. Fixes #19 --- benches/file.rs | 6 ++++++ docs/about.md | 3 +-- src/lib.rs | 12 ++++++++++++ src/main.rs | 23 ++++++++++++++++++++++- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/benches/file.rs b/benches/file.rs index 172c820..1520f1b 100644 --- a/benches/file.rs +++ b/benches/file.rs @@ -17,6 +17,7 @@ fn process_empty(b: &mut test::Bencher) { typos::process_file( sample_path.path(), &corrections, + true, typos::report::print_silent, ) }); @@ -35,6 +36,7 @@ fn process_no_tokens(b: &mut test::Bencher) { typos::process_file( sample_path.path(), &corrections, + true, typos::report::print_silent, ) }); @@ -53,6 +55,7 @@ fn process_single_token(b: &mut test::Bencher) { typos::process_file( sample_path.path(), &corrections, + true, typos::report::print_silent, ) }); @@ -71,6 +74,7 @@ fn process_sherlock(b: &mut test::Bencher) { typos::process_file( sample_path.path(), &corrections, + true, typos::report::print_silent, ) }); @@ -89,6 +93,7 @@ fn process_code(b: &mut test::Bencher) { typos::process_file( sample_path.path(), &corrections, + true, typos::report::print_silent, ) }); @@ -107,6 +112,7 @@ fn process_corpus(b: &mut test::Bencher) { typos::process_file( sample_path.path(), &corrections, + true, typos::report::print_silent, ) }); diff --git a/docs/about.md b/docs/about.md index 443e9c3..d06ae8a 100644 --- a/docs/about.md +++ b/docs/about.md @@ -40,7 +40,7 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh | Per-Lang Dict | No ([#14][def-14]) | No | ? | No | Yes | | CamelCase | Yes | No | ? | No | Yes | | snake_case | Yes | No | ? | No | Yes | -| Ignore Hex | No ([#19][def-19]) | No | ? | No | Yes | +| Ignore Hex | Yes | No | ? 
| No | Yes | | C-Escapes | No ([#20][def-3]) | No | ? | No | Yes | | Encodings | UTF-8 ([#17][def-17]) | UTF-8 | ? | Auto | Auto | | Whole-project | Yes | Yes | Yes | Yes | No | @@ -59,6 +59,5 @@ Whitelist: A confidence rating is given for how close a word is to one in the wh [def-14]: https://github.com/epage/typos/issues/14 [def-17]: https://github.com/epage/typos/issues/17 [def-18]: https://github.com/epage/typos/issues/18 -[def-19]: https://github.com/epage/typos/issues/19 [def-24]: https://github.com/epage/typos/issues/24 [def-3]: https://github.com/epage/typos/issues/3 diff --git a/src/lib.rs b/src/lib.rs index 2c9f1f9..781d4da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,7 @@ use std::io::Read; pub fn process_file( path: &std::path::Path, dictionary: &Dictionary, + ignore_hex: bool, report: report::Report, ) -> Result<(), failure::Error> { let mut buffer = Vec::new(); @@ -22,6 +23,9 @@ pub fn process_file( for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() { let line_num = line_idx + 1; for ident in tokens::Identifier::parse(line) { + if ignore_hex && is_hex(ident.token()) { + continue; + } if let Some(correction) = dictionary.correct_ident(ident) { let col_num = ident.offset(); let msg = report::Message { @@ -55,3 +59,11 @@ pub fn process_file( Ok(()) } + +fn is_hex(ident: &str) -> bool { + lazy_static::lazy_static! 
{ + // `_`: number literal separator in Rust and other languages + static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_]+$"#).unwrap(); + } + HEX.is_match(ident) +} diff --git a/src/main.rs b/src/main.rs index ff8bce9..e5d3ef8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,6 +38,12 @@ struct Options { /// Paths to check path: Vec, + #[structopt(long, raw(overrides_with = r#""hex""#))] + /// Don't try to detect that an identifier looks like hex + no_hex: bool, + #[structopt(long, raw(overrides_with = r#""no-hex""#), raw(hidden = "true"))] + hex: bool, + #[structopt( long = "format", raw(possible_values = "&Format::variants()", case_insensitive = "true"), @@ -103,6 +109,15 @@ impl Options { self } + pub fn ignore_hex(&self) -> Option<bool> { + match (self.no_hex, self.hex) { + (true, false) => Some(false), + (false, true) => Some(true), + (false, false) => None, + (_, _) => unreachable!("StructOpt should make this impossible"), + } + } + pub fn ignore_hidden(&self) -> Option<bool> { match (self.hidden, self.no_hidden) { (true, false) => Some(false), @@ -167,6 +182,7 @@ fn run() -> Result<(), failure::Error> { let options = Options::from_args().infer(); let dictionary = typos::Dictionary::new(); + let ignore_hex = options.ignore_hex().unwrap_or(true); let first_path = &options .path @@ -187,7 +203,12 @@ fn run() -> Result<(), failure::Error> { for entry in walk.build() { let entry = entry?; if entry.file_type().map(|t| t.is_file()).unwrap_or(true) { - typos::process_file(entry.path(), &dictionary, options.format.report())?; + typos::process_file( + entry.path(), + &dictionary, + ignore_hex, + options.format.report(), + )?; } } From b6ab9684787a48ce1ef46f429187247368f06d77 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 13 Jul 2019 19:26:37 -0600 Subject: [PATCH 2/3] feat(parser): Treat contractions as a word This should be safe. Rarely is `'` used as syntax in a language that separates literals. 
- `'` is used within hex literals in C++ but we want to treat them as one word - `'` is used for lifetimes in Rust but there are other symbols on the left side. --- src/tokens.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokens.rs b/src/tokens.rs index 7841f26..9d3727a 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -42,7 +42,7 @@ impl<'t> Identifier<'t> { lazy_static::lazy_static! { // Getting false positives for this lint #[allow(clippy::invalid_regex)] - static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap(); + static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_|')+\b"#).unwrap(); } SPLIT.find_iter(content).filter_map(|m| { let s = std::str::from_utf8(m.as_bytes()).ok(); From 92a2560c9a203525b4e4c5e15cd8732bd2d64c4c Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 13 Jul 2019 19:28:33 -0600 Subject: [PATCH 3/3] feat(parser): Support C++ hex literal separators --- src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 781d4da..5b425bc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,8 @@ pub fn process_file( fn is_hex(ident: &str) -> bool { lazy_static::lazy_static! { // `_`: number literal separator in Rust and other languages - static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_]+$"#).unwrap(); + // `'`: number literal separator in C++ + static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap(); } HEX.is_match(ident) }