mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-22 00:51:11 -05:00
refactor(parser): Move hex handling to parser
This commit is contained in:
parent
d0b9979c36
commit
3cf9d8672c
2 changed files with 61 additions and 21 deletions
17
src/lib.rs
17
src/lib.rs
|
@ -23,15 +23,12 @@ pub fn process_file(
|
||||||
binary: bool,
|
binary: bool,
|
||||||
report: report::Report,
|
report: report::Report,
|
||||||
) -> Result<bool, failure::Error> {
|
) -> Result<bool, failure::Error> {
|
||||||
let parser = tokens::Parser::new();
|
let parser = tokens::ParserBuilder::new().ignore_hex(ignore_hex).build();
|
||||||
let mut typos_found = false;
|
let mut typos_found = false;
|
||||||
|
|
||||||
if check_filenames {
|
if check_filenames {
|
||||||
for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
|
for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
|
||||||
for ident in parser.parse(part) {
|
for ident in parser.parse(part) {
|
||||||
if !ignore_hex && is_hex(ident.token()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if let Some(correction) = dictionary.correct_ident(ident) {
|
if let Some(correction) = dictionary.correct_ident(ident) {
|
||||||
let msg = report::FilenameCorrection {
|
let msg = report::FilenameCorrection {
|
||||||
path,
|
path,
|
||||||
|
@ -73,9 +70,6 @@ pub fn process_file(
|
||||||
for (line_idx, line) in buffer.lines().enumerate() {
|
for (line_idx, line) in buffer.lines().enumerate() {
|
||||||
let line_num = line_idx + 1;
|
let line_num = line_idx + 1;
|
||||||
for ident in parser.parse_bytes(line) {
|
for ident in parser.parse_bytes(line) {
|
||||||
if !ignore_hex && is_hex(ident.token()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if let Some(correction) = dictionary.correct_ident(ident) {
|
if let Some(correction) = dictionary.correct_ident(ident) {
|
||||||
let col_num = ident.offset();
|
let col_num = ident.offset();
|
||||||
let msg = report::Correction {
|
let msg = report::Correction {
|
||||||
|
@ -112,12 +106,3 @@ pub fn process_file(
|
||||||
|
|
||||||
Ok(typos_found)
|
Ok(typos_found)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_hex(ident: &str) -> bool {
|
|
||||||
lazy_static::lazy_static! {
|
|
||||||
// `_`: number literal separator in Rust and other languages
|
|
||||||
// `'`: number literal separator in C++
|
|
||||||
static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap();
|
|
||||||
}
|
|
||||||
HEX.is_match(ident)
|
|
||||||
}
|
|
||||||
|
|
|
@ -7,13 +7,20 @@ pub enum Case {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
#[derive(Debug, Clone, Default)]
|
||||||
pub struct ParserBuilder {}
|
pub struct ParserBuilder {
|
||||||
|
ignore_hex: bool,
|
||||||
|
}
|
||||||
|
|
||||||
impl ParserBuilder {
|
impl ParserBuilder {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Default::default()
|
Default::default()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn ignore_hex(mut self, yes: bool) -> Self {
|
||||||
|
self.ignore_hex = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
pub fn build(self) -> Parser {
|
pub fn build(self) -> Parser {
|
||||||
let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#;
|
let pattern = r#"\b(\p{Alphabetic}|\d|_|')+\b"#;
|
||||||
let words_str = regex::Regex::new(pattern).unwrap();
|
let words_str = regex::Regex::new(pattern).unwrap();
|
||||||
|
@ -21,6 +28,7 @@ impl ParserBuilder {
|
||||||
Parser {
|
Parser {
|
||||||
words_str,
|
words_str,
|
||||||
words_bytes,
|
words_bytes,
|
||||||
|
ignore_hex: self.ignore_hex,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,6 +37,7 @@ impl ParserBuilder {
|
||||||
pub struct Parser {
|
pub struct Parser {
|
||||||
words_str: regex::Regex,
|
words_str: regex::Regex,
|
||||||
words_bytes: regex::bytes::Regex,
|
words_bytes: regex::bytes::Regex,
|
||||||
|
ignore_hex: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Parser {
|
impl Parser {
|
||||||
|
@ -37,13 +46,19 @@ impl Parser {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
pub fn parse<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
|
let ignore_hex = self.ignore_hex;
|
||||||
self.words_str
|
self.words_str
|
||||||
.find_iter(content)
|
.find_iter(content)
|
||||||
|
.filter(move |m| !ignore_hex || !is_hex(m.as_str().as_bytes()))
|
||||||
.map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
|
.map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
self.words_bytes.find_iter(content).filter_map(|m| {
|
let ignore_hex = self.ignore_hex;
|
||||||
|
self.words_bytes
|
||||||
|
.find_iter(content)
|
||||||
|
.filter(move |m| !ignore_hex || !is_hex(m.as_bytes()))
|
||||||
|
.filter_map(|m| {
|
||||||
let s = std::str::from_utf8(m.as_bytes()).ok();
|
let s = std::str::from_utf8(m.as_bytes()).ok();
|
||||||
s.map(|s| Identifier::new_unchecked(s, m.start()))
|
s.map(|s| Identifier::new_unchecked(s, m.start()))
|
||||||
})
|
})
|
||||||
|
@ -56,6 +71,15 @@ impl Default for Parser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn is_hex(ident: &[u8]) -> bool {
|
||||||
|
lazy_static::lazy_static! {
|
||||||
|
// `_`: number literal separator in Rust and other languages
|
||||||
|
// `'`: number literal separator in C++
|
||||||
|
static ref HEX: regex::bytes::Regex = regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap();
|
||||||
|
}
|
||||||
|
HEX.is_match(ident)
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
pub struct Identifier<'t> {
|
pub struct Identifier<'t> {
|
||||||
token: &'t str,
|
token: &'t str,
|
||||||
|
@ -335,6 +359,37 @@ mod test {
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenize_ignore_hex_enabled() {
|
||||||
|
let parser = ParserBuilder::new().ignore_hex(true).build();
|
||||||
|
|
||||||
|
let input = "Hello 0xDEADBEEF World";
|
||||||
|
let expected: Vec<Identifier> = vec![
|
||||||
|
Identifier::new_unchecked("Hello", 0),
|
||||||
|
Identifier::new_unchecked("World", 17),
|
||||||
|
];
|
||||||
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
let actual: Vec<_> = parser.parse(input).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenize_ignore_hex_disabled() {
|
||||||
|
let parser = ParserBuilder::new().ignore_hex(false).build();
|
||||||
|
|
||||||
|
let input = "Hello 0xDEADBEEF World";
|
||||||
|
let expected: Vec<Identifier> = vec![
|
||||||
|
Identifier::new_unchecked("Hello", 0),
|
||||||
|
Identifier::new_unchecked("0xDEADBEEF", 6),
|
||||||
|
Identifier::new_unchecked("World", 17),
|
||||||
|
];
|
||||||
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
let actual: Vec<_> = parser.parse(input).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn split_ident() {
|
fn split_ident() {
|
||||||
let cases = [
|
let cases = [
|
||||||
|
|
Loading…
Reference in a new issue