typos/src/lib.rs

#[macro_use]
extern crate serde_derive;

mod dict;
mod dict_codegen;

pub mod report;
pub mod tokens;

pub use crate::dict::*;

use std::fs::File;
use std::io::Read;

use bstr::ByteSlice;

/// Checks a single file, and optionally the components of its path, for typos.
///
/// Every finding is converted into a report message and passed to `report`.
///
/// * `path` - file to read; also the source of the names checked when
///   `check_filenames` is set.
/// * `dictionary` - supplies corrections for whole identifiers
///   (`correct_ident`) and for the words they split into (`correct_word`).
/// * `check_filenames` - when `true`, each path component that is valid UTF-8
///   is tokenized and checked before the file contents are read.
/// * `ignore_hex` - when `true`, identifiers that look like hex literals
///   (see `is_hex`) are skipped entirely, so that something like `0xBEAF` is
///   not "corrected".
/// * `binary` - when `false`, a file containing a NUL byte is reported as a
///   binary file and its contents are not checked; when `true`, the contents
///   are checked regardless.
/// * `report` - callback receiving each message.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
pub fn process_file(
    path: &std::path::Path,
    dictionary: &Dictionary,
    check_filenames: bool,
    ignore_hex: bool,
    binary: bool,
    report: report::Report,
) -> Result<(), failure::Error> {
    if check_filenames {
        // Components that are not valid UTF-8 are silently skipped.
        for part in path.components().filter_map(|c| c.as_os_str().to_str()) {
            for ident in tokens::Identifier::parse(part) {
                // Skip hex literals only when the caller asked for them to be ignored.
                if ignore_hex && is_hex(ident.token()) {
                    continue;
                }
                // First try the identifier as a whole...
                if let Some(correction) = dictionary.correct_ident(ident) {
                    let msg = report::FilenameCorrection {
                        path,
                        typo: ident.token(),
                        correction,
                        non_exhaustive: (),
                    };
                    report(msg.into());
                }
                // ...then each of the words it is composed of.
                for word in ident.split() {
                    if let Some(correction) = dictionary.correct_word(word) {
                        let msg = report::FilenameCorrection {
                            path,
                            typo: word.token(),
                            correction,
                            non_exhaustive: (),
                        };
                        report(msg.into());
                    }
                }
            }
        }
    }

    let mut buffer = Vec::new();
    File::open(path)?.read_to_end(&mut buffer)?;
    // A NUL byte is used as the marker for binary content. The filename check
    // above has already run by this point, so it applies to binary files too.
    if !binary && buffer.find_byte(b'\0').is_some() {
        let msg = report::BinaryFile {
            path,
            non_exhaustive: (),
        };
        report(msg.into());
        return Ok(());
    }

    for (line_idx, line) in buffer.lines().enumerate() {
        // Reported line numbers are 1-based.
        let line_num = line_idx + 1;
        for ident in tokens::Identifier::parse_bytes(line) {
            // Skip hex literals only when the caller asked for them to be ignored.
            if ignore_hex && is_hex(ident.token()) {
                continue;
            }
            // First try the identifier as a whole...
            if let Some(correction) = dictionary.correct_ident(ident) {
                let col_num = ident.offset();
                let msg = report::Correction {
                    path,
                    line,
                    line_num,
                    col_num,
                    typo: ident.token(),
                    correction,
                    non_exhaustive: (),
                };
                report(msg.into());
            }
            // ...then each of the words it is composed of.
            for word in ident.split() {
                if let Some(correction) = dictionary.correct_word(word) {
                    let col_num = word.offset();
                    let msg = report::Correction {
                        path,
                        line,
                        line_num,
                        col_num,
                        typo: word.token(),
                        correction,
                        non_exhaustive: (),
                    };
                    report(msg.into());
                }
            }
        }
    }

    Ok(())
}

/// Returns `true` when `ident` looks like a hexadecimal literal.
///
/// A match requires the whole string to be `0x` or `0X` followed by at least
/// one character, each of which is an ASCII hex digit or a digit separator:
///
/// - `_`: number literal separator in Rust and other languages
/// - `'`: number literal separator in C++
///
/// This is a plain byte scan accepting the same strings as the pattern
/// `^0[xX][0-9a-fA-F_']+$`, so no regex has to be compiled or run for every
/// identifier.
fn is_hex(ident: &str) -> bool {
    let bytes = ident.as_bytes();
    let has_prefix = bytes.starts_with(b"0x") || bytes.starts_with(b"0X");
    // `get` keeps this panic-free for inputs shorter than the prefix.
    let digits = bytes.get(2..).unwrap_or(&[]);

    has_prefix
        && !digits.is_empty()
        && digits
            .iter()
            .all(|&b| b.is_ascii_hexdigit() || b == b'_' || b == b'\'')
}
feat: Control over output format 2019-01-23 09:33:51 -05:00			`#[macro_use]`
			`extern crate serde_derive;`

fix(api): Split lib 2019-01-24 10:24:20 -05:00			`mod dict;`
chore(CI): Fighting clippy 2019-06-14 16:53:34 -04:00			`mod dict_codegen;`
Initial commit 2019-01-22 17:01:33 -05:00
fix(api): Split lib 2019-01-24 10:24:20 -05:00			`pub mod report;`
refactor: Rename module 2019-04-16 22:16:31 -04:00			`pub mod tokens;`
test: Basic tokenization testing 2019-01-23 09:44:01 -05:00
fix(api): Split lib 2019-01-24 10:24:20 -05:00			`pub use crate::dict::*;`
Initial commit 2019-01-22 17:01:33 -05:00
fix(api): Split lib 2019-01-24 10:24:20 -05:00			`use std::fs::File;`
			`use std::io::Read;`
feat: Control over output format 2019-01-23 09:33:51 -05:00
refactor(parser): Switch to bstr for line splitting 2019-07-13 21:52:24 -04:00			`use bstr::ByteSlice;`

chore: Run cargo fmt 2019-06-14 08:43:21 -04:00			`pub fn process_file(`
			`path: &std::path::Path,`
			`dictionary: &Dictionary,`
feat: Check file names Fixes #24 2019-07-18 22:20:45 -04:00			`check_filenames: bool,`
feat(parser): Ignore hex literals Trying to avoid accidentally correcting something that looks like a word inside a hex number, like `0xBEAF`. Fixes #19 2019-07-13 21:24:27 -04:00			`ignore_hex: bool,`
feat: Ignore binary files Fixes #29 2019-07-13 22:14:06 -04:00			`binary: bool,`
chore: Run cargo fmt 2019-06-14 08:43:21 -04:00			`report: report::Report,`
			`) -> Result<(), failure::Error> {`
feat: Check file names Fixes #24 2019-07-18 22:20:45 -04:00			`if check_filenames {`
			`for part in path.components().filter_map(\|c\| c.as_os_str().to_str()) {`
			`for ident in tokens::Identifier::parse(part) {`
			`if !ignore_hex && is_hex(ident.token()) {`
			`continue;`
			`}`
			`if let Some(correction) = dictionary.correct_ident(ident) {`
			`let msg = report::FilenameCorrection {`
			`path,`
			`typo: ident.token(),`
			`correction,`
			`non_exhaustive: (),`
			`};`
			`report(msg.into());`
			`}`
			`for word in ident.split() {`
			`if let Some(correction) = dictionary.correct_word(word) {`
			`let msg = report::FilenameCorrection {`
			`path,`
			`typo: word.token(),`
			`correction,`
			`non_exhaustive: (),`
			`};`
			`report(msg.into());`
			`}`
			`}`
			`}`
			`}`
			`}`

Initial commit 2019-01-22 17:01:33 -05:00			`let mut buffer = Vec::new();`
			`File::open(path)?.read_to_end(&mut buffer)?;`
feat: Ignore binary files Fixes #29 2019-07-13 22:14:06 -04:00			`if !binary && buffer.find_byte(b'\0').is_some() {`
fix: Report binary files to user Fixes #38 2019-07-16 21:16:54 -04:00			`let msg = report::BinaryFile {`
			`path,`
			`non_exhaustive: (),`
			`};`
			`report(msg.into());`
feat: Ignore binary files Fixes #29 2019-07-13 22:14:06 -04:00			`return Ok(());`
			`}`
refactor(parser): Switch to bstr for line splitting 2019-07-13 21:52:24 -04:00
			`for (line_idx, line) in buffer.lines().enumerate() {`
Initial commit 2019-01-22 17:01:33 -05:00			`let line_num = line_idx + 1;`
refactor(parser): Rename bytes-parser 2019-07-16 21:38:54 -04:00			`for ident in tokens::Identifier::parse_bytes(line) {`
feat(parser): Ignore hex literals Trying to avoid accidentally correcting something that looks like a word inside a hex number, like `0xBEAF`. Fixes #19 2019-07-13 21:24:27 -04:00			`if !ignore_hex && is_hex(ident.token()) {`
			`continue;`
			`}`
refactor: Rename Symbol to Identifier This is more descriptive 2019-06-22 13:57:23 -04:00			`if let Some(correction) = dictionary.correct_ident(ident) {`
			`let col_num = ident.offset();`
fix: Report binary files to user Fixes #38 2019-07-16 21:16:54 -04:00			`let msg = report::Correction {`
fix: Improve the quality of symbols being reported 2019-06-14 17:57:41 -04:00			`path,`
			`line,`
			`line_num,`
			`col_num,`
refactor(report): Rename source field 2019-06-23 00:01:27 -04:00			`typo: ident.token(),`
fix: Improve the quality of symbols being reported 2019-06-14 17:57:41 -04:00			`correction,`
			`non_exhaustive: (),`
			`};`
fix: Report binary files to user Fixes #38 2019-07-16 21:16:54 -04:00			`report(msg.into());`
Initial commit 2019-01-22 17:01:33 -05:00			`}`
refactor: Rename Symbol to Identifier This is more descriptive 2019-06-22 13:57:23 -04:00			`for word in ident.split() {`
feat(parse): Process words composing symbols 2019-06-16 00:21:40 -04:00			`if let Some(correction) = dictionary.correct_word(word) {`
			`let col_num = word.offset();`
fix: Report binary files to user Fixes #38 2019-07-16 21:16:54 -04:00			`let msg = report::Correction {`
feat(parse): Process words composing symbols 2019-06-16 00:21:40 -04:00			`path,`
			`line,`
			`line_num,`
			`col_num,`
refactor(report): Rename source field 2019-06-23 00:01:27 -04:00			`typo: word.token(),`
feat(parse): Process words composing symbols 2019-06-16 00:21:40 -04:00			`correction,`
			`non_exhaustive: (),`
			`};`
fix: Report binary files to user Fixes #38 2019-07-16 21:16:54 -04:00			`report(msg.into());`
feat(parse): Process words composing symbols 2019-06-16 00:21:40 -04:00			`}`
			`}`
Initial commit 2019-01-22 17:01:33 -05:00			`}`
			`}`

			`Ok(())`
			`}`
feat(parser): Ignore hex literals Trying to avoid accidentally correcting something that looks like a word inside a hex number, like `0xBEAF`. Fixes #19 2019-07-13 21:24:27 -04:00
			`fn is_hex(ident: &str) -> bool {`
			`lazy_static::lazy_static! {`
			// `_`: number literal separator in Rust and other languages
feat(parser): Support C++ hex literal separators 2019-07-13 21:28:33 -04:00			// `'`: number literal separator in C++
			`static ref HEX: regex::Regex = regex::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap();`
feat(parser): Ignore hex literals Trying to avoid accidentally correcting something that looks like a word inside a hex number, like `0xBEAF`. Fixes #19 2019-07-13 21:24:27 -04:00			`}`
			`HEX.is_match(ident)`
			`}`