diff --git a/benches/corrections.rs b/benches/corrections.rs
index 4c81f6b..bade620 100644
--- a/benches/corrections.rs
+++ b/benches/corrections.rs
@@ -4,19 +4,19 @@ extern crate test;
 
 #[bench]
 fn load_corrections(b: &mut test::Bencher) {
-    b.iter(|| scorrect::Corrections::new());
+    b.iter(|| scorrect::Dictionary::new());
 }
 
 #[bench]
 fn correction(b: &mut test::Bencher) {
-    let corrections = scorrect::Corrections::new();
+    let corrections = scorrect::Dictionary::new();
     assert_eq!(corrections.correct_str("successs"), Some("successes"));
     b.iter(|| corrections.correct_str("successs"));
 }
 
 #[bench]
 fn no_correction(b: &mut test::Bencher) {
-    let corrections = scorrect::Corrections::new();
+    let corrections = scorrect::Dictionary::new();
     assert_eq!(corrections.correct_str("success"), None);
     b.iter(|| corrections.correct_str("success"));
 }
diff --git a/benches/file.rs b/benches/file.rs
index 3547ff8..3056489 100644
--- a/benches/file.rs
+++ b/benches/file.rs
@@ -12,8 +12,8 @@ fn process_empty(b: &mut test::Bencher) {
     let sample_path = temp.child("sample");
     sample_path.write_str(data::EMPTY).unwrap();
 
-    let corrections = scorrect::Corrections::new();
-    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent));
+    let corrections = scorrect::Dictionary::new();
+    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
 
     temp.close().unwrap();
 }
@@ -24,8 +24,8 @@ fn process_no_tokens(b: &mut test::Bencher) {
     let sample_path = temp.child("sample");
     sample_path.write_str(data::NO_TOKENS).unwrap();
 
-    let corrections = scorrect::Corrections::new();
-    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent));
+    let corrections = scorrect::Dictionary::new();
+    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
 
     temp.close().unwrap();
 }
@@ -36,8 +36,8 @@ fn process_single_token(b: &mut test::Bencher) {
     let sample_path = temp.child("sample");
     sample_path.write_str(data::SINGLE_TOKEN).unwrap();
 
-    let corrections = scorrect::Corrections::new();
-    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent));
+    let corrections = scorrect::Dictionary::new();
+    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
 
     temp.close().unwrap();
 }
@@ -48,8 +48,8 @@ fn process_sherlock(b: &mut test::Bencher) {
     let sample_path = temp.child("sample");
     sample_path.write_str(data::SHERLOCK).unwrap();
 
-    let corrections = scorrect::Corrections::new();
-    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent));
+    let corrections = scorrect::Dictionary::new();
+    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
 
     temp.close().unwrap();
 }
@@ -60,8 +60,8 @@ fn process_code(b: &mut test::Bencher) {
     let sample_path = temp.child("sample");
     sample_path.write_str(data::CODE).unwrap();
 
-    let corrections = scorrect::Corrections::new();
-    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent));
+    let corrections = scorrect::Dictionary::new();
+    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
 
     temp.close().unwrap();
 }
@@ -72,8 +72,8 @@ fn process_corpus(b: &mut test::Bencher) {
     let sample_path = temp.child("sample");
     sample_path.write_str(data::CORPUS).unwrap();
 
-    let corrections = scorrect::Corrections::new();
-    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent));
+    let corrections = scorrect::Dictionary::new();
+    b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
 
     temp.close().unwrap();
 }
diff --git a/benches/tokenize.rs b/benches/tokenize.rs
index 2914a0a..aeca216 100644
--- a/benches/tokenize.rs
+++ b/benches/tokenize.rs
@@ -6,30 +6,30 @@ mod data;
 
 #[bench]
 fn tokenize_empty(b: &mut test::Bencher) {
-    b.iter(|| scorrect::tokenize(data::EMPTY.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| scorrect::identifier::tokenize(data::EMPTY.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_no_tokens(b: &mut test::Bencher) {
-    b.iter(|| scorrect::tokenize(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| scorrect::identifier::tokenize(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_single_token(b: &mut test::Bencher) {
-    b.iter(|| scorrect::tokenize(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| scorrect::identifier::tokenize(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_sherlock(b: &mut test::Bencher) {
-    b.iter(|| scorrect::tokenize(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| scorrect::identifier::tokenize(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_code(b: &mut test::Bencher) {
-    b.iter(|| scorrect::tokenize(data::CODE.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| scorrect::identifier::tokenize(data::CODE.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_corpus(b: &mut test::Bencher) {
-    b.iter(|| scorrect::tokenize(data::CORPUS.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| scorrect::identifier::tokenize(data::CORPUS.as_bytes()).collect::<Vec<_>>());
 }
diff --git a/src/dict.rs b/src/dict.rs
new file mode 100644
index 0000000..0248925
--- /dev/null
+++ b/src/dict.rs
@@ -0,0 +1,18 @@
+include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
+
+pub struct Dictionary {
+}
+
+impl Dictionary {
+    pub fn new() -> Self {
+        Dictionary { }
+    }
+
+    pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> {
+        DICTIONARY.get(word).map(|s| *s)
+    }
+
+    pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s [u8]> {
+        std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| s.as_bytes())
+    }
+}
diff --git a/src/identifier.rs b/src/identifier.rs
new file mode 100644
index 0000000..2bd4574
--- /dev/null
+++ b/src/identifier.rs
@@ -0,0 +1,74 @@
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token<'t> {
+    pub token: &'t [u8],
+    pub offset: usize,
+}
+
+impl<'t> Token<'t> {
+    pub fn new(token: &'t [u8], offset: usize) -> Self {
+        Self {
+            token,
+            offset,
+        }
+    }
+}
+
+pub fn tokenize(content: &[u8]) -> impl Iterator<Item = Token<'_>> {
+    lazy_static::lazy_static! {
+        static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
+    }
+    SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn tokenize_empty_is_empty() {
+        let input = b"";
+        let expected: Vec<Token> = vec![];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_word_is_word() {
+        let input = b"word";
+        let expected: Vec<Token> = vec![Token::new(b"word", 0)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_space_separated_words() {
+        let input = b"A B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_dot_separated_words() {
+        let input = b"A.B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_namespace_separated_words() {
+        let input = b"A::B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_underscore_doesnt_separate() {
+        let input = b"A_B";
+        let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 19408e8..457c3ef 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,60 +1,33 @@
 #[macro_use]
 extern crate serde_derive;
 
+mod dict;
+
+pub mod identifier;
+pub mod report;
+
+pub use crate::dict::*;
+
 use std::fs::File;
 use std::io::Read;
 
-include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Token<'t> {
-    pub token: &'t [u8],
-    pub offset: usize,
-}
-
-impl<'t> Token<'t> {
-    pub fn new(token: &'t [u8], offset: usize) -> Self {
-        Self {
-            token,
-            offset,
-        }
-    }
-}
-
-pub fn tokenize(content: &[u8]) -> impl Iterator<Item = Token<'_>> {
-    lazy_static::lazy_static! {
-        static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
-    }
-    SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
-}
-
-#[derive(Debug, Serialize)]
-pub struct Message<'m> {
-    path: &'m std::path::Path,
-    #[serde(skip)]
-    line: &'m [u8],
-    line_num: usize,
-    col_num: usize,
-    word: &'m str,
-    correction: &'m str,
-}
-
-pub fn process_file(path: &std::path::Path, dictionary: &Corrections, report: Report) -> Result<(), failure::Error> {
+pub fn process_file(path: &std::path::Path, dictionary: &Dictionary, report: report::Report) -> Result<(), failure::Error> {
     let mut buffer = Vec::new();
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokenize(line) {
+        for token in identifier::tokenize(line) {
             if let Some(word) = std::str::from_utf8(token.token).ok() {
                 if let Some(correction) = dictionary.correct_str(word) {
                     let col_num = token.offset;
-                    let msg = Message {
+                    let msg = report::Message {
                         path,
                         line,
                         line_num,
                         col_num,
                         word,
                         correction,
+                        non_exhaustive: (),
                     };
                     report(msg);
                 }
@@ -65,100 +38,3 @@ pub fn process_file(path: &std::path::Path, dictionary: &Corrections, report: Re
     Ok(())
 }
 
-pub type Report = fn(msg: Message);
-
-pub fn print_silent(_: Message) {
-}
-
-pub fn print_brief(msg: Message) {
-    println!("{}:{}:{}: {} -> {}", msg.path.display(), msg.line_num, msg.col_num, msg.word, msg.correction);
-}
-
-pub fn print_long(msg: Message) {
-    let line_num = msg.line_num.to_string();
-    let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();
-
-    let hl_indent: String = itertools::repeat_n(" ", msg.col_num).collect();
-    let hl: String = itertools::repeat_n("^", msg.word.len()).collect();
-
-    println!("error: `{}` should be `{}`", msg.word, msg.correction);
-    println!(" --> {}:{}:{}", msg.path.display(), msg.line_num, msg.col_num);
-    println!("{} |", line_indent);
-    println!("{} | {}", msg.line_num, String::from_utf8_lossy(msg.line).trim_end());
-    println!("{} | {}{}", line_indent, hl_indent, hl);
-    println!("{} |", line_indent);
-}
-
-pub fn print_json(msg: Message) {
-    println!("{}", serde_json::to_string(&msg).unwrap());
-}
-
-pub struct Corrections {
-}
-
-impl Corrections {
-    pub fn new() -> Self {
-        Corrections { }
-    }
-
-    pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> {
-        DICTIONARY.get(word).map(|s| *s)
-    }
-
-    pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s [u8]> {
-        std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| s.as_bytes())
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn tokenize_empty_is_empty() {
-        let input = b"";
-        let expected: Vec<Token> = vec![];
-        let actual: Vec<_> = tokenize(input).collect();
-        assert_eq!(expected, actual);
-    }
-
-    #[test]
-    fn tokenize_word_is_word() {
-        let input = b"word";
-        let expected: Vec<Token> = vec![Token::new(b"word", 0)];
-        let actual: Vec<_> = tokenize(input).collect();
-        assert_eq!(expected, actual);
-    }
-
-    #[test]
-    fn tokenize_space_separated_words() {
-        let input = b"A B";
-        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
-        let actual: Vec<_> = tokenize(input).collect();
-        assert_eq!(expected, actual);
-    }
-
-    #[test]
-    fn tokenize_dot_separated_words() {
-        let input = b"A.B";
-        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
-        let actual: Vec<_> = tokenize(input).collect();
-        assert_eq!(expected, actual);
-    }
-
-    #[test]
-    fn tokenize_namespace_separated_words() {
-        let input = b"A::B";
-        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
-        let actual: Vec<_> = tokenize(input).collect();
-        assert_eq!(expected, actual);
-    }
-
-    #[test]
-    fn tokenize_underscore_doesnt_separate() {
-        let input = b"A_B";
-        let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
-        let actual: Vec<_> = tokenize(input).collect();
-        assert_eq!(expected, actual);
-    }
-}
diff --git a/src/main.rs b/src/main.rs
index ff28004..edd4485 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,12 +15,12 @@ arg_enum!{
 }
 
 impl Format {
-    fn report(self) -> scorrect::Report {
+    fn report(self) -> scorrect::report::Report {
         match self {
-            Format::Silent => scorrect::print_silent,
-            Format::Brief => scorrect::print_brief,
-            Format::Long => scorrect::print_long,
-            Format::Json => scorrect::print_json,
+            Format::Silent => scorrect::report::print_silent,
+            Format::Brief => scorrect::report::print_brief,
+            Format::Long => scorrect::report::print_long,
+            Format::Json => scorrect::report::print_json,
         }
     }
 }
@@ -63,7 +63,7 @@ impl Options {
 fn run() -> Result<(), failure::Error> {
     let options = Options::from_args().infer();
 
-    let dictionary = scorrect::Corrections::new();
+    let dictionary = scorrect::Dictionary::new();
 
     let first_path = &options.path.get(0).expect("arg parsing enforces at least one");
     let mut walk = ignore::WalkBuilder::new(first_path);
diff --git a/src/report.rs b/src/report.rs
new file mode 100644
index 0000000..3281be6
--- /dev/null
+++ b/src/report.rs
@@ -0,0 +1,40 @@
+#[derive(Debug, Serialize)]
+pub struct Message<'m> {
+    pub path: &'m std::path::Path,
+    #[serde(skip)]
+    pub line: &'m [u8],
+    pub line_num: usize,
+    pub col_num: usize,
+    pub word: &'m str,
+    pub correction: &'m str,
+    #[serde(skip)]
+    pub(crate) non_exhaustive: (),
+}
+
+pub type Report = fn(msg: Message);
+
+pub fn print_silent(_: Message) {
+}
+
+pub fn print_brief(msg: Message) {
+    println!("{}:{}:{}: {} -> {}", msg.path.display(), msg.line_num, msg.col_num, msg.word, msg.correction);
+}
+
+pub fn print_long(msg: Message) {
+    let line_num = msg.line_num.to_string();
+    let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();
+
+    let hl_indent: String = itertools::repeat_n(" ", msg.col_num).collect();
+    let hl: String = itertools::repeat_n("^", msg.word.len()).collect();
+
+    println!("error: `{}` should be `{}`", msg.word, msg.correction);
+    println!(" --> {}:{}:{}", msg.path.display(), msg.line_num, msg.col_num);
+    println!("{} |", line_indent);
+    println!("{} | {}", msg.line_num, String::from_utf8_lossy(msg.line).trim_end());
+    println!("{} | {}{}", line_indent, hl_indent, hl);
+    println!("{} |", line_indent);
+}
+
+pub fn print_json(msg: Message) {
+    println!("{}", serde_json::to_string(&msg).unwrap());
+}