fix(api): Split lib

This commit is contained in:
Ed Page 2019-01-24 08:24:20 -07:00
parent d8ca9f9d5a
commit 85ee5cfac9
8 changed files with 170 additions and 162 deletions

View file

@ -4,19 +4,19 @@ extern crate test;
#[bench] #[bench]
fn load_corrections(b: &mut test::Bencher) { fn load_corrections(b: &mut test::Bencher) {
b.iter(|| scorrect::Corrections::new()); b.iter(|| scorrect::Dictionary::new());
} }
#[bench] #[bench]
fn correction(b: &mut test::Bencher) { fn correction(b: &mut test::Bencher) {
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
assert_eq!(corrections.correct_str("successs"), Some("successes")); assert_eq!(corrections.correct_str("successs"), Some("successes"));
b.iter(|| corrections.correct_str("successs")); b.iter(|| corrections.correct_str("successs"));
} }
#[bench] #[bench]
fn no_correction(b: &mut test::Bencher) { fn no_correction(b: &mut test::Bencher) {
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
assert_eq!(corrections.correct_str("success"), None); assert_eq!(corrections.correct_str("success"), None);
b.iter(|| corrections.correct_str("success")); b.iter(|| corrections.correct_str("success"));
} }

View file

@ -12,8 +12,8 @@ fn process_empty(b: &mut test::Bencher) {
let sample_path = temp.child("sample"); let sample_path = temp.child("sample");
sample_path.write_str(data::EMPTY).unwrap(); sample_path.write_str(data::EMPTY).unwrap();
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent)); b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
temp.close().unwrap(); temp.close().unwrap();
} }
@ -24,8 +24,8 @@ fn process_no_tokens(b: &mut test::Bencher) {
let sample_path = temp.child("sample"); let sample_path = temp.child("sample");
sample_path.write_str(data::NO_TOKENS).unwrap(); sample_path.write_str(data::NO_TOKENS).unwrap();
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent)); b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
temp.close().unwrap(); temp.close().unwrap();
} }
@ -36,8 +36,8 @@ fn process_single_token(b: &mut test::Bencher) {
let sample_path = temp.child("sample"); let sample_path = temp.child("sample");
sample_path.write_str(data::SINGLE_TOKEN).unwrap(); sample_path.write_str(data::SINGLE_TOKEN).unwrap();
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent)); b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
temp.close().unwrap(); temp.close().unwrap();
} }
@ -48,8 +48,8 @@ fn process_sherlock(b: &mut test::Bencher) {
let sample_path = temp.child("sample"); let sample_path = temp.child("sample");
sample_path.write_str(data::SHERLOCK).unwrap(); sample_path.write_str(data::SHERLOCK).unwrap();
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent)); b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
temp.close().unwrap(); temp.close().unwrap();
} }
@ -60,8 +60,8 @@ fn process_code(b: &mut test::Bencher) {
let sample_path = temp.child("sample"); let sample_path = temp.child("sample");
sample_path.write_str(data::CODE).unwrap(); sample_path.write_str(data::CODE).unwrap();
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent)); b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
temp.close().unwrap(); temp.close().unwrap();
} }
@ -72,8 +72,8 @@ fn process_corpus(b: &mut test::Bencher) {
let sample_path = temp.child("sample"); let sample_path = temp.child("sample");
sample_path.write_str(data::CORPUS).unwrap(); sample_path.write_str(data::CORPUS).unwrap();
let corrections = scorrect::Corrections::new(); let corrections = scorrect::Dictionary::new();
b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::print_silent)); b.iter(|| scorrect::process_file(sample_path.path(), &corrections, scorrect::report::print_silent));
temp.close().unwrap(); temp.close().unwrap();
} }

View file

@ -6,30 +6,30 @@ mod data;
#[bench] #[bench]
fn tokenize_empty(b: &mut test::Bencher) { fn tokenize_empty(b: &mut test::Bencher) {
b.iter(|| scorrect::tokenize(data::EMPTY.as_bytes()).collect::<Vec<_>>()); b.iter(|| scorrect::identifier::tokenize(data::EMPTY.as_bytes()).collect::<Vec<_>>());
} }
#[bench] #[bench]
fn tokenize_no_tokens(b: &mut test::Bencher) { fn tokenize_no_tokens(b: &mut test::Bencher) {
b.iter(|| scorrect::tokenize(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>()); b.iter(|| scorrect::identifier::tokenize(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
} }
#[bench] #[bench]
fn tokenize_single_token(b: &mut test::Bencher) { fn tokenize_single_token(b: &mut test::Bencher) {
b.iter(|| scorrect::tokenize(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>()); b.iter(|| scorrect::identifier::tokenize(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
} }
#[bench] #[bench]
fn tokenize_sherlock(b: &mut test::Bencher) { fn tokenize_sherlock(b: &mut test::Bencher) {
b.iter(|| scorrect::tokenize(data::SHERLOCK.as_bytes()).collect::<Vec<_>>()); b.iter(|| scorrect::identifier::tokenize(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
} }
#[bench] #[bench]
fn tokenize_code(b: &mut test::Bencher) { fn tokenize_code(b: &mut test::Bencher) {
b.iter(|| scorrect::tokenize(data::CODE.as_bytes()).collect::<Vec<_>>()); b.iter(|| scorrect::identifier::tokenize(data::CODE.as_bytes()).collect::<Vec<_>>());
} }
#[bench] #[bench]
fn tokenize_corpus(b: &mut test::Bencher) { fn tokenize_corpus(b: &mut test::Bencher) {
b.iter(|| scorrect::tokenize(data::CORPUS.as_bytes()).collect::<Vec<_>>()); b.iter(|| scorrect::identifier::tokenize(data::CORPUS.as_bytes()).collect::<Vec<_>>());
} }

18
src/dict.rs Normal file
View file

@ -0,0 +1,18 @@
include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
pub struct Dictionary {
}
impl Dictionary {
pub fn new() -> Self {
Dictionary { }
}
pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> {
DICTIONARY.get(word).map(|s| *s)
}
pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s [u8]> {
std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| s.as_bytes())
}
}

74
src/identifier.rs Normal file
View file

@ -0,0 +1,74 @@
/// A slice of the scanned input together with the position it was found at.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'t> {
    /// The matched bytes, borrowed from the scanned buffer.
    pub token: &'t [u8],
    /// Index at which `token` begins within the scanned buffer.
    pub offset: usize,
}

impl<'t> Token<'t> {
    /// Pairs the bytes of a token with its starting offset.
    pub fn new(token: &'t [u8], offset: usize) -> Self {
        Token { token, offset }
    }
}
pub fn tokenize(content: &[u8]) -> impl Iterator<Item=Token> {
lazy_static::lazy_static! {
static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
}
SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
}
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `tokenize` over `input` and gathers every token it yields.
    fn collect_tokens(input: &[u8]) -> Vec<Token> {
        tokenize(input).collect()
    }

    /// Empty input produces no tokens.
    #[test]
    fn tokenize_empty_is_empty() {
        let expected: Vec<Token> = Vec::new();
        assert_eq!(expected, collect_tokens(b""));
    }

    /// A lone word comes back as a single token at offset 0.
    #[test]
    fn tokenize_word_is_word() {
        let expected = vec![Token::new(b"word", 0)];
        assert_eq!(expected, collect_tokens(b"word"));
    }

    /// A space splits the input into two tokens.
    #[test]
    fn tokenize_space_separated_words() {
        let expected = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
        assert_eq!(expected, collect_tokens(b"A B"));
    }

    /// A dot splits the input into two tokens.
    #[test]
    fn tokenize_dot_separated_words() {
        let expected = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
        assert_eq!(expected, collect_tokens(b"A.B"));
    }

    /// A `::` path separator splits the input; the second token starts after it.
    #[test]
    fn tokenize_namespace_separated_words() {
        let expected = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
        assert_eq!(expected, collect_tokens(b"A::B"));
    }

    /// An underscore is part of the token rather than a separator.
    #[test]
    fn tokenize_underscore_doesnt_separate() {
        let expected = vec![Token::new(b"A_B", 0)];
        assert_eq!(expected, collect_tokens(b"A_B"));
    }
}

View file

@ -1,60 +1,33 @@
#[macro_use] #[macro_use]
extern crate serde_derive; extern crate serde_derive;
mod dict;
pub mod identifier;
pub mod report;
pub use crate::dict::*;
use std::fs::File; use std::fs::File;
use std::io::Read; use std::io::Read;
include!(concat!(env!("OUT_DIR"), "/codegen.rs")); pub fn process_file(path: &std::path::Path, dictionary: &Dictionary, report: report::Report) -> Result<(), failure::Error> {
/// A slice of the scanned input together with the position it was found at.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'t> {
/// The matched bytes, borrowed from the scanned buffer.
pub token: &'t [u8],
/// Index at which `token` begins within the scanned buffer.
pub offset: usize,
}
impl<'t> Token<'t> {
/// Pairs the bytes of a token with its starting offset.
pub fn new(token: &'t [u8], offset: usize) -> Self {
Self {
token,
offset,
}
}
}
/// Splits `content` into identifier-like tokens.
///
/// Each regex match is yielded as a `Token` holding the matched bytes and the
/// byte offset at which the match starts in `content`.
pub fn tokenize(content: &[u8]) -> impl Iterator<Item=Token> {
lazy_static::lazy_static! {
// Matches runs of alphabetic characters, digits and underscores bounded by word boundaries.
static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
}
SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
}
/// One detected misspelling, as handed to a `Report` callback.
#[derive(Debug, Serialize)]
pub struct Message<'m> {
/// File in which the word was found.
path: &'m std::path::Path,
/// Raw bytes of the line containing the word; skipped during serialization.
#[serde(skip)]
line: &'m [u8],
/// Number of the line containing the word.
line_num: usize,
/// Position of the word within `line`.
col_num: usize,
/// The word as it appears in the file.
word: &'m str,
/// The suggested replacement for `word`.
correction: &'m str,
}
pub fn process_file(path: &std::path::Path, dictionary: &Corrections, report: Report) -> Result<(), failure::Error> {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
File::open(path)?.read_to_end(&mut buffer)?; File::open(path)?.read_to_end(&mut buffer)?;
for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() { for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
let line_num = line_idx + 1; let line_num = line_idx + 1;
for token in tokenize(line) { for token in identifier::tokenize(line) {
if let Some(word) = std::str::from_utf8(token.token).ok() { if let Some(word) = std::str::from_utf8(token.token).ok() {
if let Some(correction) = dictionary.correct_str(word) { if let Some(correction) = dictionary.correct_str(word) {
let col_num = token.offset; let col_num = token.offset;
let msg = Message { let msg = report::Message {
path, path,
line, line,
line_num, line_num,
col_num, col_num,
word, word,
correction, correction,
non_exhaustive: (),
}; };
report(msg); report(msg);
} }
@ -65,100 +38,3 @@ pub fn process_file(path: &std::path::Path, dictionary: &Corrections, report: Re
Ok(()) Ok(())
} }
/// Callback signature used to report one `Message`; the `print_*` functions all match it.
pub type Report = fn(msg: Message);
/// Reporter that discards the message and prints nothing.
pub fn print_silent(_: Message) {
}
/// Prints a one-line summary to standard output in the form
/// `path:line:column: word -> correction`.
pub fn print_brief(msg: Message) {
println!("{}:{}:{}: {} -> {}", msg.path.display(), msg.line_num, msg.col_num, msg.word, msg.correction);
}
/// Prints a multi-line report to standard output: the error, its location,
/// the offending line, and a `^` underline beneath the misspelled word.
pub fn print_long(msg: Message) {
let line_num = msg.line_num.to_string();
// Blank gutter as wide as the printed line number.
let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();
// NOTE(review): both widths below are byte counts (`col_num`, `word.len()`),
// so the underline can drift when multi-byte characters are involved.
let hl_indent: String = itertools::repeat_n(" ", msg.col_num).collect();
let hl: String = itertools::repeat_n("^", msg.word.len()).collect();
println!("error: `{}` should be `{}`", msg.word, msg.correction);
println!(" --> {}:{}:{}", msg.path.display(), msg.line_num, msg.col_num);
println!("{} |", line_indent);
// The raw line is decoded lossily and stripped of trailing whitespace for display.
println!("{} | {}", msg.line_num, String::from_utf8_lossy(msg.line).trim_end());
println!("{} | {}{}", line_indent, hl_indent, hl);
println!("{} |", line_indent);
}
/// Prints the message to standard output serialized as JSON.
///
/// Panics if serialization fails (the result is unwrapped).
pub fn print_json(msg: Message) {
println!("{}", serde_json::to_string(&msg).unwrap());
}
/// Lookup handle over the `DICTIONARY` table; holds no state of its own.
pub struct Corrections {
}
impl Corrections {
/// Creates a lookup handle.
pub fn new() -> Self {
Corrections { }
}
/// Returns the correction recorded for `word`, or `None` when there is no entry.
pub fn correct_str<'s>(&'s self, word: &str) -> Option<&'s str> {
DICTIONARY.get(word).map(|s| *s)
}
/// Byte-slice variant of `correct_str`; input that is not valid UTF-8 yields `None`.
pub fn correct_bytes<'s>(&'s self, word: &[u8]) -> Option<&'s [u8]> {
std::str::from_utf8(word).ok().and_then(|word| DICTIONARY.get(word)).map(|s| s.as_bytes())
}
}
// Unit tests for `tokenize`: each case compares the collected tokens with a hand-written list.
#[cfg(test)]
mod test {
use super::*;
// Empty input produces no tokens.
#[test]
fn tokenize_empty_is_empty() {
let input = b"";
let expected: Vec<Token> = vec![];
let actual: Vec<_> = tokenize(input).collect();
assert_eq!(expected, actual);
}
// A lone word comes back as a single token at offset 0.
#[test]
fn tokenize_word_is_word() {
let input = b"word";
let expected: Vec<Token> = vec![Token::new(b"word", 0)];
let actual: Vec<_> = tokenize(input).collect();
assert_eq!(expected, actual);
}
// A space splits the input into two tokens.
#[test]
fn tokenize_space_separated_words() {
let input = b"A B";
let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
let actual: Vec<_> = tokenize(input).collect();
assert_eq!(expected, actual);
}
// A dot splits the input into two tokens.
#[test]
fn tokenize_dot_separated_words() {
let input = b"A.B";
let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
let actual: Vec<_> = tokenize(input).collect();
assert_eq!(expected, actual);
}
// A `::` separator splits the input; the second token starts after it.
#[test]
fn tokenize_namespace_separated_words() {
let input = b"A::B";
let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
let actual: Vec<_> = tokenize(input).collect();
assert_eq!(expected, actual);
}
// An underscore is part of the token rather than a separator.
#[test]
fn tokenize_underscore_doesnt_separate() {
let input = b"A_B";
let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
let actual: Vec<_> = tokenize(input).collect();
assert_eq!(expected, actual);
}
}

View file

@ -15,12 +15,12 @@ arg_enum!{
} }
impl Format { impl Format {
fn report(self) -> scorrect::Report { fn report(self) -> scorrect::report::Report {
match self { match self {
Format::Silent => scorrect::print_silent, Format::Silent => scorrect::report::print_silent,
Format::Brief => scorrect::print_brief, Format::Brief => scorrect::report::print_brief,
Format::Long => scorrect::print_long, Format::Long => scorrect::report::print_long,
Format::Json => scorrect::print_json, Format::Json => scorrect::report::print_json,
} }
} }
} }
@ -63,7 +63,7 @@ impl Options {
fn run() -> Result<(), failure::Error> { fn run() -> Result<(), failure::Error> {
let options = Options::from_args().infer(); let options = Options::from_args().infer();
let dictionary = scorrect::Corrections::new(); let dictionary = scorrect::Dictionary::new();
let first_path = &options.path.get(0).expect("arg parsing enforces at least one"); let first_path = &options.path.get(0).expect("arg parsing enforces at least one");
let mut walk = ignore::WalkBuilder::new(first_path); let mut walk = ignore::WalkBuilder::new(first_path);

40
src/report.rs Normal file
View file

@ -0,0 +1,40 @@
/// One detected misspelling, as handed to a `Report` callback.
#[derive(Debug, Serialize)]
pub struct Message<'m> {
/// File in which the word was found.
pub path: &'m std::path::Path,
/// Raw bytes of the line containing the word; skipped during serialization.
#[serde(skip)]
pub line: &'m [u8],
/// Number of the line containing the word.
pub line_num: usize,
/// Position of the word within `line`.
pub col_num: usize,
/// The word as it appears in the file.
pub word: &'m str,
/// The suggested replacement for `word`.
pub correction: &'m str,
// Crate-private, so code outside this crate cannot build a `Message` with a
// struct literal; presumably this keeps the door open for adding fields later.
// Skipped during serialization.
#[serde(skip)]
pub(crate) non_exhaustive: (),
}
/// Callback signature used to report one `Message`; the `print_*` functions all match it.
pub type Report = fn(msg: Message);
/// Reporter that drops the message without producing any output.
pub fn print_silent(_: Message) {}
/// Prints a one-line summary of the correction to standard output, in the
/// form `path:line:column: word -> correction`.
pub fn print_brief(msg: Message) {
    let location = msg.path.display();
    println!(
        "{}:{}:{}: {} -> {}",
        location, msg.line_num, msg.col_num, msg.word, msg.correction
    );
}
pub fn print_long(msg: Message) {
let line_num = msg.line_num.to_string();
let line_indent: String = itertools::repeat_n(" ", line_num.len()).collect();
let hl_indent: String = itertools::repeat_n(" ", msg.col_num).collect();
let hl: String = itertools::repeat_n("^", msg.word.len()).collect();
println!("error: `{}` should be `{}`", msg.word, msg.correction);
println!(" --> {}:{}:{}", msg.path.display(), msg.line_num, msg.col_num);
println!("{} |", line_indent);
println!("{} | {}", msg.line_num, String::from_utf8_lossy(msg.line).trim_end());
println!("{} | {}{}", line_indent, hl_indent, hl);
println!("{} |", line_indent);
}
/// Prints the message to standard output serialized as JSON, followed by a
/// newline.
///
/// # Panics
///
/// Panics if the message cannot be serialized.
pub fn print_json(msg: Message) {
    let encoded = serde_json::to_string(&msg).unwrap();
    println!("{}", encoded);
}