From 663eb94d32a980ede97039a67cbe416e736f06b1 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Thu, 31 Dec 2020 17:41:32 -0600 Subject: [PATCH] refactor: Switch Typos to check_file --- benches/checks.rs | 236 +++++++++++++++--------------- src/checks.rs | 358 ++++++++++++---------------------------------- 2 files changed, 212 insertions(+), 382 deletions(-) diff --git a/benches/checks.rs b/benches/checks.rs index 3379392..dcb8dc2 100644 --- a/benches/checks.rs +++ b/benches/checks.rs @@ -7,129 +7,178 @@ mod data; use assert_fs::prelude::*; use typos_cli::checks::Check; -fn bench_parse_ident_str(data: &str, b: &mut test::Bencher) { +fn bench_files(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); let parser = typos::tokens::Tokenizer::new(); - let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser(); - b.iter(|| checks.check_str(data, &parser, &corrections, &typos_cli::report::PrintSilent)); -} - -#[bench] -fn parse_idents_empty_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::EMPTY, b); -} - -#[bench] -fn parse_idents_no_tokens_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::NO_TOKENS, b); -} - -#[bench] -fn parse_idents_single_token_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::SINGLE_TOKEN, b); -} - -#[bench] -fn parse_idents_sherlock_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::SHERLOCK, b); -} - -#[bench] -fn parse_idents_code_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::CODE, b); -} - -#[bench] -fn parse_idents_corpus_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::CORPUS, b); -} - -fn bench_parse_ident_bytes(data: &str, b: &mut test::Bencher) { - let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Tokenizer::new(); - let checks = 
typos_cli::checks::TyposSettings::new().build_identifier_parser(); + let checks = typos_cli::checks::TyposSettings::new().build_files(); b.iter(|| { - checks.check_bytes( - data.as_bytes(), + checks.check_file( + sample_path.path(), + true, &parser, &corrections, &typos_cli::report::PrintSilent, ) }); + + temp.close().unwrap(); } #[bench] -fn parse_idents_empty_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::EMPTY, b); +fn files_empty(b: &mut test::Bencher) { + bench_files(data::EMPTY, b); } #[bench] -fn parse_idents_no_tokens_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::NO_TOKENS, b); +fn files_no_tokens(b: &mut test::Bencher) { + bench_files(data::NO_TOKENS, b); } #[bench] -fn parse_idents_single_token_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::SINGLE_TOKEN, b); +fn files_single_token(b: &mut test::Bencher) { + bench_files(data::SINGLE_TOKEN, b); } #[bench] -fn parse_idents_sherlock_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::SHERLOCK, b); +fn files_sherlock(b: &mut test::Bencher) { + bench_files(data::SHERLOCK, b); } #[bench] -fn parse_idents_code_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::CODE, b); +fn files_code(b: &mut test::Bencher) { + bench_files(data::CODE, b); } #[bench] -fn parse_idents_corpus_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::CORPUS, b); +fn files_corpus(b: &mut test::Bencher) { + bench_files(data::CORPUS, b); } -fn bench_parse_word_str(data: &str, b: &mut test::Bencher) { +fn bench_identifiers(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); + let parser = typos::tokens::Tokenizer::new(); + let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser(); + b.iter(|| { + checks.check_file( + sample_path.path(), + true, + &parser, 
+ &corrections, + &typos_cli::report::PrintSilent, + ) + }); + + temp.close().unwrap(); +} + +#[bench] +fn identifiers_empty(b: &mut test::Bencher) { + bench_identifiers(data::EMPTY, b); +} + +#[bench] +fn identifiers_no_tokens(b: &mut test::Bencher) { + bench_identifiers(data::NO_TOKENS, b); +} + +#[bench] +fn identifiers_single_token(b: &mut test::Bencher) { + bench_identifiers(data::SINGLE_TOKEN, b); +} + +#[bench] +fn identifiers_sherlock(b: &mut test::Bencher) { + bench_identifiers(data::SHERLOCK, b); +} + +#[bench] +fn identifiers_code(b: &mut test::Bencher) { + bench_identifiers(data::CODE, b); +} + +#[bench] +fn identifiers_corpus(b: &mut test::Bencher) { + bench_identifiers(data::CORPUS, b); +} + +fn bench_words(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); let parser = typos::tokens::Tokenizer::new(); let checks = typos_cli::checks::TyposSettings::new().build_word_parser(); - b.iter(|| checks.check_str(data, &parser, &corrections, &typos_cli::report::PrintSilent)); + b.iter(|| { + checks.check_file( + sample_path.path(), + true, + &parser, + &corrections, + &typos_cli::report::PrintSilent, + ) + }); + + temp.close().unwrap(); } #[bench] -fn parse_words_empty(b: &mut test::Bencher) { - bench_parse_word_str(data::EMPTY, b); +fn words_empty(b: &mut test::Bencher) { + bench_words(data::EMPTY, b); } #[bench] -fn parse_words_no_tokens(b: &mut test::Bencher) { - bench_parse_word_str(data::NO_TOKENS, b); +fn words_no_tokens(b: &mut test::Bencher) { + bench_words(data::NO_TOKENS, b); } #[bench] -fn parse_words_single_token(b: &mut test::Bencher) { - bench_parse_word_str(data::SINGLE_TOKEN, b); +fn words_single_token(b: &mut test::Bencher) { + bench_words(data::SINGLE_TOKEN, b); } #[bench] -fn parse_words_sherlock(b: &mut test::Bencher) { - 
bench_parse_word_str(data::SHERLOCK, b); +fn words_sherlock(b: &mut test::Bencher) { + bench_words(data::SHERLOCK, b); } #[bench] -fn parse_words_code(b: &mut test::Bencher) { - bench_parse_word_str(data::CODE, b); +fn words_code(b: &mut test::Bencher) { + bench_words(data::CODE, b); } #[bench] -fn parse_words_corpus(b: &mut test::Bencher) { - bench_parse_word_str(data::CORPUS, b); +fn words_corpus(b: &mut test::Bencher) { + bench_words(data::CORPUS, b); } fn bench_typos(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); let parser = typos::tokens::Tokenizer::new(); let checks = typos_cli::checks::TyposSettings::new().build_typos(); - b.iter(|| checks.check_str(data, &parser, &corrections, &typos_cli::report::PrintSilent)); + b.iter(|| { + checks.check_file( + sample_path.path(), + true, + &parser, + &corrections, + &typos_cli::report::PrintSilent, + ) + }); + + temp.close().unwrap(); } #[bench] @@ -161,54 +210,3 @@ fn typos_code(b: &mut test::Bencher) { fn typos_corpus(b: &mut test::Bencher) { bench_typos(data::CORPUS, b); } - -fn bench_check_file(data: &str, b: &mut test::Bencher) { - let temp = assert_fs::TempDir::new().unwrap(); - let sample_path = temp.child("sample"); - sample_path.write_str(data).unwrap(); - - let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Tokenizer::new(); - let checks = typos_cli::checks::TyposSettings::new().build_typos(); - b.iter(|| { - checks.check_file_content( - sample_path.path(), - true, - &parser, - &corrections, - &typos_cli::report::PrintSilent, - ) - }); - - temp.close().unwrap(); -} - -#[bench] -fn check_file_empty(b: &mut test::Bencher) { - bench_check_file(data::EMPTY, b); -} - -#[bench] -fn check_file_no_tokens(b: &mut test::Bencher) { - bench_check_file(data::NO_TOKENS, b); -} - 
-#[bench] -fn check_file_single_token(b: &mut test::Bencher) { - bench_check_file(data::SINGLE_TOKEN, b); -} - -#[bench] -fn check_file_sherlock(b: &mut test::Bencher) { - bench_check_file(data::SHERLOCK, b); -} - -#[bench] -fn check_file_code(b: &mut test::Bencher) { - bench_check_file(data::CODE, b); -} - -#[bench] -fn check_file_corpus(b: &mut test::Bencher) { - bench_check_file(data::CORPUS, b); -} diff --git a/src/checks.rs b/src/checks.rs index ea8e5cb..65d7e48 100644 --- a/src/checks.rs +++ b/src/checks.rs @@ -5,82 +5,6 @@ use typos::tokens; use typos::Dictionary; pub trait Check: Send + Sync { - fn check_str( - &self, - buffer: &str, - parser: &tokens::Tokenizer, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error>; - - fn check_bytes( - &self, - buffer: &[u8], - parser: &tokens::Tokenizer, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error>; - - fn check_filenames(&self) -> bool; - - fn check_files(&self) -> bool; - - fn binary(&self) -> bool; - - fn check_filename( - &self, - path: &std::path::Path, - parser: &tokens::Tokenizer, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - if !self.check_filenames() { - return Ok(()); - } - - if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { - let context_reporter = ReportContext { - reporter, - context: report::PathContext { path }.into(), - }; - self.check_str(file_name, parser, dictionary, &context_reporter)?; - } - - Ok(()) - } - - fn check_file_content( - &self, - path: &std::path::Path, - explicit: bool, - parser: &tokens::Tokenizer, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - if !self.check_files() { - return Ok(()); - } - - let buffer = read_file(path, reporter)?; - let (buffer, content_type) = massage_data(buffer)?; - if !explicit && !self.binary() && content_type.is_binary() { - let msg = 
report::BinaryFile { path }; - reporter.report(msg.into())?; - return Ok(()); - } - - for (line_idx, line) in buffer.lines().enumerate() { - let line_num = line_idx + 1; - let context_reporter = ReportContext { - reporter, - context: report::FileContext { path, line_num }.into(), - }; - self.check_bytes(line, parser, dictionary, &context_reporter)?; - } - - Ok(()) - } - fn check_file( &self, path: &std::path::Path, @@ -88,23 +12,7 @@ pub trait Check: Send + Sync { parser: &tokens::Tokenizer, dictionary: &dyn Dictionary, reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - self.check_filename(path, parser, dictionary, reporter)?; - self.check_file_content(path, explicit, parser, dictionary, reporter)?; - Ok(()) - } -} - -struct ReportContext<'m, 'r> { - reporter: &'r dyn report::Report, - context: report::Context<'m>, -} - -impl<'m, 'r> report::Report for ReportContext<'m, 'r> { - fn report(&self, msg: report::Message) -> Result<(), std::io::Error> { - let msg = msg.context(Some(self.context.clone())); - self.reporter.report(msg) - } + ) -> Result<(), std::io::Error>; } #[derive(Debug, Clone, PartialEq, Eq)] @@ -183,9 +91,10 @@ pub struct Typos { } impl Check for Typos { - fn check_str( + fn check_file( &self, - buffer: &str, + path: &std::path::Path, + explicit: bool, tokenizer: &tokens::Tokenizer, dictionary: &dyn Dictionary, reporter: &dyn report::Report, @@ -194,54 +103,47 @@ impl Check for Typos { .tokenizer(tokenizer) .dictionary(dictionary) .typos(); - for typo in parser.parse_str(buffer) { - let msg = report::Typo { - context: None, - buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()), - byte_offset: typo.byte_offset, - typo: typo.typo, - corrections: typo.corrections, - }; - reporter.report(msg.into())?; + + if self.check_filenames { + if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { + for typo in parser.parse_str(file_name) { + let msg = report::Typo { + context: Some(report::PathContext { path }.into()), + buffer: 
std::borrow::Cow::Borrowed(file_name.as_bytes()), + byte_offset: typo.byte_offset, + typo: typo.typo, + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } } - Ok(()) - } - fn check_bytes( - &self, - buffer: &[u8], - tokenizer: &tokens::Tokenizer, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - let parser = typos::ParserBuilder::new() - .tokenizer(tokenizer) - .dictionary(dictionary) - .typos(); - for typo in parser.parse_bytes(buffer) { - let msg = report::Typo { - context: None, - buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()), - byte_offset: typo.byte_offset, - typo: typo.typo, - corrections: typo.corrections, - }; - reporter.report(msg.into())?; + if self.check_files { + let buffer = read_file(path, reporter)?; + let (buffer, content_type) = massage_data(buffer)?; + if !explicit && !self.binary && content_type.is_binary() { + let msg = report::BinaryFile { path }; + reporter.report(msg.into())?; + } else { + let mut accum_line_num = AccumulateLineNum::new(); + for typo in parser.parse_bytes(&buffer) { + let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); + let (line, line_offset) = extract_line(&buffer, typo.byte_offset); + let msg = report::Typo { + context: Some(report::FileContext { path, line_num }.into()), + buffer: std::borrow::Cow::Borrowed(line), + byte_offset: line_offset, + typo: typo.typo, + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } } + Ok(()) } - - fn check_filenames(&self) -> bool { - self.check_filenames - } - - fn check_files(&self) -> bool { - self.check_files - } - - fn binary(&self) -> bool { - self.binary - } } #[derive(Debug, Clone)] @@ -252,26 +154,6 @@ pub struct Identifiers { } impl Check for Identifiers { - fn check_str( - &self, - _buffer: &str, - _tokenizer: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - - fn 
check_bytes( - &self, - _buffer: &[u8], - _tokenizer: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - fn check_file( &self, path: &std::path::Path, @@ -284,7 +166,7 @@ impl Check for Identifiers { .tokenizer(tokenizer) .identifiers(); - if self.check_filenames() { + if self.check_filenames { if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { for word in parser.parse_str(file_name) { let msg = report::Parse { @@ -297,16 +179,20 @@ impl Check for Identifiers { } } - if self.check_files() { + if self.check_files { let buffer = read_file(path, reporter)?; let (buffer, content_type) = massage_data(buffer)?; - if !explicit && !self.binary() && content_type.is_binary() { + if !explicit && !self.binary && content_type.is_binary() { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { for word in parser.parse_bytes(&buffer) { + // HACK: Don't look up the line_num per entry to better match the performance + // of Typos for comparison purposes. We don't really get much out of it + // anyway. 
+ let line_num = 0; let msg = report::Parse { - context: Some(report::FileContext { path, line_num: 0 }.into()), + context: Some(report::FileContext { path, line_num }.into()), kind: report::ParseKind::Identifier, data: word.token(), }; @@ -317,18 +203,6 @@ impl Check for Identifiers { Ok(()) } - - fn check_filenames(&self) -> bool { - self.check_filenames - } - - fn check_files(&self) -> bool { - self.check_files - } - - fn binary(&self) -> bool { - self.binary - } } #[derive(Debug, Clone)] @@ -339,26 +213,6 @@ pub struct Words { } impl Check for Words { - fn check_str( - &self, - _buffer: &str, - _tokenizer: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - - fn check_bytes( - &self, - _buffer: &[u8], - _tokenizer: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - fn check_file( &self, path: &std::path::Path, @@ -369,7 +223,7 @@ impl Check for Words { ) -> Result<(), std::io::Error> { let parser = typos::ParserBuilder::new().tokenizer(tokenizer).words(); - if self.check_filenames() { + if self.check_filenames { if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { for word in parser.parse_str(file_name) { let msg = report::Parse { @@ -382,16 +236,20 @@ impl Check for Words { } } - if self.check_files() { + if self.check_files { let buffer = read_file(path, reporter)?; let (buffer, content_type) = massage_data(buffer)?; - if !explicit && !self.binary() && content_type.is_binary() { + if !explicit && !self.binary && content_type.is_binary() { let msg = report::BinaryFile { path }; reporter.report(msg.into())?; } else { for word in parser.parse_bytes(&buffer) { + // HACK: Don't look up the line_num per entry to better match the performance + // of Typos for comparison purposes. We don't really get much out of it + // anyway. 
+ let line_num = 0; let msg = report::Parse { - context: Some(report::FileContext { path, line_num: 0 }.into()), + context: Some(report::FileContext { path, line_num }.into()), kind: report::ParseKind::Word, data: word.token(), }; @@ -402,18 +260,6 @@ impl Check for Words { Ok(()) } - - fn check_filenames(&self) -> bool { - self.check_filenames - } - - fn check_files(&self) -> bool { - self.check_files - } - - fn binary(&self) -> bool { - self.binary - } } #[derive(Debug, Clone)] @@ -422,59 +268,6 @@ pub struct FoundFiles { } impl Check for FoundFiles { - fn check_str( - &self, - _buffer: &str, - _parser: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - - fn check_bytes( - &self, - _buffer: &[u8], - _parser: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - - fn check_filenames(&self) -> bool { - true - } - - fn check_files(&self) -> bool { - true - } - - fn binary(&self) -> bool { - self.binary - } - - fn check_filename( - &self, - _path: &std::path::Path, - _parser: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - - fn check_file_content( - &self, - _path: &std::path::Path, - _explicit: bool, - _parser: &tokens::Tokenizer, - _dictionary: &dyn Dictionary, - _reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - Ok(()) - } - fn check_file( &self, path: &std::path::Path, @@ -533,6 +326,45 @@ fn massage_data( Ok((buffer, content_type)) } +struct AccumulateLineNum { + line_num: usize, + last_offset: usize, +} + +impl AccumulateLineNum { + fn new() -> Self { + Self { + // 1-indexed + line_num: 1, + last_offset: 0, + } + } + + fn line_num(&mut self, buffer: &[u8], byte_offset: usize) -> usize { + assert!(self.last_offset <= byte_offset); + let slice = &buffer[self.last_offset..byte_offset]; + let 
newlines = slice.iter().filter(|&&b| b == b'\n').count(); // newline bytes, not line segments + let line_num = self.line_num + newlines; + self.line_num = line_num; + self.last_offset = byte_offset; + line_num + } +} + +fn extract_line(buffer: &[u8], byte_offset: usize) -> (&[u8], usize) { + let line_start = buffer[0..byte_offset] + .rfind_byte(b'\n') + // Skip the newline + .map(|s| s + 1) + .unwrap_or(0); + let line = buffer[line_start..] + .lines() + .next() + .expect("should always be at least a line"); + let line_offset = byte_offset - line_start; + (line, line_offset) +} + pub fn check_path( walk: ignore::Walk, checks: &dyn Check,