diff --git a/crates/typos/src/checks.rs b/crates/typos/src/checks.rs
index 62bae7d..027857c 100644
--- a/crates/typos/src/checks.rs
+++ b/crates/typos/src/checks.rs
@@ -3,7 +3,6 @@ use bstr::ByteSlice;
 use crate::report;
 use crate::tokens;
 use crate::Dictionary;
-use crate::Status;
 
 pub trait Check: Send + Sync {
     fn check_str(
@@ -172,44 +171,23 @@ impl Check for Typos {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_str(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_str(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
         }
         Ok(())
     }
@@ -217,46 +195,24 @@ impl Check for Typos {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_bytes(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_bytes(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
         }
-
         Ok(())
     }
@@ -284,16 +240,19 @@ impl Check for ParseIdentifiers {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_str(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for identifier in parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Identifier,
+                data: identifier.token(),
+            };
             reporter.report(msg.into())?;
         }
 
@@ -303,16 +262,19 @@ impl Check for ParseIdentifiers {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for identifier in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Identifier,
+                data: identifier.token(),
+            };
             reporter.report(msg.into())?;
         }
 
@@ -343,19 +305,17 @@ impl Check for ParseWords {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_str(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
 
@@ -365,19 +325,17 @@ impl Check for ParseWords {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_bytes(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
 
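Note: both `Check` implementations for `Typos` now delegate the identifier/word walking and `Status` matching to the new `TyposParser`, so the logic lives in one place. A minimal sketch of what that buys a caller that wants typo positions without the `Check`/`Report` plumbing — the `offsets_of_typos` helper is hypothetical; `Tokenizer::new()`, `ParserBuilder`, and the `Typo` fields are taken from this diff:

```rust
use typos::tokens::Tokenizer;

// Hypothetical helper, not part of the diff: collect the byte offset of
// every typo in a buffer by driving the new parser pipeline directly.
fn offsets_of_typos(buffer: &str, dictionary: &dyn typos::Dictionary) -> Vec<usize> {
    let tokenizer = Tokenizer::new();
    let parser = typos::ParserBuilder::new()
        .tokenizer(&tokenizer)
        .dictionary(dictionary)
        .typos();
    // Each yielded `Typo` carries the same fields `check_str` forwards into
    // `report::Typo`: `byte_offset`, `typo`, and `corrections`.
    parser.parse_str(buffer).map(|t| t.byte_offset).collect()
}
```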
diff --git a/crates/typos/src/dict.rs b/crates/typos/src/dict.rs
index 2fded93..083ebe2 100644
--- a/crates/typos/src/dict.rs
+++ b/crates/typos/src/dict.rs
@@ -47,3 +47,18 @@ pub trait Dictionary: Send + Sync {
 
     fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
 }
+
+pub(crate) struct NullDictionary;
+
+impl Dictionary for NullDictionary {
+    fn correct_ident<'s, 'w>(
+        &'s self,
+        _ident: crate::tokens::Identifier<'w>,
+    ) -> Option<Status<'s>> {
+        None
+    }
+
+    fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
+        None
+    }
+}
diff --git a/crates/typos/src/lib.rs b/crates/typos/src/lib.rs
index 1cb77c9..7c09efb 100644
--- a/crates/typos/src/lib.rs
+++ b/crates/typos/src/lib.rs
@@ -1,7 +1,9 @@
 mod dict;
+mod parser;
 
 pub mod checks;
 pub mod report;
 pub mod tokens;
 
-pub use crate::dict::*;
+pub use dict::*;
+pub use parser::*;
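`NullDictionary` encodes the "no opinion" answer: `None` from `correct_ident` makes `TyposParser` fall through to splitting the identifier into words, and `None` from `correct_word` reports nothing. It backs `ParserBuilder`'s default so `identifiers()`/`words()` work without a real dictionary. For reference, a toy implementation of the same three-way contract might look like the sketch below (hedged: the `Status::Corrections` variant is assumed from the wider crate, not shown in this diff):

```rust
use std::borrow::Cow;

// Toy dictionary illustrating the contract TyposParser relies on:
// Some(Status::Valid) accepts a token, Some(corrections) flags a typo,
// and None defers (identifiers then fall through to per-word checks).
struct OneWordDictionary;

impl typos::Dictionary for OneWordDictionary {
    fn correct_ident<'s, 'w>(
        &'s self,
        _ident: typos::tokens::Identifier<'w>,
    ) -> Option<typos::Status<'s>> {
        None // no opinion on whole identifiers
    }

    fn correct_word<'s, 'w>(&'s self, word: typos::tokens::Word<'w>) -> Option<typos::Status<'s>> {
        match word.token() {
            // `Status::Corrections` is assumed here; only Valid/Invalid appear in this diff.
            "teh" => Some(typos::Status::Corrections(vec![Cow::Borrowed("the")])),
            "the" => Some(typos::Status::Valid),
            _ => None, // unknown words are not reported
        }
    }
}
```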
diff --git a/crates/typos/src/parser.rs b/crates/typos/src/parser.rs
new file mode 100644
index 0000000..613fdad
--- /dev/null
+++ b/crates/typos/src/parser.rs
@@ -0,0 +1,164 @@
+use crate::tokens;
+use crate::Dictionary;
+
+#[derive(Clone)]
+pub struct ParserBuilder<'p, 'd> {
+    tokenizer: Option<&'p tokens::Tokenizer>,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p> ParserBuilder<'p, 'static> {
+    pub fn new() -> Self {
+        Default::default()
+    }
+}
+
+impl<'p, 'd> ParserBuilder<'p, 'd> {
+    pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
+        self.tokenizer = Some(tokenizer);
+        self
+    }
+
+    pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
+        ParserBuilder {
+            tokenizer: self.tokenizer,
+            dictionary,
+        }
+    }
+
+    pub fn typos(&self) -> TyposParser<'p, 'd> {
+        TyposParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+            dictionary: self.dictionary,
+        }
+    }
+
+    pub fn identifiers(&self) -> IdentifiersParser<'p> {
+        IdentifiersParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+
+    pub fn words(&self) -> WordsParser<'p> {
+        WordsParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+}
+
+impl<'p> Default for ParserBuilder<'p, 'static> {
+    fn default() -> Self {
+        Self {
+            tokenizer: None,
+            dictionary: &crate::NullDictionary,
+        }
+    }
+}
+
+static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
+    once_cell::sync::Lazy::new(|| tokens::Tokenizer::new());
+
+#[derive(Clone)]
+pub struct TyposParser<'p, 'd> {
+    tokenizer: &'p tokens::Tokenizer,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p, 'd> TyposParser<'p, 'd> {
+    pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_str(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_bytes(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    fn process_ident<'i, 's: 'i>(
+        &'s self,
+        ident: tokens::Identifier<'i>,
+    ) -> impl Iterator<Item = Typo<'i>> {
+        match self.dictionary.correct_ident(ident) {
+            Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: ident.offset(),
+                    typo: ident.token(),
+                    corrections,
+                };
+                itertools::Either::Left(Some(typo).into_iter())
+            }
+            None => itertools::Either::Right(
+                ident
+                    .split()
+                    .filter_map(move |word| self.process_word(word)),
+            ),
+        }
+    }
+
+    fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
+        match self.dictionary.correct_word(word) {
+            Some(crate::Status::Valid) => None,
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: word.offset(),
+                    typo: word.token(),
+                    corrections,
+                };
+                Some(typo)
+            }
+            None => None,
+        }
+    }
+}
+
+#[derive(Clone, Debug, derive_setters::Setters)]
+#[non_exhaustive]
+pub struct Typo<'m> {
+    pub byte_offset: usize,
+    pub typo: &'m str,
+    pub corrections: crate::Status<'m>,
+}
+
+impl<'m> Default for Typo<'m> {
+    fn default() -> Self {
+        Self {
+            byte_offset: 0,
+            typo: "",
+            corrections: crate::Status::Invalid,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct IdentifiersParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> IdentifiersParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_str(buffer)
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_bytes(buffer)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct WordsParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> WordsParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_str(buffer).flat_map(|i| i.split())
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_bytes(buffer).flat_map(|i| i.split())
+    }
+}
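The builder's two defaults make the zero-config path work: an unset tokenizer falls back to the shared `DEFAULT_TOKENIZER`, and the `'static` `NullDictionary` means `typos()` simply yields nothing until a real dictionary is supplied. Note also that `dictionary()` returns a rebuilt `ParserBuilder<'p, 'd1>` rather than mutating in place, which is what lets the dictionary lifetime change from `'static` to the caller's. A small usage sketch; the listed tokens are what the default tokenizer is expected to produce, not verified here:

```rust
fn main() {
    // No tokenizer, no dictionary: the builder falls back to
    // DEFAULT_TOKENIZER and NullDictionary.
    let parser = typos::ParserBuilder::new().words();
    for word in parser.parse_str("snake_case CamelCase") {
        // Expected with the default tokenizer: "snake", "case", "Camel",
        // "Case", each with its byte offset into the buffer.
        println!("{} @ {}", word.token(), word.offset());
    }
}
```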
diff --git a/crates/typos/src/report.rs b/crates/typos/src/report.rs
index d2d7ce9..bce2d3d 100644
--- a/crates/typos/src/report.rs
+++ b/crates/typos/src/report.rs
@@ -168,7 +168,7 @@ pub struct Parse<'m> {
     #[serde(flatten)]
     pub context: Option<Context<'m>>,
     pub kind: ParseKind,
-    pub data: Vec<&'m str>,
+    pub data: &'m str,
 }
 
 impl<'m> Default for Parse<'m> {
@@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> {
         Self {
             context: None,
             kind: ParseKind::Identifier,
-            data: vec![],
+            data: "",
         }
     }
 }
@@ -265,7 +265,7 @@ impl Report for PrintBrief {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
@@ -289,7 +289,7 @@ impl Report for PrintLong {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
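With `Parse::data` narrowed from `Vec<&str>` to a single `&str`, parse output is streamed one token per message, and the old `if !msg.data.is_empty()` batching guard in checks.rs disappears with it. A sketch of what a downstream reporter sees now; the `Report` method signature is assumed from the `reporter.report(msg.into())?` calls in checks.rs, and `CountTokens` is hypothetical:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Hypothetical reporter that tallies parsed tokens: it now receives one
// `Parse` message per token instead of one message carrying a joined Vec.
#[derive(Default)]
struct CountTokens(AtomicUsize);

impl typos::report::Report for CountTokens {
    // Signature assumed: a Result<(), std::io::Error> return matches the
    // `?` uses in checks.rs.
    fn report(&self, msg: typos::report::Message) -> Result<(), std::io::Error> {
        if let typos::report::Message::Parse(parse) = &msg {
            // Exactly one token per message now: `parse.data` is a &str.
            self.0.fetch_add(1, Ordering::Relaxed);
        }
        Ok(())
    }
}
```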