refactor(typos): Decouple parsing from checks

Author: Ed Page
Date:   2020-12-30 18:58:35 -06:00
Commit: e741f96de3
Parent: 1e64080c05

5 changed files with 250 additions and 111 deletions
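
The refactor moves the tokenize-and-lookup loop out of each Check impl and into a reusable parser layer (the new crates/typos/src/parser.rs below). A minimal sketch of the resulting call pattern, assuming a tokenizer, dictionary, and buffer are in scope as they are in the checks:

    // Wire the pieces together once; each check below now follows this shape.
    let parser = crate::ParserBuilder::new()
        .tokenizer(tokenizer)    // &tokens::Tokenizer
        .dictionary(dictionary)  // &dyn Dictionary
        .typos();
    // TyposParser yields Typo { byte_offset, typo, corrections } items,
    // already narrowed to actual misspellings.
    for typo in parser.parse_str(buffer) {
        println!("{}: {}", typo.byte_offset, typo.typo);
    }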

crates/typos/src/checks.rs

@@ -3,7 +3,6 @@ use bstr::ByteSlice;
 use crate::report;
 use crate::tokens;
 use crate::Dictionary;
-use crate::Status;
 
 pub trait Check: Send + Sync {
     fn check_str(
@@ -172,44 +171,23 @@ impl Check for Typos {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_str(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_str(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
         }
         Ok(())
     }
@@ -217,46 +195,24 @@ impl Check for Typos {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_bytes(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_bytes(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
         }
         Ok(())
     }
@@ -284,16 +240,19 @@ impl Check for ParseIdentifiers {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_str(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for word in parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -303,16 +262,19 @@ impl Check for ParseIdentifiers {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
        _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -343,19 +305,17 @@ impl Check for ParseWords {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_str(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let word_parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in word_parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -365,19 +325,17 @@ impl Check for ParseWords {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_bytes(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }

crates/typos/src/dict.rs

@ -47,3 +47,18 @@ pub trait Dictionary: Send + Sync {
fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>; fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
} }
pub(crate) struct NullDictionary;
impl Dictionary for NullDictionary {
fn correct_ident<'s, 'w>(
&'s self,
_ident: crate::tokens::Identifier<'w>,
) -> Option<Status<'s>> {
None
}
fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
None
}
}

crates/typos/src/lib.rs

@ -1,7 +1,9 @@
mod dict; mod dict;
mod parser;
pub mod checks; pub mod checks;
pub mod report; pub mod report;
pub mod tokens; pub mod tokens;
pub use crate::dict::*; pub use dict::*;
pub use parser::*;

crates/typos/src/parser.rs (new file, 164 lines)

@@ -0,0 +1,164 @@
+use crate::tokens;
+use crate::Dictionary;
+
+#[derive(Clone)]
+pub struct ParserBuilder<'p, 'd> {
+    tokenizer: Option<&'p tokens::Tokenizer>,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p> ParserBuilder<'p, 'static> {
+    pub fn new() -> Self {
+        Default::default()
+    }
+}
+
+impl<'p, 'd> ParserBuilder<'p, 'd> {
+    pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
+        self.tokenizer = Some(tokenizer);
+        self
+    }
+
+    pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
+        ParserBuilder {
+            tokenizer: self.tokenizer,
+            dictionary: dictionary,
+        }
+    }
+
+    pub fn typos(&self) -> TyposParser<'p, 'd> {
+        TyposParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+            dictionary: self.dictionary,
+        }
+    }
+
+    pub fn identifiers(&self) -> IdentifiersParser<'p> {
+        IdentifiersParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+
+    pub fn words(&self) -> WordsParser<'p> {
+        WordsParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+}
+
+impl<'p> Default for ParserBuilder<'p, 'static> {
+    fn default() -> Self {
+        Self {
+            tokenizer: None,
+            dictionary: &crate::NullDictionary,
+        }
+    }
+}
+
+static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
+    once_cell::sync::Lazy::new(|| tokens::Tokenizer::new());
+
+#[derive(Clone)]
+pub struct TyposParser<'p, 'd> {
+    tokenizer: &'p tokens::Tokenizer,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p, 'd> TyposParser<'p, 'd> {
+    pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_str(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_bytes(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    fn process_ident<'i, 's: 'i>(
+        &'s self,
+        ident: tokens::Identifier<'i>,
+    ) -> impl Iterator<Item = Typo<'i>> {
+        match self.dictionary.correct_ident(ident) {
+            Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: ident.offset(),
+                    typo: ident.token(),
+                    corrections,
+                };
+                itertools::Either::Left(Some(typo).into_iter())
+            }
+            None => itertools::Either::Right(
+                ident
+                    .split()
+                    .filter_map(move |word| self.process_word(word)),
+            ),
+        }
+    }
+
+    fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
+        match self.dictionary.correct_word(word) {
+            Some(crate::Status::Valid) => None,
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: word.offset(),
+                    typo: word.token(),
+                    corrections,
+                };
+                Some(typo)
+            }
+            None => None,
+        }
+    }
+}
+
+#[derive(Clone, Debug, derive_setters::Setters)]
+#[non_exhaustive]
+pub struct Typo<'m> {
+    pub byte_offset: usize,
+    pub typo: &'m str,
+    pub corrections: crate::Status<'m>,
+}
+
+impl<'m> Default for Typo<'m> {
+    fn default() -> Self {
+        Self {
+            byte_offset: 0,
+            typo: "",
+            corrections: crate::Status::Invalid,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct IdentifiersParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> IdentifiersParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_str(buffer)
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_bytes(buffer)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct WordsParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> WordsParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_str(buffer).flat_map(|i| i.split())
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_bytes(buffer).flat_map(|i| i.split())
+    }
+}
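
ParserBuilder defaults its dictionary to the crate-private NullDictionary added in dict.rs above, so the identifier and word parsers work with no dictionary configured; dictionary() returns a fresh builder rather than mutating in place, which is what lets the 'd lifetime change. A hedged sketch of the dictionary-free path (the input string is illustrative):

    let tokenizer = tokens::Tokenizer::new();
    // Plain tokenization needs no dictionary:
    let words = ParserBuilder::new().tokenizer(&tokenizer).words();
    for word in words.parse_str("hello_wrold") {
        // Word carries its text and byte offset, as TyposParser uses above.
        println!("{} @ {}", word.token(), word.offset());
    }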

crates/typos/src/report.rs

@@ -168,7 +168,7 @@ pub struct Parse<'m> {
     #[serde(flatten)]
     pub context: Option<Context<'m>>,
     pub kind: ParseKind,
-    pub data: Vec<&'m str>,
+    pub data: &'m str,
 }
 
 impl<'m> Default for Parse<'m> {
@@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> {
         Self {
             context: None,
             kind: ParseKind::Identifier,
-            data: vec![],
+            data: "",
         }
     }
 }
@@ -265,7 +265,7 @@ impl Report for PrintBrief {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
@@ -289,7 +289,7 @@ impl Report for PrintLong {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
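
Because report::Parse now carries a single &str instead of a Vec, each parsed token becomes its own message and the checks can stream results instead of collecting them first. A sketch of constructing one such message under that change (the data value is illustrative):

    // One Parse message per token; PrintBrief and PrintLong write msg.data directly.
    let msg = report::Parse {
        context: None,
        kind: report::ParseKind::Word,
        data: "wrold",
    };
    reporter.report(msg.into())?; // inside a check that returns io::Result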