refactor(typos): Decouple parsing from checks

Ed Page 2020-12-30 18:58:35 -06:00
parent 1e64080c05
commit e741f96de3
5 changed files with 250 additions and 111 deletions
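In short: the per-identifier/per-word matching loop that every Check impl carried is extracted into a reusable parsing layer (crates/typos/src/parser.rs); the checks now just build a parser and forward its results to the reporter. A minimal sketch of the new call side, assuming use from outside the crate (`MyDict` is a hypothetical stand-in for any `typos::Dictionary` impl; `ParserBuilder`, `typos()`, and the `Typo` fields are introduced by this commit):

    let tokenizer = typos::tokens::Tokenizer::new();
    let dictionary = MyDict::default(); // hypothetical Dictionary impl
    let parser = typos::ParserBuilder::new()
        .tokenizer(&tokenizer)
        .dictionary(&dictionary)
        .typos();
    for typo in parser.parse_str("teh quick brown fox") {
        // Typo exposes byte_offset, typo, and corrections (a Status,
        // printed here with {:?} assuming it derives Debug).
        println!("{}: {} -> {:?}", typo.byte_offset, typo.typo, typo.corrections);
    }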


@@ -3,7 +3,6 @@ use bstr::ByteSlice;
 use crate::report;
 use crate::tokens;
 use crate::Dictionary;
-use crate::Status;
 
 pub trait Check: Send + Sync {
     fn check_str(
@@ -172,44 +171,23 @@ impl Check for Typos {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_str(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
-        }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_str(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
+        }
         Ok(())
     }
@@ -217,46 +195,24 @@ impl Check for Typos {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_bytes(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
-        }
-
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_bytes(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
+        }
         Ok(())
     }
@@ -284,16 +240,19 @@ impl Check for ParseIdentifiers {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_str(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for word in parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -303,16 +262,19 @@ impl Check for ParseIdentifiers {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -343,19 +305,17 @@ impl Check for ParseWords {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_str(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let word_parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in word_parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -365,19 +325,17 @@ impl Check for ParseWords {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_bytes(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }


@@ -47,3 +47,18 @@ pub trait Dictionary: Send + Sync {
 
     fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
 }
+
+pub(crate) struct NullDictionary;
+
+impl Dictionary for NullDictionary {
+    fn correct_ident<'s, 'w>(
+        &'s self,
+        _ident: crate::tokens::Identifier<'w>,
+    ) -> Option<Status<'s>> {
+        None
+    }
+
+    fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
+        None
+    }
+}
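NullDictionary is the do-nothing fallback that ParserBuilder (below) uses when no dictionary is configured: answering None for every lookup means "no opinion," so identifiers still fall through to word splitting but nothing is ever reported as a typo.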


@@ -1,7 +1,9 @@
 mod dict;
+mod parser;
 
 pub mod checks;
 pub mod report;
 pub mod tokens;
 
-pub use crate::dict::*;
+pub use dict::*;
+pub use parser::*;

crates/typos/src/parser.rs (new file, 164 lines added)

@@ -0,0 +1,164 @@
+use crate::tokens;
+use crate::Dictionary;
+
+#[derive(Clone)]
+pub struct ParserBuilder<'p, 'd> {
+    tokenizer: Option<&'p tokens::Tokenizer>,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p> ParserBuilder<'p, 'static> {
+    pub fn new() -> Self {
+        Default::default()
+    }
+}
+
+impl<'p, 'd> ParserBuilder<'p, 'd> {
+    pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
+        self.tokenizer = Some(tokenizer);
+        self
+    }
+
+    pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
+        ParserBuilder {
+            tokenizer: self.tokenizer,
+            dictionary: dictionary,
+        }
+    }
+
+    pub fn typos(&self) -> TyposParser<'p, 'd> {
+        TyposParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+            dictionary: self.dictionary,
+        }
+    }
+
+    pub fn identifiers(&self) -> IdentifiersParser<'p> {
+        IdentifiersParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+
+    pub fn words(&self) -> WordsParser<'p> {
+        WordsParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+}
+
+impl<'p> Default for ParserBuilder<'p, 'static> {
+    fn default() -> Self {
+        Self {
+            tokenizer: None,
+            dictionary: &crate::NullDictionary,
+        }
+    }
+}
+
+static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
+    once_cell::sync::Lazy::new(|| tokens::Tokenizer::new());
+
+#[derive(Clone)]
+pub struct TyposParser<'p, 'd> {
+    tokenizer: &'p tokens::Tokenizer,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p, 'd> TyposParser<'p, 'd> {
+    pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_str(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_bytes(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    fn process_ident<'i, 's: 'i>(
+        &'s self,
+        ident: tokens::Identifier<'i>,
+    ) -> impl Iterator<Item = Typo<'i>> {
+        match self.dictionary.correct_ident(ident) {
+            Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: ident.offset(),
+                    typo: ident.token(),
+                    corrections,
+                };
+                itertools::Either::Left(Some(typo).into_iter())
+            }
+            None => itertools::Either::Right(
+                ident
+                    .split()
+                    .filter_map(move |word| self.process_word(word)),
+            ),
+        }
+    }
+
+    fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
+        match self.dictionary.correct_word(word) {
+            Some(crate::Status::Valid) => None,
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: word.offset(),
+                    typo: word.token(),
+                    corrections,
+                };
+                Some(typo)
+            }
+            None => None,
+        }
+    }
+}
+
+#[derive(Clone, Debug, derive_setters::Setters)]
+#[non_exhaustive]
+pub struct Typo<'m> {
+    pub byte_offset: usize,
+    pub typo: &'m str,
+    pub corrections: crate::Status<'m>,
+}
+
+impl<'m> Default for Typo<'m> {
+    fn default() -> Self {
+        Self {
+            byte_offset: 0,
+            typo: "",
+            corrections: crate::Status::Invalid,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct IdentifiersParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> IdentifiersParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_str(buffer)
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_bytes(buffer)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct WordsParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> WordsParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_str(buffer).flat_map(|i| i.split())
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_bytes(buffer).flat_map(|i| i.split())
+    }
+}
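One detail worth calling out in process_ident above: its two arms yield different concrete iterator types (an Option turned into an iterator vs. a filter_map over the split words), and wrapping them in itertools::Either, which implements Iterator whenever both sides do, lets the function still return a single impl Iterator. A standalone sketch of the same pattern, not taken from this commit:

    use itertools::Either;

    // Returns one opaque iterator type even though the two branches
    // build different concrete iterators.
    fn evens_or_first(xs: Vec<i32>, only_first: bool) -> impl Iterator<Item = i32> {
        if only_first {
            Either::Left(xs.into_iter().take(1))
        } else {
            Either::Right(xs.into_iter().filter(|x| x % 2 == 0))
        }
    }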


@@ -168,7 +168,7 @@ pub struct Parse<'m> {
     #[serde(flatten)]
     pub context: Option<Context<'m>>,
     pub kind: ParseKind,
-    pub data: Vec<&'m str>,
+    pub data: &'m str,
 }
 
 impl<'m> Default for Parse<'m> {
@@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> {
         Self {
             context: None,
             kind: ParseKind::Identifier,
-            data: vec![],
+            data: "",
         }
     }
 }
@@ -265,7 +265,7 @@ impl Report for PrintBrief {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
@@ -289,7 +289,7 @@ impl Report for PrintLong {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
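Net effect on reporting: Parse.data is now a single token, so PrintBrief and PrintLong emit one line per parsed token instead of space-joining everything found in a buffer. For a hypothetical input identifier hello_world, the words report now prints

    hello
    world

where it previously printed "hello world" on one line.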