mirror of
https://github.com/crate-ci/typos.git
synced 2025-01-09 00:04:49 -05:00
refactor(typos): Decouple parsing from checks
This commit is contained in:
parent
1e64080c05
commit
e741f96de3
5 changed files with 250 additions and 111 deletions
|
@ -3,7 +3,6 @@ use bstr::ByteSlice;
|
|||
use crate::report;
|
||||
use crate::tokens;
|
||||
use crate::Dictionary;
|
||||
use crate::Status;
|
||||
|
||||
pub trait Check: Send + Sync {
|
||||
fn check_str(
|
||||
|
@ -172,44 +171,23 @@ impl Check for Typos {
|
|||
fn check_str(
|
||||
&self,
|
||||
buffer: &str,
|
||||
parser: &tokens::Tokenizer,
|
||||
tokenizer: &tokens::Tokenizer,
|
||||
dictionary: &dyn Dictionary,
|
||||
reporter: &dyn report::Report,
|
||||
) -> Result<(), std::io::Error> {
|
||||
for ident in parser.parse_str(buffer) {
|
||||
match dictionary.correct_ident(ident) {
|
||||
Some(Status::Valid) => {}
|
||||
Some(corrections) => {
|
||||
let byte_offset = ident.offset();
|
||||
let msg = report::Typo {
|
||||
context: None,
|
||||
buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
|
||||
byte_offset,
|
||||
typo: ident.token(),
|
||||
corrections,
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
None => {
|
||||
for word in ident.split() {
|
||||
match dictionary.correct_word(word) {
|
||||
Some(Status::Valid) => {}
|
||||
Some(corrections) => {
|
||||
let byte_offset = word.offset();
|
||||
let msg = report::Typo {
|
||||
context: None,
|
||||
buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
|
||||
byte_offset,
|
||||
typo: word.token(),
|
||||
corrections,
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let parser = crate::ParserBuilder::new()
|
||||
.tokenizer(tokenizer)
|
||||
.dictionary(dictionary)
|
||||
.typos();
|
||||
for typo in parser.parse_str(buffer) {
|
||||
let msg = report::Typo {
|
||||
context: None,
|
||||
buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
|
||||
byte_offset: typo.byte_offset,
|
||||
typo: typo.typo,
|
||||
corrections: typo.corrections,
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
@ -217,46 +195,24 @@ impl Check for Typos {
|
|||
fn check_bytes(
|
||||
&self,
|
||||
buffer: &[u8],
|
||||
parser: &tokens::Tokenizer,
|
||||
tokenizer: &tokens::Tokenizer,
|
||||
dictionary: &dyn Dictionary,
|
||||
reporter: &dyn report::Report,
|
||||
) -> Result<(), std::io::Error> {
|
||||
for ident in parser.parse_bytes(buffer) {
|
||||
match dictionary.correct_ident(ident) {
|
||||
Some(Status::Valid) => {}
|
||||
Some(corrections) => {
|
||||
let byte_offset = ident.offset();
|
||||
let msg = report::Typo {
|
||||
context: None,
|
||||
buffer: std::borrow::Cow::Borrowed(buffer),
|
||||
byte_offset,
|
||||
typo: ident.token(),
|
||||
corrections,
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
None => {
|
||||
for word in ident.split() {
|
||||
match dictionary.correct_word(word) {
|
||||
Some(Status::Valid) => {}
|
||||
Some(corrections) => {
|
||||
let byte_offset = word.offset();
|
||||
let msg = report::Typo {
|
||||
context: None,
|
||||
buffer: std::borrow::Cow::Borrowed(buffer),
|
||||
byte_offset,
|
||||
typo: word.token(),
|
||||
corrections,
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let parser = crate::ParserBuilder::new()
|
||||
.tokenizer(tokenizer)
|
||||
.dictionary(dictionary)
|
||||
.typos();
|
||||
for typo in parser.parse_bytes(buffer) {
|
||||
let msg = report::Typo {
|
||||
context: None,
|
||||
buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
|
||||
byte_offset: typo.byte_offset,
|
||||
typo: typo.typo,
|
||||
corrections: typo.corrections,
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -284,16 +240,19 @@ impl Check for ParseIdentifiers {
|
|||
fn check_str(
|
||||
&self,
|
||||
buffer: &str,
|
||||
parser: &tokens::Tokenizer,
|
||||
tokenizer: &tokens::Tokenizer,
|
||||
_dictionary: &dyn Dictionary,
|
||||
reporter: &dyn report::Report,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Identifier,
|
||||
data: parser.parse_str(buffer).map(|i| i.token()).collect(),
|
||||
};
|
||||
if !msg.data.is_empty() {
|
||||
let parser = crate::ParserBuilder::new()
|
||||
.tokenizer(tokenizer)
|
||||
.identifiers();
|
||||
for word in parser.parse_str(buffer) {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Word,
|
||||
data: word.token(),
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
|
||||
|
@ -303,16 +262,19 @@ impl Check for ParseIdentifiers {
|
|||
fn check_bytes(
|
||||
&self,
|
||||
buffer: &[u8],
|
||||
parser: &tokens::Tokenizer,
|
||||
tokenizer: &tokens::Tokenizer,
|
||||
_dictionary: &dyn Dictionary,
|
||||
reporter: &dyn report::Report,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Identifier,
|
||||
data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
|
||||
};
|
||||
if !msg.data.is_empty() {
|
||||
let parser = crate::ParserBuilder::new()
|
||||
.tokenizer(tokenizer)
|
||||
.identifiers();
|
||||
for word in parser.parse_bytes(buffer) {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Word,
|
||||
data: word.token(),
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
|
||||
|
@ -343,19 +305,17 @@ impl Check for ParseWords {
|
|||
fn check_str(
|
||||
&self,
|
||||
buffer: &str,
|
||||
parser: &tokens::Tokenizer,
|
||||
tokenizer: &tokens::Tokenizer,
|
||||
_dictionary: &dyn Dictionary,
|
||||
reporter: &dyn report::Report,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Word,
|
||||
data: parser
|
||||
.parse_str(buffer)
|
||||
.flat_map(|ident| ident.split().map(|i| i.token()))
|
||||
.collect(),
|
||||
};
|
||||
if !msg.data.is_empty() {
|
||||
let word_parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
|
||||
for word in word_parser.parse_str(buffer) {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Word,
|
||||
data: word.token(),
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
|
||||
|
@ -365,19 +325,17 @@ impl Check for ParseWords {
|
|||
fn check_bytes(
|
||||
&self,
|
||||
buffer: &[u8],
|
||||
parser: &tokens::Tokenizer,
|
||||
tokenizer: &tokens::Tokenizer,
|
||||
_dictionary: &dyn Dictionary,
|
||||
reporter: &dyn report::Report,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Word,
|
||||
data: parser
|
||||
.parse_bytes(buffer)
|
||||
.flat_map(|ident| ident.split().map(|i| i.token()))
|
||||
.collect(),
|
||||
};
|
||||
if !msg.data.is_empty() {
|
||||
let parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
|
||||
for word in parser.parse_bytes(buffer) {
|
||||
let msg = report::Parse {
|
||||
context: None,
|
||||
kind: report::ParseKind::Word,
|
||||
data: word.token(),
|
||||
};
|
||||
reporter.report(msg.into())?;
|
||||
}
|
||||
|
||||
|
|
|
@ -47,3 +47,18 @@ pub trait Dictionary: Send + Sync {
|
|||
|
||||
fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
|
||||
}
|
||||
|
||||
pub(crate) struct NullDictionary;
|
||||
|
||||
impl Dictionary for NullDictionary {
|
||||
fn correct_ident<'s, 'w>(
|
||||
&'s self,
|
||||
_ident: crate::tokens::Identifier<'w>,
|
||||
) -> Option<Status<'s>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
mod dict;
|
||||
mod parser;
|
||||
|
||||
pub mod checks;
|
||||
pub mod report;
|
||||
pub mod tokens;
|
||||
|
||||
pub use crate::dict::*;
|
||||
pub use dict::*;
|
||||
pub use parser::*;
|
||||
|
|
164
crates/typos/src/parser.rs
Normal file
164
crates/typos/src/parser.rs
Normal file
|
@ -0,0 +1,164 @@
|
|||
use crate::tokens;
|
||||
use crate::Dictionary;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ParserBuilder<'p, 'd> {
|
||||
tokenizer: Option<&'p tokens::Tokenizer>,
|
||||
dictionary: &'d dyn Dictionary,
|
||||
}
|
||||
|
||||
impl<'p> ParserBuilder<'p, 'static> {
|
||||
pub fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'p, 'd> ParserBuilder<'p, 'd> {
|
||||
pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
|
||||
self.tokenizer = Some(tokenizer);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
|
||||
ParserBuilder {
|
||||
tokenizer: self.tokenizer,
|
||||
dictionary: dictionary,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn typos(&self) -> TyposParser<'p, 'd> {
|
||||
TyposParser {
|
||||
tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
|
||||
dictionary: self.dictionary,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn identifiers(&self) -> IdentifiersParser<'p> {
|
||||
IdentifiersParser {
|
||||
tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn words(&self) -> WordsParser<'p> {
|
||||
WordsParser {
|
||||
tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'p> Default for ParserBuilder<'p, 'static> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
tokenizer: None,
|
||||
dictionary: &crate::NullDictionary,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
|
||||
once_cell::sync::Lazy::new(|| tokens::Tokenizer::new());
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TyposParser<'p, 'd> {
|
||||
tokenizer: &'p tokens::Tokenizer,
|
||||
dictionary: &'d dyn Dictionary,
|
||||
}
|
||||
|
||||
impl<'p, 'd> TyposParser<'p, 'd> {
|
||||
pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
|
||||
self.tokenizer
|
||||
.parse_str(buffer)
|
||||
.flat_map(move |ident| self.process_ident(ident))
|
||||
}
|
||||
|
||||
pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
|
||||
self.tokenizer
|
||||
.parse_bytes(buffer)
|
||||
.flat_map(move |ident| self.process_ident(ident))
|
||||
}
|
||||
|
||||
fn process_ident<'i, 's: 'i>(
|
||||
&'s self,
|
||||
ident: tokens::Identifier<'i>,
|
||||
) -> impl Iterator<Item = Typo<'i>> {
|
||||
match self.dictionary.correct_ident(ident) {
|
||||
Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
|
||||
Some(corrections) => {
|
||||
let typo = Typo {
|
||||
byte_offset: ident.offset(),
|
||||
typo: ident.token(),
|
||||
corrections,
|
||||
};
|
||||
itertools::Either::Left(Some(typo).into_iter())
|
||||
}
|
||||
None => itertools::Either::Right(
|
||||
ident
|
||||
.split()
|
||||
.filter_map(move |word| self.process_word(word)),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
|
||||
match self.dictionary.correct_word(word) {
|
||||
Some(crate::Status::Valid) => None,
|
||||
Some(corrections) => {
|
||||
let typo = Typo {
|
||||
byte_offset: word.offset(),
|
||||
typo: word.token(),
|
||||
corrections,
|
||||
};
|
||||
Some(typo)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, derive_setters::Setters)]
|
||||
#[non_exhaustive]
|
||||
pub struct Typo<'m> {
|
||||
pub byte_offset: usize,
|
||||
pub typo: &'m str,
|
||||
pub corrections: crate::Status<'m>,
|
||||
}
|
||||
|
||||
impl<'m> Default for Typo<'m> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
byte_offset: 0,
|
||||
typo: "",
|
||||
corrections: crate::Status::Invalid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IdentifiersParser<'p> {
|
||||
tokenizer: &'p tokens::Tokenizer,
|
||||
}
|
||||
|
||||
impl<'p> IdentifiersParser<'p> {
|
||||
pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Identifier<'p>> {
|
||||
self.tokenizer.parse_str(buffer)
|
||||
}
|
||||
|
||||
pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Identifier<'p>> {
|
||||
self.tokenizer.parse_bytes(buffer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WordsParser<'p> {
|
||||
tokenizer: &'p tokens::Tokenizer,
|
||||
}
|
||||
|
||||
impl<'p> WordsParser<'p> {
|
||||
pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Word<'p>> {
|
||||
self.tokenizer.parse_str(buffer).flat_map(|i| i.split())
|
||||
}
|
||||
|
||||
pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Word<'p>> {
|
||||
self.tokenizer.parse_bytes(buffer).flat_map(|i| i.split())
|
||||
}
|
||||
}
|
|
@ -168,7 +168,7 @@ pub struct Parse<'m> {
|
|||
#[serde(flatten)]
|
||||
pub context: Option<Context<'m>>,
|
||||
pub kind: ParseKind,
|
||||
pub data: Vec<&'m str>,
|
||||
pub data: &'m str,
|
||||
}
|
||||
|
||||
impl<'m> Default for Parse<'m> {
|
||||
|
@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> {
|
|||
Self {
|
||||
context: None,
|
||||
kind: ParseKind::Identifier,
|
||||
data: vec![],
|
||||
data: "",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -265,7 +265,7 @@ impl Report for PrintBrief {
|
|||
writeln!(io::stdout(), "{}", msg.path.display())?;
|
||||
}
|
||||
Message::Parse(msg) => {
|
||||
writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
|
||||
writeln!(io::stdout(), "{}", msg.data)?;
|
||||
}
|
||||
Message::Error(msg) => {
|
||||
log::error!("{}: {}", context_display(&msg.context), msg.msg);
|
||||
|
@ -289,7 +289,7 @@ impl Report for PrintLong {
|
|||
writeln!(io::stdout(), "{}", msg.path.display())?;
|
||||
}
|
||||
Message::Parse(msg) => {
|
||||
writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
|
||||
writeln!(io::stdout(), "{}", msg.data)?;
|
||||
}
|
||||
Message::Error(msg) => {
|
||||
log::error!("{}: {}", context_display(&msg.context), msg.msg);
|
||||
|
|
Loading…
Reference in a new issue