refactor(typos): Decouple parsing from checks

Author: Ed Page
Date:   2020-12-30 18:58:35 -06:00
Commit: e741f96de3
Parent: 1e64080c05

5 changed files with 250 additions and 111 deletions
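
The refactor moves the tokenize-and-lookup loop out of each Check impl and into a reusable parser layer (the new crates/typos/src/parser.rs below). A minimal sketch of the resulting call pattern, assuming a tokenizer, dictionary, and buffer are in scope as they are in the checks:

    // Wire the pieces together once; each check below now follows this shape.
    let parser = crate::ParserBuilder::new()
        .tokenizer(tokenizer)    // &tokens::Tokenizer
        .dictionary(dictionary)  // &dyn Dictionary
        .typos();
    // TyposParser yields Typo { byte_offset, typo, corrections } items,
    // already narrowed to actual misspellings.
    for typo in parser.parse_str(buffer) {
        println!("{}: {}", typo.byte_offset, typo.typo);
    }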

crates/typos/src/checks.rs

@@ -3,7 +3,6 @@ use bstr::ByteSlice;
 use crate::report;
 use crate::tokens;
 use crate::Dictionary;
-use crate::Status;
 
 pub trait Check: Send + Sync {
     fn check_str(
@@ -172,44 +171,23 @@ impl Check for Typos {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_str(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_str(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
         }
         Ok(())
     }
@@ -217,46 +195,24 @@ impl Check for Typos {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        for ident in parser.parse_bytes(buffer) {
-            match dictionary.correct_ident(ident) {
-                Some(Status::Valid) => {}
-                Some(corrections) => {
-                    let byte_offset = ident.offset();
-                    let msg = report::Typo {
-                        context: None,
-                        buffer: std::borrow::Cow::Borrowed(buffer),
-                        byte_offset,
-                        typo: ident.token(),
-                        corrections,
-                    };
-                    reporter.report(msg.into())?;
-                }
-                None => {
-                    for word in ident.split() {
-                        match dictionary.correct_word(word) {
-                            Some(Status::Valid) => {}
-                            Some(corrections) => {
-                                let byte_offset = word.offset();
-                                let msg = report::Typo {
-                                    context: None,
-                                    buffer: std::borrow::Cow::Borrowed(buffer),
-                                    byte_offset,
-                                    typo: word.token(),
-                                    corrections,
-                                };
-                                reporter.report(msg.into())?;
-                            }
-                            None => {}
-                        }
-                    }
-                }
-            }
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .dictionary(dictionary)
+            .typos();
+        for typo in parser.parse_bytes(buffer) {
+            let msg = report::Typo {
+                context: None,
+                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
+                byte_offset: typo.byte_offset,
+                typo: typo.typo,
+                corrections: typo.corrections,
+            };
+            reporter.report(msg.into())?;
         }
         Ok(())
     }
@@ -284,16 +240,19 @@ impl Check for ParseIdentifiers {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_str(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for word in parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -303,16 +262,19 @@ impl Check for ParseIdentifiers {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
        _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new()
+            .tokenizer(tokenizer)
+            .identifiers();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -343,19 +305,17 @@ impl Check for ParseWords {
     fn check_str(
         &self,
         buffer: &str,
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_str(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let word_parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in word_parser.parse_str(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }
@@ -365,19 +325,17 @@ impl Check for ParseWords {
     fn check_bytes(
         &self,
         buffer: &[u8],
-        parser: &tokens::Tokenizer,
+        tokenizer: &tokens::Tokenizer,
         _dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
     ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_bytes(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
+        let parser = crate::ParserBuilder::new().tokenizer(tokenizer).words();
+        for word in parser.parse_bytes(buffer) {
+            let msg = report::Parse {
+                context: None,
+                kind: report::ParseKind::Word,
+                data: word.token(),
+            };
             reporter.report(msg.into())?;
         }

crates/typos/src/dict.rs

@ -47,3 +47,18 @@ pub trait Dictionary: Send + Sync {
fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>; fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
} }
pub(crate) struct NullDictionary;
impl Dictionary for NullDictionary {
fn correct_ident<'s, 'w>(
&'s self,
_ident: crate::tokens::Identifier<'w>,
) -> Option<Status<'s>> {
None
}
fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
None
}
}

crates/typos/src/lib.rs

@ -1,7 +1,9 @@
mod dict; mod dict;
mod parser;
pub mod checks; pub mod checks;
pub mod report; pub mod report;
pub mod tokens; pub mod tokens;
pub use crate::dict::*; pub use dict::*;
pub use parser::*;

crates/typos/src/parser.rs (new file, 164 lines)

@@ -0,0 +1,164 @@
+use crate::tokens;
+use crate::Dictionary;
+
+#[derive(Clone)]
+pub struct ParserBuilder<'p, 'd> {
+    tokenizer: Option<&'p tokens::Tokenizer>,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p> ParserBuilder<'p, 'static> {
+    pub fn new() -> Self {
+        Default::default()
+    }
+}
+
+impl<'p, 'd> ParserBuilder<'p, 'd> {
+    pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
+        self.tokenizer = Some(tokenizer);
+        self
+    }
+
+    pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
+        ParserBuilder {
+            tokenizer: self.tokenizer,
+            dictionary: dictionary,
+        }
+    }
+
+    pub fn typos(&self) -> TyposParser<'p, 'd> {
+        TyposParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+            dictionary: self.dictionary,
+        }
+    }
+
+    pub fn identifiers(&self) -> IdentifiersParser<'p> {
+        IdentifiersParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+
+    pub fn words(&self) -> WordsParser<'p> {
+        WordsParser {
+            tokenizer: self.tokenizer.unwrap_or_else(|| &DEFAULT_TOKENIZER),
+        }
+    }
+}
+
+impl<'p> Default for ParserBuilder<'p, 'static> {
+    fn default() -> Self {
+        Self {
+            tokenizer: None,
+            dictionary: &crate::NullDictionary,
+        }
+    }
+}
+
+static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
+    once_cell::sync::Lazy::new(|| tokens::Tokenizer::new());
+
+#[derive(Clone)]
+pub struct TyposParser<'p, 'd> {
+    tokenizer: &'p tokens::Tokenizer,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p, 'd> TyposParser<'p, 'd> {
+    pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_str(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_bytes(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    fn process_ident<'i, 's: 'i>(
+        &'s self,
+        ident: tokens::Identifier<'i>,
+    ) -> impl Iterator<Item = Typo<'i>> {
+        match self.dictionary.correct_ident(ident) {
+            Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: ident.offset(),
+                    typo: ident.token(),
+                    corrections,
+                };
+                itertools::Either::Left(Some(typo).into_iter())
+            }
+            None => itertools::Either::Right(
+                ident
+                    .split()
+                    .filter_map(move |word| self.process_word(word)),
+            ),
+        }
+    }
+
+    fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
+        match self.dictionary.correct_word(word) {
+            Some(crate::Status::Valid) => None,
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: word.offset(),
+                    typo: word.token(),
+                    corrections,
+                };
+                Some(typo)
+            }
+            None => None,
+        }
+    }
+}
+
+#[derive(Clone, Debug, derive_setters::Setters)]
+#[non_exhaustive]
+pub struct Typo<'m> {
+    pub byte_offset: usize,
+    pub typo: &'m str,
+    pub corrections: crate::Status<'m>,
+}
+
+impl<'m> Default for Typo<'m> {
+    fn default() -> Self {
+        Self {
+            byte_offset: 0,
+            typo: "",
+            corrections: crate::Status::Invalid,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct IdentifiersParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> IdentifiersParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_str(buffer)
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Identifier<'p>> {
+        self.tokenizer.parse_bytes(buffer)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct WordsParser<'p> {
+    tokenizer: &'p tokens::Tokenizer,
+}
+
+impl<'p> WordsParser<'p> {
+    pub fn parse_str(&self, buffer: &'p str) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_str(buffer).flat_map(|i| i.split())
+    }
+
+    pub fn parse_bytes(&self, buffer: &'p [u8]) -> impl Iterator<Item = tokens::Word<'p>> {
+        self.tokenizer.parse_bytes(buffer).flat_map(|i| i.split())
+    }
+}
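
ParserBuilder defaults its dictionary to the crate-private NullDictionary added in dict.rs above, so the identifier and word parsers work with no dictionary configured; dictionary() returns a fresh builder rather than mutating in place, which is what lets the 'd lifetime change. A hedged sketch of the dictionary-free path (the input string is illustrative):

    let tokenizer = tokens::Tokenizer::new();
    // Plain tokenization needs no dictionary:
    let words = ParserBuilder::new().tokenizer(&tokenizer).words();
    for word in words.parse_str("hello_wrold") {
        // Word carries its text and byte offset, as TyposParser uses above.
        println!("{} @ {}", word.token(), word.offset());
    }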

crates/typos/src/report.rs

@@ -168,7 +168,7 @@ pub struct Parse<'m> {
     #[serde(flatten)]
     pub context: Option<Context<'m>>,
     pub kind: ParseKind,
-    pub data: Vec<&'m str>,
+    pub data: &'m str,
 }
 
 impl<'m> Default for Parse<'m> {
@@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> {
         Self {
             context: None,
             kind: ParseKind::Identifier,
-            data: vec![],
+            data: "",
         }
     }
 }
@@ -265,7 +265,7 @@ impl Report for PrintBrief {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
@@ -289,7 +289,7 @@ impl Report for PrintLong {
                 writeln!(io::stdout(), "{}", msg.path.display())?;
             }
             Message::Parse(msg) => {
-                writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+                writeln!(io::stdout(), "{}", msg.data)?;
             }
             Message::Error(msg) => {
                 log::error!("{}: {}", context_display(&msg.context), msg.msg);
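
Because report::Parse now carries a single &str instead of a Vec, each parsed token becomes its own message and the checks can stream results instead of collecting them first. A sketch of constructing one such message under that change (the data value is illustrative):

    // One Parse message per token; PrintBrief and PrintLong write msg.data directly.
    let msg = report::Parse {
        context: None,
        kind: report::ParseKind::Word,
        data: "wrold",
    };
    reporter.report(msg.into())?; // inside a check that returns io::Result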