From 663eb94d32a980ede97039a67cbe416e736f06b1 Mon Sep 17 00:00:00 2001
From: Ed Page <epage@duosecurity.com>
Date: Thu, 31 Dec 2020 17:41:32 -0600
Subject: [PATCH] refactor: Switch Typos to check_file

---
 benches/checks.rs | 236 +++++++++++++++---------------
 src/checks.rs     | 358 ++++++++++++----------------------------------
 2 files changed, 212 insertions(+), 382 deletions(-)

diff --git a/benches/checks.rs b/benches/checks.rs
index 3379392..dcb8dc2 100644
--- a/benches/checks.rs
+++ b/benches/checks.rs
@@ -7,129 +7,178 @@ mod data;
 use assert_fs::prelude::*;
 use typos_cli::checks::Check;
 
-fn bench_parse_ident_str(data: &str, b: &mut test::Bencher) {
+fn bench_files(data: &str, b: &mut test::Bencher) {
+    let temp = assert_fs::TempDir::new().unwrap();
+    let sample_path = temp.child("sample");
+    sample_path.write_str(data).unwrap();
+
     let corrections = typos_cli::dict::BuiltIn::new(Default::default());
     let parser = typos::tokens::Tokenizer::new();
-    let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser();
-    b.iter(|| checks.check_str(data, &parser, &corrections, &typos_cli::report::PrintSilent));
-}
-
-#[bench]
-fn parse_idents_empty_str(b: &mut test::Bencher) {
-    bench_parse_ident_str(data::EMPTY, b);
-}
-
-#[bench]
-fn parse_idents_no_tokens_str(b: &mut test::Bencher) {
-    bench_parse_ident_str(data::NO_TOKENS, b);
-}
-
-#[bench]
-fn parse_idents_single_token_str(b: &mut test::Bencher) {
-    bench_parse_ident_str(data::SINGLE_TOKEN, b);
-}
-
-#[bench]
-fn parse_idents_sherlock_str(b: &mut test::Bencher) {
-    bench_parse_ident_str(data::SHERLOCK, b);
-}
-
-#[bench]
-fn parse_idents_code_str(b: &mut test::Bencher) {
-    bench_parse_ident_str(data::CODE, b);
-}
-
-#[bench]
-fn parse_idents_corpus_str(b: &mut test::Bencher) {
-    bench_parse_ident_str(data::CORPUS, b);
-}
-
-fn bench_parse_ident_bytes(data: &str, b: &mut test::Bencher) {
-    let corrections = typos_cli::dict::BuiltIn::new(Default::default());
-    let parser = typos::tokens::Tokenizer::new();
-    let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser();
+    let checks = typos_cli::checks::TyposSettings::new().build_files();
     b.iter(|| {
-        checks.check_bytes(
-            data.as_bytes(),
+        checks.check_file(
+            sample_path.path(),
+            true,
             &parser,
             &corrections,
             &typos_cli::report::PrintSilent,
         )
     });
+
+    temp.close().unwrap();
 }
 
 #[bench]
-fn parse_idents_empty_bytes(b: &mut test::Bencher) {
-    bench_parse_ident_bytes(data::EMPTY, b);
+fn files_empty(b: &mut test::Bencher) {
+    bench_files(data::EMPTY, b);
 }
 
 #[bench]
-fn parse_idents_no_tokens_bytes(b: &mut test::Bencher) {
-    bench_parse_ident_bytes(data::NO_TOKENS, b);
+fn files_no_tokens(b: &mut test::Bencher) {
+    bench_files(data::NO_TOKENS, b);
 }
 
 #[bench]
-fn parse_idents_single_token_bytes(b: &mut test::Bencher) {
-    bench_parse_ident_bytes(data::SINGLE_TOKEN, b);
+fn files_single_token(b: &mut test::Bencher) {
+    bench_files(data::SINGLE_TOKEN, b);
 }
 
 #[bench]
-fn parse_idents_sherlock_bytes(b: &mut test::Bencher) {
-    bench_parse_ident_bytes(data::SHERLOCK, b);
+fn files_sherlock(b: &mut test::Bencher) {
+    bench_files(data::SHERLOCK, b);
 }
 
 #[bench]
-fn parse_idents_code_bytes(b: &mut test::Bencher) {
-    bench_parse_ident_bytes(data::CODE, b);
+fn files_code(b: &mut test::Bencher) {
+    bench_files(data::CODE, b);
 }
 
 #[bench]
-fn parse_idents_corpus_bytes(b: &mut test::Bencher) {
-    bench_parse_ident_bytes(data::CORPUS, b);
+fn files_corpus(b: &mut test::Bencher) {
+    bench_files(data::CORPUS, b);
 }
 
-fn bench_parse_word_str(data: &str, b: &mut test::Bencher) {
+fn bench_identifiers(data: &str, b: &mut test::Bencher) {
+    let temp = assert_fs::TempDir::new().unwrap();
+    let sample_path = temp.child("sample");
+    sample_path.write_str(data).unwrap();
+
+    let corrections = typos_cli::dict::BuiltIn::new(Default::default());
+    let parser = typos::tokens::Tokenizer::new();
+    let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser();
+    b.iter(|| {
+        checks.check_file(
+            sample_path.path(),
+            true,
+            &parser,
+            &corrections,
+            &typos_cli::report::PrintSilent,
+        )
+    });
+
+    temp.close().unwrap();
+}
+
+#[bench]
+fn identifiers_empty(b: &mut test::Bencher) {
+    bench_identifiers(data::EMPTY, b);
+}
+
+#[bench]
+fn identifiers_no_tokens(b: &mut test::Bencher) {
+    bench_identifiers(data::NO_TOKENS, b);
+}
+
+#[bench]
+fn identifiers_single_token(b: &mut test::Bencher) {
+    bench_identifiers(data::SINGLE_TOKEN, b);
+}
+
+#[bench]
+fn identifiers_sherlock(b: &mut test::Bencher) {
+    bench_identifiers(data::SHERLOCK, b);
+}
+
+#[bench]
+fn identifiers_code(b: &mut test::Bencher) {
+    bench_identifiers(data::CODE, b);
+}
+
+#[bench]
+fn identifiers_corpus(b: &mut test::Bencher) {
+    bench_identifiers(data::CORPUS, b);
+}
+
+fn bench_words(data: &str, b: &mut test::Bencher) {
+    let temp = assert_fs::TempDir::new().unwrap();
+    let sample_path = temp.child("sample");
+    sample_path.write_str(data).unwrap();
+
     let corrections = typos_cli::dict::BuiltIn::new(Default::default());
     let parser = typos::tokens::Tokenizer::new();
     let checks = typos_cli::checks::TyposSettings::new().build_word_parser();
-    b.iter(|| checks.check_str(data, &parser, &corrections, &typos_cli::report::PrintSilent));
+    b.iter(|| {
+        checks.check_file(
+            sample_path.path(),
+            true,
+            &parser,
+            &corrections,
+            &typos_cli::report::PrintSilent,
+        )
+    });
+
+    temp.close().unwrap();
 }
 
 #[bench]
-fn parse_words_empty(b: &mut test::Bencher) {
-    bench_parse_word_str(data::EMPTY, b);
+fn words_empty(b: &mut test::Bencher) {
+    bench_words(data::EMPTY, b);
 }
 
 #[bench]
-fn parse_words_no_tokens(b: &mut test::Bencher) {
-    bench_parse_word_str(data::NO_TOKENS, b);
+fn words_no_tokens(b: &mut test::Bencher) {
+    bench_words(data::NO_TOKENS, b);
 }
 
 #[bench]
-fn parse_words_single_token(b: &mut test::Bencher) {
-    bench_parse_word_str(data::SINGLE_TOKEN, b);
+fn words_single_token(b: &mut test::Bencher) {
+    bench_words(data::SINGLE_TOKEN, b);
 }
 
 #[bench]
-fn parse_words_sherlock(b: &mut test::Bencher) {
-    bench_parse_word_str(data::SHERLOCK, b);
+fn words_sherlock(b: &mut test::Bencher) {
+    bench_words(data::SHERLOCK, b);
 }
 
 #[bench]
-fn parse_words_code(b: &mut test::Bencher) {
-    bench_parse_word_str(data::CODE, b);
+fn words_code(b: &mut test::Bencher) {
+    bench_words(data::CODE, b);
 }
 
 #[bench]
-fn parse_words_corpus(b: &mut test::Bencher) {
-    bench_parse_word_str(data::CORPUS, b);
+fn words_corpus(b: &mut test::Bencher) {
+    bench_words(data::CORPUS, b);
 }
 
 fn bench_typos(data: &str, b: &mut test::Bencher) {
+    let temp = assert_fs::TempDir::new().unwrap();
+    let sample_path = temp.child("sample");
+    sample_path.write_str(data).unwrap();
+
     let corrections = typos_cli::dict::BuiltIn::new(Default::default());
     let parser = typos::tokens::Tokenizer::new();
     let checks = typos_cli::checks::TyposSettings::new().build_typos();
-    b.iter(|| checks.check_str(data, &parser, &corrections, &typos_cli::report::PrintSilent));
+    b.iter(|| {
+        checks.check_file(
+            sample_path.path(),
+            true,
+            &parser,
+            &corrections,
+            &typos_cli::report::PrintSilent,
+        )
+    });
+
+    temp.close().unwrap();
 }
 
 #[bench]
@@ -161,54 +210,3 @@ fn typos_code(b: &mut test::Bencher) {
 fn typos_corpus(b: &mut test::Bencher) {
     bench_typos(data::CORPUS, b);
 }
-
-fn bench_check_file(data: &str, b: &mut test::Bencher) {
-    let temp = assert_fs::TempDir::new().unwrap();
-    let sample_path = temp.child("sample");
-    sample_path.write_str(data).unwrap();
-
-    let corrections = typos_cli::dict::BuiltIn::new(Default::default());
-    let parser = typos::tokens::Tokenizer::new();
-    let checks = typos_cli::checks::TyposSettings::new().build_typos();
-    b.iter(|| {
-        checks.check_file_content(
-            sample_path.path(),
-            true,
-            &parser,
-            &corrections,
-            &typos_cli::report::PrintSilent,
-        )
-    });
-
-    temp.close().unwrap();
-}
-
-#[bench]
-fn check_file_empty(b: &mut test::Bencher) {
-    bench_check_file(data::EMPTY, b);
-}
-
-#[bench]
-fn check_file_no_tokens(b: &mut test::Bencher) {
-    bench_check_file(data::NO_TOKENS, b);
-}
-
-#[bench]
-fn check_file_single_token(b: &mut test::Bencher) {
-    bench_check_file(data::SINGLE_TOKEN, b);
-}
-
-#[bench]
-fn check_file_sherlock(b: &mut test::Bencher) {
-    bench_check_file(data::SHERLOCK, b);
-}
-
-#[bench]
-fn check_file_code(b: &mut test::Bencher) {
-    bench_check_file(data::CODE, b);
-}
-
-#[bench]
-fn check_file_corpus(b: &mut test::Bencher) {
-    bench_check_file(data::CORPUS, b);
-}
diff --git a/src/checks.rs b/src/checks.rs
index ea8e5cb..65d7e48 100644
--- a/src/checks.rs
+++ b/src/checks.rs
@@ -5,82 +5,6 @@ use typos::tokens;
 use typos::Dictionary;
 
 pub trait Check: Send + Sync {
-    fn check_str(
-        &self,
-        buffer: &str,
-        parser: &tokens::Tokenizer,
-        dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error>;
-
-    fn check_bytes(
-        &self,
-        buffer: &[u8],
-        parser: &tokens::Tokenizer,
-        dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error>;
-
-    fn check_filenames(&self) -> bool;
-
-    fn check_files(&self) -> bool;
-
-    fn binary(&self) -> bool;
-
-    fn check_filename(
-        &self,
-        path: &std::path::Path,
-        parser: &tokens::Tokenizer,
-        dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        if !self.check_filenames() {
-            return Ok(());
-        }
-
-        if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
-            let context_reporter = ReportContext {
-                reporter,
-                context: report::PathContext { path }.into(),
-            };
-            self.check_str(file_name, parser, dictionary, &context_reporter)?;
-        }
-
-        Ok(())
-    }
-
-    fn check_file_content(
-        &self,
-        path: &std::path::Path,
-        explicit: bool,
-        parser: &tokens::Tokenizer,
-        dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        if !self.check_files() {
-            return Ok(());
-        }
-
-        let buffer = read_file(path, reporter)?;
-        let (buffer, content_type) = massage_data(buffer)?;
-        if !explicit && !self.binary() && content_type.is_binary() {
-            let msg = report::BinaryFile { path };
-            reporter.report(msg.into())?;
-            return Ok(());
-        }
-
-        for (line_idx, line) in buffer.lines().enumerate() {
-            let line_num = line_idx + 1;
-            let context_reporter = ReportContext {
-                reporter,
-                context: report::FileContext { path, line_num }.into(),
-            };
-            self.check_bytes(line, parser, dictionary, &context_reporter)?;
-        }
-
-        Ok(())
-    }
-
     fn check_file(
         &self,
         path: &std::path::Path,
@@ -88,23 +12,7 @@ pub trait Check: Send + Sync {
         parser: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        self.check_filename(path, parser, dictionary, reporter)?;
-        self.check_file_content(path, explicit, parser, dictionary, reporter)?;
-        Ok(())
-    }
-}
-
-struct ReportContext<'m, 'r> {
-    reporter: &'r dyn report::Report,
-    context: report::Context<'m>,
-}
-
-impl<'m, 'r> report::Report for ReportContext<'m, 'r> {
-    fn report(&self, msg: report::Message) -> Result<(), std::io::Error> {
-        let msg = msg.context(Some(self.context.clone()));
-        self.reporter.report(msg)
-    }
+    ) -> Result<(), std::io::Error>;
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -183,9 +91,10 @@ pub struct Typos {
 }
 
 impl Check for Typos {
-    fn check_str(
+    fn check_file(
         &self,
-        buffer: &str,
+        path: &std::path::Path,
+        explicit: bool,
         tokenizer: &tokens::Tokenizer,
         dictionary: &dyn Dictionary,
         reporter: &dyn report::Report,
@@ -194,54 +103,47 @@ impl Check for Typos {
             .tokenizer(tokenizer)
             .dictionary(dictionary)
             .typos();
-        for typo in parser.parse_str(buffer) {
-            let msg = report::Typo {
-                context: None,
-                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                byte_offset: typo.byte_offset,
-                typo: typo.typo,
-                corrections: typo.corrections,
-            };
-            reporter.report(msg.into())?;
+
+        if self.check_filenames {
+            if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
+                for typo in parser.parse_str(file_name) {
+                    let msg = report::Typo {
+                        context: Some(report::PathContext { path }.into()),
+                        buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()),
+                        byte_offset: typo.byte_offset,
+                        typo: typo.typo,
+                        corrections: typo.corrections,
+                    };
+                    reporter.report(msg.into())?;
+                }
+            }
         }
-        Ok(())
-    }
 
-    fn check_bytes(
-        &self,
-        buffer: &[u8],
-        tokenizer: &tokens::Tokenizer,
-        dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        let parser = typos::ParserBuilder::new()
-            .tokenizer(tokenizer)
-            .dictionary(dictionary)
-            .typos();
-        for typo in parser.parse_bytes(buffer) {
-            let msg = report::Typo {
-                context: None,
-                buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
-                byte_offset: typo.byte_offset,
-                typo: typo.typo,
-                corrections: typo.corrections,
-            };
-            reporter.report(msg.into())?;
+        if self.check_files {
+            let buffer = read_file(path, reporter)?;
+            let (buffer, content_type) = massage_data(buffer)?;
+            if !explicit && !self.binary && content_type.is_binary() {
+                let msg = report::BinaryFile { path };
+                reporter.report(msg.into())?;
+            } else {
+                let mut accum_line_num = AccumulateLineNum::new();
+                for typo in parser.parse_bytes(&buffer) {
+                    let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
+                    let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
+                    let msg = report::Typo {
+                        context: Some(report::FileContext { path, line_num }.into()),
+                        buffer: std::borrow::Cow::Borrowed(line),
+                        byte_offset: line_offset,
+                        typo: typo.typo,
+                        corrections: typo.corrections,
+                    };
+                    reporter.report(msg.into())?;
+                }
+            }
         }
+
         Ok(())
     }
-
-    fn check_filenames(&self) -> bool {
-        self.check_filenames
-    }
-
-    fn check_files(&self) -> bool {
-        self.check_files
-    }
-
-    fn binary(&self) -> bool {
-        self.binary
-    }
 }
 
 #[derive(Debug, Clone)]
@@ -252,26 +154,6 @@ pub struct Identifiers {
 }
 
 impl Check for Identifiers {
-    fn check_str(
-        &self,
-        _buffer: &str,
-        _tokenizer: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_bytes(
-        &self,
-        _buffer: &[u8],
-        _tokenizer: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
     fn check_file(
         &self,
         path: &std::path::Path,
@@ -284,7 +166,7 @@ impl Check for Identifiers {
             .tokenizer(tokenizer)
             .identifiers();
 
-        if self.check_filenames() {
+        if self.check_filenames {
             if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
                 for word in parser.parse_str(file_name) {
                     let msg = report::Parse {
@@ -297,16 +179,20 @@ impl Check for Identifiers {
             }
         }
 
-        if self.check_files() {
+        if self.check_files {
             let buffer = read_file(path, reporter)?;
             let (buffer, content_type) = massage_data(buffer)?;
-            if !explicit && !self.binary() && content_type.is_binary() {
+            if !explicit && !self.binary && content_type.is_binary() {
                 let msg = report::BinaryFile { path };
                 reporter.report(msg.into())?;
             } else {
                 for word in parser.parse_bytes(&buffer) {
+                    // HACK: Don't look up the line_num per entry to better match the performance
+                    // of Typos for comparison purposes.  We don't really get much out of it
+                    // anyway.
+                    let line_num = 0;
                     let msg = report::Parse {
-                        context: Some(report::FileContext { path, line_num: 0 }.into()),
+                        context: Some(report::FileContext { path, line_num }.into()),
                         kind: report::ParseKind::Identifier,
                         data: word.token(),
                     };
@@ -317,18 +203,6 @@ impl Check for Identifiers {
 
         Ok(())
     }
-
-    fn check_filenames(&self) -> bool {
-        self.check_filenames
-    }
-
-    fn check_files(&self) -> bool {
-        self.check_files
-    }
-
-    fn binary(&self) -> bool {
-        self.binary
-    }
 }
 
 #[derive(Debug, Clone)]
@@ -339,26 +213,6 @@ pub struct Words {
 }
 
 impl Check for Words {
-    fn check_str(
-        &self,
-        _buffer: &str,
-        _tokenizer: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_bytes(
-        &self,
-        _buffer: &[u8],
-        _tokenizer: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
     fn check_file(
         &self,
         path: &std::path::Path,
@@ -369,7 +223,7 @@ impl Check for Words {
     ) -> Result<(), std::io::Error> {
         let parser = typos::ParserBuilder::new().tokenizer(tokenizer).words();
 
-        if self.check_filenames() {
+        if self.check_filenames {
             if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
                 for word in parser.parse_str(file_name) {
                     let msg = report::Parse {
@@ -382,16 +236,20 @@ impl Check for Words {
             }
         }
 
-        if self.check_files() {
+        if self.check_files {
             let buffer = read_file(path, reporter)?;
             let (buffer, content_type) = massage_data(buffer)?;
-            if !explicit && !self.binary() && content_type.is_binary() {
+            if !explicit && !self.binary && content_type.is_binary() {
                 let msg = report::BinaryFile { path };
                 reporter.report(msg.into())?;
             } else {
                 for word in parser.parse_bytes(&buffer) {
+                    // HACK: Don't look up the line_num per entry to better match the performance
+                    // of Typos for comparison purposes.  We don't really get much out of it
+                    // anyway.
+                    let line_num = 0;
                     let msg = report::Parse {
-                        context: Some(report::FileContext { path, line_num: 0 }.into()),
+                        context: Some(report::FileContext { path, line_num }.into()),
                         kind: report::ParseKind::Word,
                         data: word.token(),
                     };
@@ -402,18 +260,6 @@ impl Check for Words {
 
         Ok(())
     }
-
-    fn check_filenames(&self) -> bool {
-        self.check_filenames
-    }
-
-    fn check_files(&self) -> bool {
-        self.check_files
-    }
-
-    fn binary(&self) -> bool {
-        self.binary
-    }
 }
 
 #[derive(Debug, Clone)]
@@ -422,59 +268,6 @@ pub struct FoundFiles {
 }
 
 impl Check for FoundFiles {
-    fn check_str(
-        &self,
-        _buffer: &str,
-        _parser: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_bytes(
-        &self,
-        _buffer: &[u8],
-        _parser: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_filenames(&self) -> bool {
-        true
-    }
-
-    fn check_files(&self) -> bool {
-        true
-    }
-
-    fn binary(&self) -> bool {
-        self.binary
-    }
-
-    fn check_filename(
-        &self,
-        _path: &std::path::Path,
-        _parser: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_file_content(
-        &self,
-        _path: &std::path::Path,
-        _explicit: bool,
-        _parser: &tokens::Tokenizer,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
     fn check_file(
         &self,
         path: &std::path::Path,
@@ -533,6 +326,45 @@ fn massage_data(
     Ok((buffer, content_type))
 }
 
+struct AccumulateLineNum {
+    line_num: usize,
+    last_offset: usize,
+}
+
+impl AccumulateLineNum {
+    fn new() -> Self {
+        Self {
+            // 1-indexed
+            line_num: 1,
+            last_offset: 0,
+        }
+    }
+
+    fn line_num(&mut self, buffer: &[u8], byte_offset: usize) -> usize {
+        assert!(self.last_offset <= byte_offset);
+        let slice = &buffer[self.last_offset..byte_offset];
+        let newlines = slice.lines().count();
+        let line_num = self.line_num + newlines;
+        self.line_num = line_num;
+        self.last_offset = byte_offset;
+        line_num
+    }
+}
+
+fn extract_line(buffer: &[u8], byte_offset: usize) -> (&[u8], usize) {
+    let line_start = buffer[0..byte_offset]
+        .rfind_byte(b'\n')
+        // Skip the newline
+        .map(|s| s + 1)
+        .unwrap_or(0);
+    let line = buffer[line_start..]
+        .lines()
+        .next()
+        .expect("should always be at least a line");
+    let line_offset = byte_offset - line_start;
+    (line, line_offset)
+}
+
 pub fn check_path(
     walk: ignore::Walk,
     checks: &dyn Check,