From a63dfa0f8c5094969ca60ecc9ff072a5bf8594ff Mon Sep 17 00:00:00 2001 From: Ed Page Date: Fri, 21 Aug 2020 14:28:59 -0500 Subject: [PATCH] perf: Faster binary-file detection This switches us from a homegrown implementation to `content_inspector` - Adds some optimizations by looking for the BOM. - We used the same algorithm for finding Null bytes - `content_inspector` caps how much of the buffer is searched though Besides performance, `content_inspector` also has some known-binary magic numbers to avoid bad detections. Fixes #34 --- Cargo.lock | 10 ++++++++ crates/typos/Cargo.toml | 1 + crates/typos/src/checks.rs | 51 +++++++++++++++++++++++++------------- 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bdf79a3..a763451 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -149,6 +149,15 @@ dependencies = [ "unicase", ] +[[package]] +name = "content_inspector" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7bda66e858c683005a53a9a60c69a4aca7eeaa45d124526e389f7aec8e62f38" +dependencies = [ + "memchr", +] + [[package]] name = "crossbeam-utils" version = "0.7.2" @@ -969,6 +978,7 @@ version = "0.3.0" dependencies = [ "anyhow", "bstr", + "content_inspector", "derive_more 0.99.9", "derive_setters", "itertools", diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml index 816a375..2ef20c7 100644 --- a/crates/typos/Cargo.toml +++ b/crates/typos/Cargo.toml @@ -27,3 +27,4 @@ log = "0.4" unicode-segmentation = "1.6.0" derive_more = "0.99.9" derive_setters = "0.1" +content_inspector = "0.2.4" diff --git a/crates/typos/src/checks.rs b/crates/typos/src/checks.rs index ee32f83..87ccca5 100644 --- a/crates/typos/src/checks.rs +++ b/crates/typos/src/checks.rs @@ -113,10 +113,17 @@ impl ParseIdentifiers { let buffer = std::fs::read(path) .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; - if !explicit && !self.binary && is_binary(&buffer) { - let msg = report::BinaryFile 
{ path }; - reporter.report(msg.into()); - return Ok(typos_found); + if !explicit && !self.binary { + let content_type = content_inspector::inspect(&buffer); + if content_type.is_binary() + // HACK: We only support UTF-8 at the moment + || (content_type != content_inspector::ContentType::UTF_8_BOM + && content_type != content_inspector::ContentType::UTF_8) + { + let msg = report::BinaryFile { path }; + reporter.report(msg.into()); + return Ok(typos_found); + } } for line in buffer.lines() { @@ -182,10 +189,17 @@ impl ParseWords { let buffer = std::fs::read(path) .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; - if !explicit && !self.binary && is_binary(&buffer) { - let msg = report::BinaryFile { path }; - reporter.report(msg.into()); - return Ok(typos_found); + if !explicit && !self.binary { + let content_type = content_inspector::inspect(&buffer); + // HACK: We only support UTF-8 at the moment + if content_type.is_binary() + || (content_type != content_inspector::ContentType::UTF_8_BOM + && content_type != content_inspector::ContentType::UTF_8) + { + let msg = report::BinaryFile { path }; + reporter.report(msg.into()); + return Ok(typos_found); + } } for line in buffer.lines() { @@ -274,10 +288,18 @@ impl Checks { let buffer = std::fs::read(path) .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; - if !explicit && !self.binary && is_binary(&buffer) { - let msg = report::BinaryFile { path }; - reporter.report(msg.into()); - return Ok(typos_found); + if !explicit && !self.binary { + let content_type = content_inspector::inspect(&buffer); + // HACK: We only support UTF-8 at the moment + if content_type.is_binary() + || (content_type != content_inspector::ContentType::UTF_8_BOM + && content_type != content_inspector::ContentType::UTF_8) + { + // HACK: we don't support alternative encodings atm + let msg = report::BinaryFile { path }; + reporter.report(msg.into()); + return Ok(typos_found); + } } for (line_idx, line) in 
buffer.lines().enumerate() { @@ -318,8 +340,3 @@ impl Checks { Ok(typos_found) } } - -fn is_binary(buffer: &[u8]) -> bool { - let null_max = std::cmp::min(buffer.len(), 1024); - buffer[0..null_max].find_byte(b'\0').is_some() -}