perf: Faster binary-file detection

This switches us from a homegrown implementation to `content_inspector`.
- Adds some optimizations by looking for the BOM (byte order mark).
- We used the same algorithm for finding Null bytes
- `content_inspector` caps how much of the buffer is searched, though

Besides performance, `content_inspector` also has some known-binary
magic numbers to avoid bad detections.

Fixes #34
This commit is contained in:
Ed Page 2020-08-21 14:28:59 -05:00
parent 443aa5c4fe
commit a63dfa0f8c
3 changed files with 45 additions and 17 deletions

10
Cargo.lock generated
View file

@ -149,6 +149,15 @@ dependencies = [
"unicase", "unicase",
] ]
[[package]]
name = "content_inspector"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7bda66e858c683005a53a9a60c69a4aca7eeaa45d124526e389f7aec8e62f38"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "crossbeam-utils" name = "crossbeam-utils"
version = "0.7.2" version = "0.7.2"
@ -969,6 +978,7 @@ version = "0.3.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bstr", "bstr",
"content_inspector",
"derive_more 0.99.9", "derive_more 0.99.9",
"derive_setters", "derive_setters",
"itertools", "itertools",

View file

@ -27,3 +27,4 @@ log = "0.4"
unicode-segmentation = "1.6.0" unicode-segmentation = "1.6.0"
derive_more = "0.99.9" derive_more = "0.99.9"
derive_setters = "0.1" derive_setters = "0.1"
content_inspector = "0.2.4"

View file

@ -113,11 +113,18 @@ impl ParseIdentifiers {
let buffer = std::fs::read(path) let buffer = std::fs::read(path)
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
if !explicit && !self.binary && is_binary(&buffer) { if !explicit && !self.binary {
let content_type = content_inspector::inspect(&buffer);
if content_type.is_binary()
// HACK: We only support UTF-8 at the moment
|| (content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8)
{
let msg = report::BinaryFile { path }; let msg = report::BinaryFile { path };
reporter.report(msg.into()); reporter.report(msg.into());
return Ok(typos_found); return Ok(typos_found);
} }
}
for line in buffer.lines() { for line in buffer.lines() {
let msg = report::Parse { let msg = report::Parse {
@ -182,11 +189,18 @@ impl ParseWords {
let buffer = std::fs::read(path) let buffer = std::fs::read(path)
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
if !explicit && !self.binary && is_binary(&buffer) { if !explicit && !self.binary {
let content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type.is_binary()
|| (content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8)
{
let msg = report::BinaryFile { path }; let msg = report::BinaryFile { path };
reporter.report(msg.into()); reporter.report(msg.into());
return Ok(typos_found); return Ok(typos_found);
} }
}
for line in buffer.lines() { for line in buffer.lines() {
let msg = report::Parse { let msg = report::Parse {
@ -274,11 +288,19 @@ impl Checks {
let buffer = std::fs::read(path) let buffer = std::fs::read(path)
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?; .map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
if !explicit && !self.binary && is_binary(&buffer) { if !explicit && !self.binary {
let content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type.is_binary()
|| (content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8)
{
// HACK: we don't support alternative encodings atm
let msg = report::BinaryFile { path }; let msg = report::BinaryFile { path };
reporter.report(msg.into()); reporter.report(msg.into());
return Ok(typos_found); return Ok(typos_found);
} }
}
for (line_idx, line) in buffer.lines().enumerate() { for (line_idx, line) in buffer.lines().enumerate() {
let line_num = line_idx + 1; let line_num = line_idx + 1;
@ -318,8 +340,3 @@ impl Checks {
Ok(typos_found) Ok(typos_found)
} }
} }
/// Heuristically decides whether `buffer` holds binary (non-text) content.
///
/// Returns `true` when a NUL byte (`b'\0'`) occurs within the first 1024 bytes
/// of `buffer`, or anywhere in `buffer` when it is shorter than that.
/// An empty buffer yields `false`.
fn is_binary(buffer: &[u8]) -> bool {
// Cap the scanned prefix at 1024 bytes; `min` keeps the slice in bounds for short buffers.
let null_max = std::cmp::min(buffer.len(), 1024);
// NOTE(review): `find_byte` is not a std slice method; presumably it comes from the
// `bstr` byte-slice extension trait imported elsewhere in this file — confirm.
buffer[0..null_max].find_byte(b'\0').is_some()
}