mirror of
https://github.com/crate-ci/typos.git
synced 2024-12-22 15:42:23 -05:00
perf: Faster binary-file detection
This switches us from a homegrown implementation to `content_inspector` - Adds some optimizations by looking for the BOM. - We used the same algorithm for finding null bytes - `content_inspector` caps how much of the buffer is searched, though. Besides performance, `content_inspector` also has some known-binary magic numbers to avoid bad detections. Fixes #34
This commit is contained in:
parent
443aa5c4fe
commit
a63dfa0f8c
3 changed files with 45 additions and 17 deletions
10
Cargo.lock
generated
10
Cargo.lock
generated
|
@ -149,6 +149,15 @@ dependencies = [
|
|||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "content_inspector"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7bda66e858c683005a53a9a60c69a4aca7eeaa45d124526e389f7aec8e62f38"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.7.2"
|
||||
|
@ -969,6 +978,7 @@ version = "0.3.0"
|
|||
dependencies = [
|
||||
"anyhow",
|
||||
"bstr",
|
||||
"content_inspector",
|
||||
"derive_more 0.99.9",
|
||||
"derive_setters",
|
||||
"itertools",
|
||||
|
|
|
@ -27,3 +27,4 @@ log = "0.4"
|
|||
unicode-segmentation = "1.6.0"
|
||||
derive_more = "0.99.9"
|
||||
derive_setters = "0.1"
|
||||
content_inspector = "0.2.4"
|
||||
|
|
|
@ -113,10 +113,17 @@ impl ParseIdentifiers {
|
|||
|
||||
let buffer = std::fs::read(path)
|
||||
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
||||
if !explicit && !self.binary && is_binary(&buffer) {
|
||||
let msg = report::BinaryFile { path };
|
||||
reporter.report(msg.into());
|
||||
return Ok(typos_found);
|
||||
if !explicit && !self.binary {
|
||||
let content_type = content_inspector::inspect(&buffer);
|
||||
if content_type.is_binary()
|
||||
// HACK: We only support UTF-8 at the moment
|
||||
|| (content_type != content_inspector::ContentType::UTF_8_BOM
|
||||
&& content_type != content_inspector::ContentType::UTF_8)
|
||||
{
|
||||
let msg = report::BinaryFile { path };
|
||||
reporter.report(msg.into());
|
||||
return Ok(typos_found);
|
||||
}
|
||||
}
|
||||
|
||||
for line in buffer.lines() {
|
||||
|
@ -182,10 +189,17 @@ impl ParseWords {
|
|||
|
||||
let buffer = std::fs::read(path)
|
||||
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
||||
if !explicit && !self.binary && is_binary(&buffer) {
|
||||
let msg = report::BinaryFile { path };
|
||||
reporter.report(msg.into());
|
||||
return Ok(typos_found);
|
||||
if !explicit && !self.binary {
|
||||
let content_type = content_inspector::inspect(&buffer);
|
||||
// HACK: We only support UTF-8 at the moment
|
||||
if content_type.is_binary()
|
||||
|| (content_type != content_inspector::ContentType::UTF_8_BOM
|
||||
&& content_type != content_inspector::ContentType::UTF_8)
|
||||
{
|
||||
let msg = report::BinaryFile { path };
|
||||
reporter.report(msg.into());
|
||||
return Ok(typos_found);
|
||||
}
|
||||
}
|
||||
|
||||
for line in buffer.lines() {
|
||||
|
@ -274,10 +288,18 @@ impl Checks {
|
|||
|
||||
let buffer = std::fs::read(path)
|
||||
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
||||
if !explicit && !self.binary && is_binary(&buffer) {
|
||||
let msg = report::BinaryFile { path };
|
||||
reporter.report(msg.into());
|
||||
return Ok(typos_found);
|
||||
if !explicit && !self.binary {
|
||||
let content_type = content_inspector::inspect(&buffer);
|
||||
// HACK: We only support UTF-8 at the moment
|
||||
if content_type.is_binary()
|
||||
|| (content_type != content_inspector::ContentType::UTF_8_BOM
|
||||
&& content_type != content_inspector::ContentType::UTF_8)
|
||||
{
|
||||
// HACK: we don't support alternative encodings atm
|
||||
let msg = report::BinaryFile { path };
|
||||
reporter.report(msg.into());
|
||||
return Ok(typos_found);
|
||||
}
|
||||
}
|
||||
|
||||
for (line_idx, line) in buffer.lines().enumerate() {
|
||||
|
@ -318,8 +340,3 @@ impl Checks {
|
|||
Ok(typos_found)
|
||||
}
|
||||
}
|
||||
|
||||
/// Heuristic binary-file check: reports whether a NUL byte (`b'\0'`) occurs
/// near the start of `buffer`.
///
/// Only the first 1024 bytes are examined (or the whole buffer when it is
/// shorter), so a NUL byte that appears later does not count.
///
/// Returns `true` when a NUL byte is found in that prefix, `false` otherwise
/// (including for an empty buffer).
///
/// NOTE(review): `find_byte` is not a std slice method; presumably it comes
/// from `bstr`'s `ByteSlice` trait imported elsewhere in the file — confirm.
fn is_binary(buffer: &[u8]) -> bool {
|
||||
// Clamp the scan window so the slice below never indexes past the end.
let null_max = std::cmp::min(buffer.len(), 1024);
|
||||
buffer[0..null_max].find_byte(b'\0').is_some()
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue