mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-22 00:51:11 -05:00
perf: Faster binary-file detection
This switches us from a homegrown implementation to `content_inspector`. - Adds some optimizations by looking for the BOM. - We used the same algorithm for finding null bytes - `content_inspector` caps how much of the buffer is searched, though. Besides performance, `content_inspector` also has some known-binary magic numbers to avoid bad detections. Fixes #34
This commit is contained in:
parent
443aa5c4fe
commit
a63dfa0f8c
3 changed files with 45 additions and 17 deletions
10
Cargo.lock
generated
10
Cargo.lock
generated
|
@ -149,6 +149,15 @@ dependencies = [
|
||||||
"unicase",
|
"unicase",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "content_inspector"
|
||||||
|
version = "0.2.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b7bda66e858c683005a53a9a60c69a4aca7eeaa45d124526e389f7aec8e62f38"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-utils"
|
name = "crossbeam-utils"
|
||||||
version = "0.7.2"
|
version = "0.7.2"
|
||||||
|
@ -969,6 +978,7 @@ version = "0.3.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bstr",
|
"bstr",
|
||||||
|
"content_inspector",
|
||||||
"derive_more 0.99.9",
|
"derive_more 0.99.9",
|
||||||
"derive_setters",
|
"derive_setters",
|
||||||
"itertools",
|
"itertools",
|
||||||
|
|
|
@ -27,3 +27,4 @@ log = "0.4"
|
||||||
unicode-segmentation = "1.6.0"
|
unicode-segmentation = "1.6.0"
|
||||||
derive_more = "0.99.9"
|
derive_more = "0.99.9"
|
||||||
derive_setters = "0.1"
|
derive_setters = "0.1"
|
||||||
|
content_inspector = "0.2.4"
|
||||||
|
|
|
@ -113,11 +113,18 @@ impl ParseIdentifiers {
|
||||||
|
|
||||||
let buffer = std::fs::read(path)
|
let buffer = std::fs::read(path)
|
||||||
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
||||||
if !explicit && !self.binary && is_binary(&buffer) {
|
if !explicit && !self.binary {
|
||||||
|
let content_type = content_inspector::inspect(&buffer);
|
||||||
|
if content_type.is_binary()
|
||||||
|
// HACK: We only support UTF-8 at the moment
|
||||||
|
|| (content_type != content_inspector::ContentType::UTF_8_BOM
|
||||||
|
&& content_type != content_inspector::ContentType::UTF_8)
|
||||||
|
{
|
||||||
let msg = report::BinaryFile { path };
|
let msg = report::BinaryFile { path };
|
||||||
reporter.report(msg.into());
|
reporter.report(msg.into());
|
||||||
return Ok(typos_found);
|
return Ok(typos_found);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for line in buffer.lines() {
|
for line in buffer.lines() {
|
||||||
let msg = report::Parse {
|
let msg = report::Parse {
|
||||||
|
@ -182,11 +189,18 @@ impl ParseWords {
|
||||||
|
|
||||||
let buffer = std::fs::read(path)
|
let buffer = std::fs::read(path)
|
||||||
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
||||||
if !explicit && !self.binary && is_binary(&buffer) {
|
if !explicit && !self.binary {
|
||||||
|
let content_type = content_inspector::inspect(&buffer);
|
||||||
|
// HACK: We only support UTF-8 at the moment
|
||||||
|
if content_type.is_binary()
|
||||||
|
|| (content_type != content_inspector::ContentType::UTF_8_BOM
|
||||||
|
&& content_type != content_inspector::ContentType::UTF_8)
|
||||||
|
{
|
||||||
let msg = report::BinaryFile { path };
|
let msg = report::BinaryFile { path };
|
||||||
reporter.report(msg.into());
|
reporter.report(msg.into());
|
||||||
return Ok(typos_found);
|
return Ok(typos_found);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for line in buffer.lines() {
|
for line in buffer.lines() {
|
||||||
let msg = report::Parse {
|
let msg = report::Parse {
|
||||||
|
@ -274,11 +288,19 @@ impl Checks {
|
||||||
|
|
||||||
let buffer = std::fs::read(path)
|
let buffer = std::fs::read(path)
|
||||||
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
|
||||||
if !explicit && !self.binary && is_binary(&buffer) {
|
if !explicit && !self.binary {
|
||||||
|
let content_type = content_inspector::inspect(&buffer);
|
||||||
|
// HACK: We only support UTF-8 at the moment
|
||||||
|
if content_type.is_binary()
|
||||||
|
|| (content_type != content_inspector::ContentType::UTF_8_BOM
|
||||||
|
&& content_type != content_inspector::ContentType::UTF_8)
|
||||||
|
{
|
||||||
|
// HACK: we don't support alternative encodings atm
|
||||||
let msg = report::BinaryFile { path };
|
let msg = report::BinaryFile { path };
|
||||||
reporter.report(msg.into());
|
reporter.report(msg.into());
|
||||||
return Ok(typos_found);
|
return Ok(typos_found);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (line_idx, line) in buffer.lines().enumerate() {
|
for (line_idx, line) in buffer.lines().enumerate() {
|
||||||
let line_num = line_idx + 1;
|
let line_num = line_idx + 1;
|
||||||
|
@ -318,8 +340,3 @@ impl Checks {
|
||||||
Ok(typos_found)
|
Ok(typos_found)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_binary(buffer: &[u8]) -> bool {
|
|
||||||
let null_max = std::cmp::min(buffer.len(), 1024);
|
|
||||||
buffer[0..null_max].find_byte(b'\0').is_some()
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in a new issue