mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-22 00:51:11 -05:00
feat: Check and replace UTF-16 files
We don't have reliable detection for non-UTF encodings, and we don't have encoding support for UTF-32, so this change limits the new support to UTF-16. Fixes #17
This commit is contained in:
parent
1c392c2606
commit
998fad4390
4 changed files with 138 additions and 26 deletions
65
Cargo.lock
generated
65
Cargo.lock
generated
|
@ -351,6 +351,70 @@ version = "1.6.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding"
|
||||||
|
version = "0.2.33"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
|
||||||
|
dependencies = [
|
||||||
|
"encoding-index-japanese",
|
||||||
|
"encoding-index-korean",
|
||||||
|
"encoding-index-simpchinese",
|
||||||
|
"encoding-index-singlebyte",
|
||||||
|
"encoding-index-tradchinese",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding-index-japanese"
|
||||||
|
version = "1.20141219.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_index_tests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding-index-korean"
|
||||||
|
version = "1.20141219.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_index_tests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding-index-simpchinese"
|
||||||
|
version = "1.20141219.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_index_tests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding-index-singlebyte"
|
||||||
|
version = "1.20141219.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_index_tests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding-index-tradchinese"
|
||||||
|
version = "1.20141219.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_index_tests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding_index_tests"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "enumflags2"
|
name = "enumflags2"
|
||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
|
@ -1177,6 +1241,7 @@ dependencies = [
|
||||||
"derive_more 0.99.11",
|
"derive_more 0.99.11",
|
||||||
"derive_setters",
|
"derive_setters",
|
||||||
"difflib",
|
"difflib",
|
||||||
|
"encoding",
|
||||||
"env_logger 0.8.2",
|
"env_logger 0.8.2",
|
||||||
"human-panic",
|
"human-panic",
|
||||||
"ignore",
|
"ignore",
|
||||||
|
|
|
@ -56,6 +56,7 @@ derive_more = "0.99.11"
|
||||||
derive_setters = "0.1"
|
derive_setters = "0.1"
|
||||||
itertools = "0.9"
|
itertools = "0.9"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
|
encoding = "0.2"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
assert_fs = "1.0"
|
assert_fs = "1.0"
|
||||||
|
|
|
@ -44,7 +44,7 @@ See also [benchmarks](../benchsuite/runs).
|
||||||
| snake_case | Yes | No | ? | No | Yes |
|
| snake_case | Yes | No | ? | No | Yes |
|
||||||
| Ignore Hex | Yes | No | ? | No | Yes |
|
| Ignore Hex | Yes | No | ? | No | Yes |
|
||||||
| C-Escapes | No ([#20][def-3]) | No | ? | No | Yes |
|
| C-Escapes | No ([#20][def-3]) | No | ? | No | Yes |
|
||||||
| Encodings | UTF-8 ([#17][def-17]) | UTF-8 | ? | Auto | Auto |
|
| Encodings | UTF-8 / UTF-16 | UTF-8 | ? | Auto | Auto |
|
||||||
| Whole-project | Yes | Yes | Yes | Yes | No |
|
| Whole-project | Yes | Yes | Yes | Yes | No |
|
||||||
| Ignores hidden | Yes | Yes | ? | Yes | No |
|
| Ignores hidden | Yes | Yes | ? | Yes | No |
|
||||||
| Respect gitignore | Yes | Yes | ? | No | No |
|
| Respect gitignore | Yes | Yes | ? | No | No |
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
use bstr::ByteSlice;
|
use bstr::ByteSlice;
|
||||||
|
use encoding::Encoding;
|
||||||
|
|
||||||
use crate::report;
|
use crate::report;
|
||||||
use typos::tokens;
|
use typos::tokens;
|
||||||
|
@ -208,7 +209,7 @@ impl FileChecker for FixTypos {
|
||||||
}
|
}
|
||||||
if !fixes.is_empty() {
|
if !fixes.is_empty() {
|
||||||
let buffer = fix_buffer(buffer, fixes.into_iter());
|
let buffer = fix_buffer(buffer, fixes.into_iter());
|
||||||
write_file(path, content_type, &buffer, reporter)?;
|
write_file(path, content_type, buffer, reporter)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -503,22 +504,30 @@ pub fn read_file(
|
||||||
path: &std::path::Path,
|
path: &std::path::Path,
|
||||||
reporter: &dyn report::Report,
|
reporter: &dyn report::Report,
|
||||||
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
|
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
|
||||||
let buffer = match std::fs::read(path) {
|
let buffer = report_error(std::fs::read(path), reporter)?;
|
||||||
Ok(buffer) => buffer,
|
|
||||||
Err(err) => {
|
|
||||||
let msg = report::Error::new(err.to_string());
|
|
||||||
reporter.report(msg.into())?;
|
|
||||||
Vec::new()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut content_type = content_inspector::inspect(&buffer);
|
let content_type = content_inspector::inspect(&buffer);
|
||||||
// HACK: We only support UTF-8 at the moment
|
|
||||||
if content_type != content_inspector::ContentType::UTF_8_BOM
|
let (buffer, content_type) = match content_type {
|
||||||
&& content_type != content_inspector::ContentType::UTF_8
|
content_inspector::ContentType::BINARY |
|
||||||
{
|
// HACK: We don't support UTF-32 yet
|
||||||
content_type = content_inspector::ContentType::BINARY;
|
content_inspector::ContentType::UTF_32LE |
|
||||||
|
content_inspector::ContentType::UTF_32BE => {
|
||||||
|
(buffer, content_inspector::ContentType::BINARY)
|
||||||
|
},
|
||||||
|
content_inspector::ContentType::UTF_8 |
|
||||||
|
content_inspector::ContentType::UTF_8_BOM => {
|
||||||
|
(buffer, content_type)
|
||||||
|
},
|
||||||
|
content_inspector::ContentType::UTF_16LE => {
|
||||||
|
let buffer = report_error(encoding::all::UTF_16LE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
|
||||||
|
(buffer.into_bytes(), content_type)
|
||||||
}
|
}
|
||||||
|
content_inspector::ContentType::UTF_16BE => {
|
||||||
|
let buffer = report_error(encoding::all::UTF_16BE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
|
||||||
|
(buffer.into_bytes(), content_type)
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
Ok((buffer, content_type))
|
Ok((buffer, content_type))
|
||||||
}
|
}
|
||||||
|
@ -526,22 +535,59 @@ pub fn read_file(
|
||||||
pub fn write_file(
|
pub fn write_file(
|
||||||
path: &std::path::Path,
|
path: &std::path::Path,
|
||||||
content_type: content_inspector::ContentType,
|
content_type: content_inspector::ContentType,
|
||||||
buffer: &[u8],
|
buffer: Vec<u8>,
|
||||||
reporter: &dyn report::Report,
|
reporter: &dyn report::Report,
|
||||||
) -> Result<(), std::io::Error> {
|
) -> Result<(), std::io::Error> {
|
||||||
assert!(
|
let buffer = match content_type {
|
||||||
content_type == content_inspector::ContentType::UTF_8_BOM
|
// HACK: We don't support UTF-32 yet
|
||||||
|| content_type == content_inspector::ContentType::UTF_8
|
content_inspector::ContentType::UTF_32LE | content_inspector::ContentType::UTF_32BE => {
|
||||||
|| content_type == content_inspector::ContentType::BINARY
|
unreachable!("read_file should prevent these from being passed along");
|
||||||
);
|
}
|
||||||
match std::fs::write(path, buffer) {
|
content_inspector::ContentType::BINARY
|
||||||
Ok(()) => (),
|
| content_inspector::ContentType::UTF_8
|
||||||
|
| content_inspector::ContentType::UTF_8_BOM => buffer,
|
||||||
|
content_inspector::ContentType::UTF_16LE => {
|
||||||
|
let buffer = report_error(String::from_utf8(buffer), reporter)?;
|
||||||
|
if buffer.is_empty() {
|
||||||
|
// Error occurred, don't clear out the file
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
report_error(
|
||||||
|
encoding::all::UTF_16LE.encode(&buffer, encoding::EncoderTrap::Strict),
|
||||||
|
reporter,
|
||||||
|
)?
|
||||||
|
}
|
||||||
|
content_inspector::ContentType::UTF_16BE => {
|
||||||
|
let buffer = report_error(String::from_utf8(buffer), reporter)?;
|
||||||
|
if buffer.is_empty() {
|
||||||
|
// Error occurred, don't clear out the file
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
report_error(
|
||||||
|
encoding::all::UTF_16BE.encode(&buffer, encoding::EncoderTrap::Strict),
|
||||||
|
reporter,
|
||||||
|
)?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
report_error(std::fs::write(path, buffer), reporter)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn report_error<T: Default, E: ToString>(
|
||||||
|
value: Result<T, E>,
|
||||||
|
reporter: &dyn report::Report,
|
||||||
|
) -> Result<T, std::io::Error> {
|
||||||
|
let buffer = match value {
|
||||||
|
Ok(value) => value,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
let msg = report::Error::new(err.to_string());
|
let msg = report::Error::new(err.to_string());
|
||||||
reporter.report(msg.into())?;
|
reporter.report(msg.into())?;
|
||||||
|
Default::default()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(())
|
Ok(buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
struct AccumulateLineNum {
|
struct AccumulateLineNum {
|
||||||
|
|
Loading…
Reference in a new issue