feat: Check and replace UTF-16 files

We don't have good detection for non-UTF encodings, and we don't have
encoding support for UTF-32, so this is limited to just UTF-16.

Fixes #17
This commit is contained in:
Ed Page 2020-11-03 19:52:39 -06:00
parent 1c392c2606
commit 998fad4390
4 changed files with 138 additions and 26 deletions

65
Cargo.lock generated
View file

@ -351,6 +351,70 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encoding"
version = "0.2.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
dependencies = [
"encoding-index-japanese",
"encoding-index-korean",
"encoding-index-simpchinese",
"encoding-index-singlebyte",
"encoding-index-tradchinese",
]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding_index_tests"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
[[package]] [[package]]
name = "enumflags2" name = "enumflags2"
version = "0.6.4" version = "0.6.4"
@ -1177,6 +1241,7 @@ dependencies = [
"derive_more 0.99.11", "derive_more 0.99.11",
"derive_setters", "derive_setters",
"difflib", "difflib",
"encoding",
"env_logger 0.8.2", "env_logger 0.8.2",
"human-panic", "human-panic",
"ignore", "ignore",

View file

@ -56,6 +56,7 @@ derive_more = "0.99.11"
derive_setters = "0.1" derive_setters = "0.1"
itertools = "0.9" itertools = "0.9"
serde_json = "1.0" serde_json = "1.0"
encoding = "0.2"
[dev-dependencies] [dev-dependencies]
assert_fs = "1.0" assert_fs = "1.0"

View file

@ -44,7 +44,7 @@ See also [benchmarks](../benchsuite/runs).
| snake_case | Yes | No | ? | No | Yes | | snake_case | Yes | No | ? | No | Yes |
| Ignore Hex | Yes | No | ? | No | Yes | | Ignore Hex | Yes | No | ? | No | Yes |
| C-Escapes | No ([#20][def-3]) | No | ? | No | Yes | | C-Escapes | No ([#20][def-3]) | No | ? | No | Yes |
| Encodings | UTF-8 ([#17][def-17]) | UTF-8 | ? | Auto | Auto | | Encodings | UTF-8 / UTF-16 | UTF-8 | ? | Auto | Auto |
| Whole-project | Yes | Yes | Yes | Yes | No | | Whole-project | Yes | Yes | Yes | Yes | No |
| Ignores hidden | Yes | Yes | ? | Yes | No | | Ignores hidden | Yes | Yes | ? | Yes | No |
| Respect gitignore | Yes | Yes | ? | No | No | | Respect gitignore | Yes | Yes | ? | No | No |

View file

@ -1,4 +1,5 @@
use bstr::ByteSlice; use bstr::ByteSlice;
use encoding::Encoding;
use crate::report; use crate::report;
use typos::tokens; use typos::tokens;
@ -208,7 +209,7 @@ impl FileChecker for FixTypos {
} }
if !fixes.is_empty() { if !fixes.is_empty() {
let buffer = fix_buffer(buffer, fixes.into_iter()); let buffer = fix_buffer(buffer, fixes.into_iter());
write_file(path, content_type, &buffer, reporter)?; write_file(path, content_type, buffer, reporter)?;
} }
} }
} }
@ -503,22 +504,30 @@ pub fn read_file(
path: &std::path::Path, path: &std::path::Path,
reporter: &dyn report::Report, reporter: &dyn report::Report,
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> { ) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
let buffer = match std::fs::read(path) { let buffer = report_error(std::fs::read(path), reporter)?;
Ok(buffer) => buffer,
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
Vec::new()
}
};
let mut content_type = content_inspector::inspect(&buffer); let content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type != content_inspector::ContentType::UTF_8_BOM let (buffer, content_type) = match content_type {
&& content_type != content_inspector::ContentType::UTF_8 content_inspector::ContentType::BINARY |
{ // HACK: We don't support UTF-32 yet
content_type = content_inspector::ContentType::BINARY; content_inspector::ContentType::UTF_32LE |
} content_inspector::ContentType::UTF_32BE => {
(buffer, content_inspector::ContentType::BINARY)
},
content_inspector::ContentType::UTF_8 |
content_inspector::ContentType::UTF_8_BOM => {
(buffer, content_type)
},
content_inspector::ContentType::UTF_16LE => {
let buffer = report_error(encoding::all::UTF_16LE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
(buffer.into_bytes(), content_type)
}
content_inspector::ContentType::UTF_16BE => {
let buffer = report_error(encoding::all::UTF_16BE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
(buffer.into_bytes(), content_type)
},
};
Ok((buffer, content_type)) Ok((buffer, content_type))
} }
@ -526,22 +535,59 @@ pub fn read_file(
pub fn write_file( pub fn write_file(
path: &std::path::Path, path: &std::path::Path,
content_type: content_inspector::ContentType, content_type: content_inspector::ContentType,
buffer: &[u8], buffer: Vec<u8>,
reporter: &dyn report::Report, reporter: &dyn report::Report,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
assert!( let buffer = match content_type {
content_type == content_inspector::ContentType::UTF_8_BOM // HACK: We don't support UTF-32 yet
|| content_type == content_inspector::ContentType::UTF_8 content_inspector::ContentType::UTF_32LE | content_inspector::ContentType::UTF_32BE => {
|| content_type == content_inspector::ContentType::BINARY unreachable!("read_file should prevent these from being passed along");
); }
match std::fs::write(path, buffer) { content_inspector::ContentType::BINARY
Ok(()) => (), | content_inspector::ContentType::UTF_8
| content_inspector::ContentType::UTF_8_BOM => buffer,
content_inspector::ContentType::UTF_16LE => {
let buffer = report_error(String::from_utf8(buffer), reporter)?;
if buffer.is_empty() {
// Error occurred, don't clear out the file
return Ok(());
}
report_error(
encoding::all::UTF_16LE.encode(&buffer, encoding::EncoderTrap::Strict),
reporter,
)?
}
content_inspector::ContentType::UTF_16BE => {
let buffer = report_error(String::from_utf8(buffer), reporter)?;
if buffer.is_empty() {
// Error occurred, don't clear out the file
return Ok(());
}
report_error(
encoding::all::UTF_16BE.encode(&buffer, encoding::EncoderTrap::Strict),
reporter,
)?
}
};
report_error(std::fs::write(path, buffer), reporter)?;
Ok(())
}
fn report_error<T: Default, E: ToString>(
value: Result<T, E>,
reporter: &dyn report::Report,
) -> Result<T, std::io::Error> {
let buffer = match value {
Ok(value) => value,
Err(err) => { Err(err) => {
let msg = report::Error::new(err.to_string()); let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?; reporter.report(msg.into())?;
Default::default()
} }
}; };
Ok(()) Ok(buffer)
} }
struct AccumulateLineNum { struct AccumulateLineNum {