From 998fad4390ce95dc34d0fd1200ac758442396ebf Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 3 Nov 2020 19:52:39 -0600 Subject: [PATCH] feat: Check and replace UTF-16 files We don't have good detection for non-UTF encodings and don't have encoding support for UTF-32, so limiting it to just UTF-16. Fixes #17 --- Cargo.lock | 65 ++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + docs/about.md | 2 +- src/checks.rs | 96 +++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 138 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e624421..fd89f8d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -351,6 +351,70 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "enumflags2" version = "0.6.4" @@ -1177,6 +1241,7 @@ dependencies = [ "derive_more 0.99.11", "derive_setters", "difflib", + "encoding", "env_logger 0.8.2", "human-panic", "ignore", diff --git a/Cargo.toml b/Cargo.toml index 32a6f22..081aa7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,7 @@ derive_more = "0.99.11" derive_setters = "0.1" itertools = "0.9" serde_json = "1.0" +encoding = "0.2" [dev-dependencies] assert_fs = "1.0" diff --git a/docs/about.md b/docs/about.md index 6cda81b..4bd9d77 100644 --- a/docs/about.md +++ b/docs/about.md @@ -44,7 +44,7 @@ See also [benchmarks](../benchsuite/runs). | snake_case | Yes | No | ? | No | Yes | | Ignore Hex | Yes | No | ? | No | Yes | | C-Escapes | No ([#20][def-3]) | No | ? | No | Yes | -| Encodings | UTF-8 ([#17][def-17]) | UTF-8 | ? | Auto | Auto | +| Encodings | UTF-8 / UTF-16 | UTF-8 | ? | Auto | Auto | | Whole-project | Yes | Yes | Yes | Yes | No | | Ignores hidden | Yes | Yes | ? | Yes | No | | Respect gitignore | Yes | Yes | ? | No | No | diff --git a/src/checks.rs b/src/checks.rs index 3bfb25b..7c7b04a 100644 --- a/src/checks.rs +++ b/src/checks.rs @@ -1,4 +1,5 @@ use bstr::ByteSlice; +use encoding::Encoding; use crate::report; use typos::tokens; @@ -208,7 +209,7 @@ impl FileChecker for FixTypos { } if !fixes.is_empty() { let buffer = fix_buffer(buffer, fixes.into_iter()); - write_file(path, content_type, &buffer, reporter)?; + write_file(path, content_type, buffer, reporter)?; } } } @@ -503,22 +504,30 @@ pub fn read_file( path: &std::path::Path, reporter: &dyn report::Report, ) -> Result<(Vec, content_inspector::ContentType), std::io::Error> { - let buffer = match std::fs::read(path) { - Ok(buffer) => buffer, - Err(err) => { - let msg = report::Error::new(err.to_string()); - reporter.report(msg.into())?; - Vec::new() - } - }; + let buffer = report_error(std::fs::read(path), reporter)?; - let mut content_type = content_inspector::inspect(&buffer); - // HACK: We only support UTF-8 at the moment - if content_type != content_inspector::ContentType::UTF_8_BOM - && content_type != content_inspector::ContentType::UTF_8 - { - content_type = content_inspector::ContentType::BINARY; - } + let content_type = content_inspector::inspect(&buffer); + + let (buffer, content_type) = match content_type { + content_inspector::ContentType::BINARY | + // HACK: We don't support UTF-32 yet + content_inspector::ContentType::UTF_32LE | + content_inspector::ContentType::UTF_32BE => { + (buffer, content_inspector::ContentType::BINARY) + }, + content_inspector::ContentType::UTF_8 | + content_inspector::ContentType::UTF_8_BOM => { + (buffer, content_type) + }, + content_inspector::ContentType::UTF_16LE => { + let buffer = report_error(encoding::all::UTF_16LE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?; + (buffer.into_bytes(), content_type) + } + content_inspector::ContentType::UTF_16BE => { + let buffer = report_error(encoding::all::UTF_16BE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?; + (buffer.into_bytes(), content_type) + }, + }; Ok((buffer, content_type)) } @@ -526,22 +535,59 @@ pub fn read_file( pub fn write_file( path: &std::path::Path, content_type: content_inspector::ContentType, - buffer: &[u8], + buffer: Vec, reporter: &dyn report::Report, ) -> Result<(), std::io::Error> { - assert!( - content_type == content_inspector::ContentType::UTF_8_BOM - || content_type == content_inspector::ContentType::UTF_8 - || content_type == content_inspector::ContentType::BINARY - ); - match std::fs::write(path, buffer) { - Ok(()) => (), + let buffer = match content_type { + // HACK: We don't support UTF-32 yet + content_inspector::ContentType::UTF_32LE | content_inspector::ContentType::UTF_32BE => { + unreachable!("read_file should prevent these from being passed along"); + } + content_inspector::ContentType::BINARY + | content_inspector::ContentType::UTF_8 + | content_inspector::ContentType::UTF_8_BOM => buffer, + content_inspector::ContentType::UTF_16LE => { + let buffer = report_error(String::from_utf8(buffer), reporter)?; + if buffer.is_empty() { + // Error occurred, don't clear out the file + return Ok(()); + } + report_error( + encoding::all::UTF_16LE.encode(&buffer, encoding::EncoderTrap::Strict), + reporter, + )? + } + content_inspector::ContentType::UTF_16BE => { + let buffer = report_error(String::from_utf8(buffer), reporter)?; + if buffer.is_empty() { + // Error occurred, don't clear out the file + return Ok(()); + } + report_error( + encoding::all::UTF_16BE.encode(&buffer, encoding::EncoderTrap::Strict), + reporter, + )? + } + }; + + report_error(std::fs::write(path, buffer), reporter)?; + + Ok(()) +} + +fn report_error( + value: Result, + reporter: &dyn report::Report, +) -> Result { + let buffer = match value { + Ok(value) => value, Err(err) => { let msg = report::Error::new(err.to_string()); reporter.report(msg.into())?; + Default::default() } }; - Ok(()) + Ok(buffer) } struct AccumulateLineNum {