refactor: Switch out the UTF-16 encoding impl

Fixes #702
This commit is contained in:
Ed Page 2023-03-29 20:00:39 -05:00
parent e1a138b637
commit 98be58dbc9
3 changed files with 32 additions and 72 deletions

65
Cargo.lock generated
View file

@ -578,69 +578,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
[[package]] [[package]]
name = "encoding" name = "encoding_rs"
version = "0.2.33" version = "0.8.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
dependencies = [ dependencies = [
"encoding-index-japanese", "cfg-if",
"encoding-index-korean",
"encoding-index-simpchinese",
"encoding-index-singlebyte",
"encoding-index-tradchinese",
] ]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding_index_tests"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
[[package]] [[package]]
name = "enumflags2" name = "enumflags2"
version = "0.7.5" version = "0.7.5"
@ -1675,7 +1620,7 @@ dependencies = [
"derive_more", "derive_more",
"derive_setters", "derive_setters",
"difflib", "difflib",
"encoding", "encoding_rs",
"env_logger", "env_logger",
"globset", "globset",
"human-panic", "human-panic",

View file

@ -70,7 +70,6 @@ derive_more = "0.99.17"
derive_setters = "0.1" derive_setters = "0.1"
itertools = "0.10" itertools = "0.10"
serde_json = "1.0" serde_json = "1.0"
encoding = "0.2"
kstring = { version = "2.0.0", features = ["serde"] } kstring = { version = "2.0.0", features = ["serde"] }
typed-arena = "2.0.2" typed-arena = "2.0.2"
maplit = "1.0" maplit = "1.0"
@ -82,6 +81,7 @@ anstyle = "0.3.5"
anstream = "0.2.6" anstream = "0.2.6"
serde_regex = "1.1.0" serde_regex = "1.1.0"
regex = "1.7.3" regex = "1.7.3"
encoding_rs = "0.8.32"
[dev-dependencies] [dev-dependencies]
assert_fs = "1.0" assert_fs = "1.0"

View file

@ -1,5 +1,4 @@
use bstr::ByteSlice; use bstr::ByteSlice;
use encoding::Encoding;
use std::io::Read; use std::io::Read;
use std::io::Write; use std::io::Write;
@ -473,11 +472,23 @@ fn read_file(
(buffer, content_type) (buffer, content_type)
}, },
content_inspector::ContentType::UTF_16LE => { content_inspector::ContentType::UTF_16LE => {
let buffer = report_result(encoding::all::UTF_16LE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?; let mut decoded = String::new();
let (r, _) = encoding_rs::UTF_16LE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
let decoded = match r {
encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
_ => Err("invalid UTF-16LE encoding"),
};
let buffer = report_result(decoded, reporter)?;
(buffer.into_bytes(), content_type) (buffer.into_bytes(), content_type)
} }
content_inspector::ContentType::UTF_16BE => { content_inspector::ContentType::UTF_16BE => {
let buffer = report_result(encoding::all::UTF_16BE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?; let mut decoded = String::new();
let (r, _) = encoding_rs::UTF_16BE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
let decoded = match r {
encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
_ => Err("invalid UTF-16BE encoding"),
};
let buffer = report_result(decoded, reporter)?;
(buffer.into_bytes(), content_type) (buffer.into_bytes(), content_type)
}, },
}; };
@ -505,10 +516,12 @@ fn write_file(
// Error occurred, don't clear out the file // Error occurred, don't clear out the file
return Ok(()); return Ok(());
} }
report_result( let (encoded, _, replaced) = encoding_rs::UTF_16LE.encode(&buffer);
encoding::all::UTF_16LE.encode(&buffer, encoding::EncoderTrap::Strict), assert!(
reporter, !replaced,
)? "Coming from UTF-8, UTF-16LE shouldn't do replacements"
);
encoded.into_owned()
} }
content_inspector::ContentType::UTF_16BE => { content_inspector::ContentType::UTF_16BE => {
let buffer = report_result(String::from_utf8(buffer), reporter)?; let buffer = report_result(String::from_utf8(buffer), reporter)?;
@ -516,10 +529,12 @@ fn write_file(
// Error occurred, don't clear out the file // Error occurred, don't clear out the file
return Ok(()); return Ok(());
} }
report_result( let (encoded, _, replaced) = encoding_rs::UTF_16BE.encode(&buffer);
encoding::all::UTF_16BE.encode(&buffer, encoding::EncoderTrap::Strict), assert!(
reporter, !replaced,
)? "Coming from UTF-8, UTF-16BE shouldn't do replacements"
);
encoded.into_owned()
} }
}; };