Merge pull request #703 from epage/enc

refactor: Switch out the UTF-16 encoding impl
This commit is contained in:
Ed Page 2023-04-03 09:50:22 -05:00 committed by GitHub
commit e10e0d20da
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 32 additions and 72 deletions

65
Cargo.lock generated
View file

@@ -578,69 +578,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
 [[package]]
-name = "encoding"
-version = "0.2.33"
+name = "encoding_rs"
+version = "0.8.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
+checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
 dependencies = [
-"encoding-index-japanese",
-"encoding-index-korean",
-"encoding-index-simpchinese",
-"encoding-index-singlebyte",
-"encoding-index-tradchinese",
+"cfg-if",
 ]
-[[package]]
-name = "encoding-index-japanese"
-version = "1.20141219.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
-dependencies = [
-"encoding_index_tests",
-]
-[[package]]
-name = "encoding-index-korean"
-version = "1.20141219.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
-dependencies = [
-"encoding_index_tests",
-]
-[[package]]
-name = "encoding-index-simpchinese"
-version = "1.20141219.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
-dependencies = [
-"encoding_index_tests",
-]
-[[package]]
-name = "encoding-index-singlebyte"
-version = "1.20141219.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
-dependencies = [
-"encoding_index_tests",
-]
-[[package]]
-name = "encoding-index-tradchinese"
-version = "1.20141219.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
-dependencies = [
-"encoding_index_tests",
-]
-[[package]]
-name = "encoding_index_tests"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
 [[package]]
 name = "enumflags2"
 version = "0.7.5"
@@ -1675,7 +1620,7 @@ dependencies = [
 "derive_more",
 "derive_setters",
 "difflib",
-"encoding",
+"encoding_rs",
 "env_logger",
 "globset",
 "human-panic",

View file

@@ -70,7 +70,6 @@ derive_more = "0.99.17"
 derive_setters = "0.1"
 itertools = "0.10"
 serde_json = "1.0"
-encoding = "0.2"
 kstring = { version = "2.0.0", features = ["serde"] }
 typed-arena = "2.0.2"
 maplit = "1.0"
@@ -82,6 +81,7 @@ anstyle = "0.3.5"
 anstream = "0.2.6"
 serde_regex = "1.1.0"
 regex = "1.7.3"
+encoding_rs = "0.8.32"
 [dev-dependencies]
 assert_fs = "1.0"

View file

@@ -1,5 +1,4 @@
 use bstr::ByteSlice;
-use encoding::Encoding;
 use std::io::Read;
 use std::io::Write;
@@ -473,11 +472,23 @@ fn read_file(
 (buffer, content_type)
 },
 content_inspector::ContentType::UTF_16LE => {
-let buffer = report_result(encoding::all::UTF_16LE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
+let mut decoded = String::new();
+let (r, _) = encoding_rs::UTF_16LE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
+let decoded = match r {
+encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
+_ => Err("invalid UTF-16LE encoding"),
+};
+let buffer = report_result(decoded, reporter)?;
 (buffer.into_bytes(), content_type)
 }
 content_inspector::ContentType::UTF_16BE => {
-let buffer = report_result(encoding::all::UTF_16BE.decode(&buffer, encoding::DecoderTrap::Strict), reporter)?;
+let mut decoded = String::new();
+let (r, _) = encoding_rs::UTF_16BE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
+let decoded = match r {
+encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
+_ => Err("invalid UTF-16BE encoding"),
+};
+let buffer = report_result(decoded, reporter)?;
 (buffer.into_bytes(), content_type)
 },
 };
@@ -505,10 +516,12 @@ fn write_file(
 // Error occurred, don't clear out the file
 return Ok(());
 }
-report_result(
-encoding::all::UTF_16LE.encode(&buffer, encoding::EncoderTrap::Strict),
-reporter,
-)?
+let (encoded, _, replaced) = encoding_rs::UTF_16LE.encode(&buffer);
+assert!(
+!replaced,
+"Coming from UTF-8, UTF-16LE shouldn't do replacements"
+);
+encoded.into_owned()
 }
 content_inspector::ContentType::UTF_16BE => {
 let buffer = report_result(String::from_utf8(buffer), reporter)?;
@@ -516,10 +529,12 @@ fn write_file(
 // Error occurred, don't clear out the file
 return Ok(());
 }
-report_result(
-encoding::all::UTF_16BE.encode(&buffer, encoding::EncoderTrap::Strict),
-reporter,
-)?
+let (encoded, _, replaced) = encoding_rs::UTF_16BE.encode(&buffer);
+assert!(
+!replaced,
+"Coming from UTF-8, UTF-16BE shouldn't do replacements"
+);
+encoded.into_owned()
 }
 };