Merge pull request #706 from epage/utf16

fix(cli): Actually decode UTF-16
This commit is contained in:
Ed Page 2023-04-04 09:20:13 -05:00 committed by GitHub
commit 37d90c230e
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 4AEE18F83AFDEB23

View file

@ -472,21 +472,27 @@ fn read_file(
(buffer, content_type)
},
content_inspector::ContentType::UTF_16LE => {
let mut decoded = String::new();
let (r, _) = encoding_rs::UTF_16LE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
// Despite accepting a `String`, `decode_to_string_without_replacement` doesn't allocate
// so to avoid `OutputFull` loops, we're going to assume any UTF-16 content can fit in
// a buffer twice its size
let mut decoded = String::with_capacity(buffer.len() * 2);
let (r, written) = encoding_rs::UTF_16LE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
let decoded = match r {
encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
_ => Err("invalid UTF-16LE encoding"),
_ => Err(format!("invalid UTF-16LE encoding at byte {} in {}", written, path.display())),
};
let buffer = report_result(decoded, reporter)?;
(buffer.into_bytes(), content_type)
}
content_inspector::ContentType::UTF_16BE => {
let mut decoded = String::new();
let (r, _) = encoding_rs::UTF_16BE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
// Despite accepting a `String`, `decode_to_string_without_replacement` doesn't allocate
// so to avoid `OutputFull` loops, we're going to assume any UTF-16 content can fit in
// a buffer twice its size
let mut decoded = String::with_capacity(buffer.len() * 2);
let (r, written) = encoding_rs::UTF_16BE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
let decoded = match r {
encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
_ => Err("invalid UTF-16BE encoding"),
_ => Err(format!("invalid UTF-16BE encoding at byte {} in {}", written, path.display())),
};
let buffer = report_result(decoded, reporter)?;
(buffer.into_bytes(), content_type)