Merge pull request #706 from epage/utf16

fix(cli): Actually decode UTF-16
This commit is contained in:
Ed Page 2023-04-04 09:20:13 -05:00 committed by GitHub
commit 37d90c230e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -472,21 +472,27 @@ fn read_file(
(buffer, content_type) (buffer, content_type)
}, },
content_inspector::ContentType::UTF_16LE => { content_inspector::ContentType::UTF_16LE => {
let mut decoded = String::new(); // Despite accepting a `String`, `decode_to_string_without_replacement` doesn't allocate
let (r, _) = encoding_rs::UTF_16LE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true); // so to avoid `OutputFull` loops, we're going to assume any UTF-16 content can fit in
// a buffer twice its size
let mut decoded = String::with_capacity(buffer.len() * 2);
let (r, written) = encoding_rs::UTF_16LE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
let decoded = match r { let decoded = match r {
encoding_rs::DecoderResult::InputEmpty => Ok(decoded), encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
_ => Err("invalid UTF-16LE encoding"), _ => Err(format!("invalid UTF-16LE encoding at byte {} in {}", written, path.display())),
}; };
let buffer = report_result(decoded, reporter)?; let buffer = report_result(decoded, reporter)?;
(buffer.into_bytes(), content_type) (buffer.into_bytes(), content_type)
} }
content_inspector::ContentType::UTF_16BE => { content_inspector::ContentType::UTF_16BE => {
let mut decoded = String::new(); // Despite accepting a `String`, `decode_to_string_without_replacement` doesn't allocate
let (r, _) = encoding_rs::UTF_16BE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true); // so to avoid `OutputFull` loops, we're going to assume any UTF-16 content can fit in
// a buffer twice its size
let mut decoded = String::with_capacity(buffer.len() * 2);
let (r, written) = encoding_rs::UTF_16BE.new_decoder_with_bom_removal().decode_to_string_without_replacement(&buffer, &mut decoded, true);
let decoded = match r { let decoded = match r {
encoding_rs::DecoderResult::InputEmpty => Ok(decoded), encoding_rs::DecoderResult::InputEmpty => Ok(decoded),
_ => Err("invalid UTF-16BE encoding"), _ => Err(format!("invalid UTF-16BE encoding at byte {} in {}", written, path.display())),
}; };
let buffer = report_result(decoded, reporter)?; let buffer = report_result(decoded, reporter)?;
(buffer.into_bytes(), content_type) (buffer.into_bytes(), content_type)