WIP mutf8

This commit is contained in:
Jean-Marie Mineau 2023-08-23 16:36:49 +02:00
parent d44e2b624b
commit 24f4b0b46d
Signed by: histausse
GPG key ID: B66AEEDA9B645AD2
3 changed files with 459 additions and 2 deletions

View file

@ -6,13 +6,16 @@ use std::io::{Cursor, Read, Seek, SeekFrom, Write};
pub use androscalpel_serializer_derive::*; pub use androscalpel_serializer_derive::*;
pub mod leb; pub mod leb;
pub mod string;
pub use leb::*; pub use leb::*;
pub use string::*;
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub enum Error { pub enum Error {
InputTooSmall(String), // TODO: find a better name InputTooSmall(String), // TODO: find a better name
SerializationError(String), SerializationError(String),
DeserializationError(String), DeserializationError(String),
InvalidStringEncoding(String),
} }
pub type Result<T> = core::result::Result<T, Error>; pub type Result<T> = core::result::Result<T, Error>;
@ -26,6 +29,7 @@ impl std::fmt::Display for Error {
Self::InputTooSmall(msg) => write!(f, "Error: {}", msg), Self::InputTooSmall(msg) => write!(f, "Error: {}", msg),
Self::SerializationError(msg) => write!(f, "Error: {}", msg), Self::SerializationError(msg) => write!(f, "Error: {}", msg),
Self::DeserializationError(msg) => write!(f, "Error: {}", msg), Self::DeserializationError(msg) => write!(f, "Error: {}", msg),
Self::InvalidStringEncoding(msg) => write!(f, "Error: {}", msg),
} }
} }
} }

View file

@ -0,0 +1,453 @@
//! The string representation, encoded in MUTF-8
//! https://source.android.com/docs/core/runtime/dex-format#mutf-8
//!
//! Code points are encoded in MUTF-8 as follows (table from
//! <https://py2jdbc.readthedocs.io/en/latest/mutf8.html>):
//!
//! | Number of bytes | First code point | Last code point | Bits | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Byte 6 |
//! |-----------------|------------------|-----------------|------|----------|----------|----------|----------|----------|----------|
//! | 2 | U+0000 | U+0000 | - | 11000000 | 10000000 | | | | |
//! | 1 | U+0001 | U+007F | 7 | 0xxxxxxx | | | | | |
//! | 2 | U+0080 | U+07FF | 11 | 110xxxxx | 10xxxxxx | | | | |
//! | 3 | U+0800 | U+FFFF | 16 | 1110xxxx | 10xxxxxx | 10xxxxxx | | | |
//! | 6 | U+10000 | U+FFFFF | 20 | 11101101 | 1010xxxx | 10xxxxxx | 11101101 | 1011xxxx | 10xxxxxx |
use crate as androscalpel_serializer;
use crate::core::*;
pub use androscalpel_serializer_derive::*;
/// [string-data-item](https://source.android.com/docs/core/runtime/dex-format#string-data-item)
///
/// A string as stored in a DEX file: a declared length followed by the
/// MUTF-8 encoded bytes of the string.
#[derive(Serializable, PartialEq, Eq, Debug)]
pub struct StringDataItem {
    /// Declared size of the string. The linked format documentation defines it
    /// as a number of UTF-16 code units.
    pub utf16_size: Uleb128,
    /// MUTF-8 encoded bytes of the string.
    // NOTE(review): `until(u8, u8, 0x00u8)` presumably makes the derived
    // (de)serializer read/write bytes up to a `0x00` terminator that is not
    // kept in `data` — confirm against the derive macro.
    #[until(u8, u8, 0x00u8)]
    pub data: Vec<u8>,
}
/// Byte value that must never appear inside the encoded payload: the decoder
/// rejects it, and U+0000 is encoded as `0xC0 0x80` instead.
const TERMINATION_BYTE: u8 = 0;
/// First and fourth byte (`0xED`) of the 6 byte form used for code points
/// above U+FFFF.
const SURROGATE_BYTE: u8 = 0b1110_1101;
/// Selects the high nibble of the second and fifth byte of the 6 byte form.
const MASK_SURROGATED_BYTE_PREFIX: u8 = 0b1111_0000;
/// Expected high nibble (`1010`) of the second byte of the 6 byte form.
const VALUE_SURROGATED_BYTE_1_PREFIX: u8 = 0b1010_0000;
/// Expected high nibble (`1011`) of the fifth byte of the 6 byte form.
const VALUE_SURROGATED_BYTE_2_PREFIX: u8 = 0b1011_0000;
/// Selects the two high bits of a trailing (continuation) byte.
const MASK_TRAYLING_BYTE_PREFIX: u8 = 0b1100_0000;
/// Expected two high bits (`10`) of a trailing (continuation) byte.
const VALUE_TRAYLING_BYTE_PREFIX: u8 = 0b1000_0000;
impl TryFrom<&StringDataItem> for String {
type Error = Error;
fn try_from(item: &StringDataItem) -> Result<String> {
item.get_string()
}
}
impl TryFrom<StringDataItem> for String {
type Error = Error;
fn try_from(item: StringDataItem) -> Result<String> {
item.get_string()
}
}
/// Encode a Rust string slice as a MUTF-8 [`StringDataItem`].
///
/// Each `char` is encoded on its own:
/// - U+0000 becomes the two bytes `0xC0 0x80`, so the payload never contains
///   a raw null byte;
/// - U+0001..=U+FFFF use the regular 1, 2 or 3 byte forms;
/// - U+10000..=U+FFFFF use the 6 byte form listed in the module documentation.
///
/// No terminating `0x00` byte is appended to `data` here.
///
/// `utf16_size` is set to the number of `char`s of the input.
// NOTE(review): the DEX format documents `utf16_size` in UTF-16 code units, so
// a character above U+FFFF should presumably count for two. The unit tests of
// this file currently expect one — confirm before changing.
///
/// # Panics
/// Panics if the string contains a code point above U+FFFFF, which the 6 byte
/// form implemented here cannot represent.
impl From<&str> for StringDataItem {
    fn from(string: &str) -> Self {
        // The UTF-8 length is a lower bound of the MUTF-8 length.
        let mut data: Vec<u8> = Vec::with_capacity(string.len());
        let mut size = 0;
        for chr in string.chars() {
            let code_point = u32::from(chr);
            // Trailing byte `10xxxxxx` carrying the 6 bits starting at `shift`.
            let trailing = |shift: u32| {
                VALUE_TRAYLING_BYTE_PREFIX
                    | ((code_point >> shift) as u8 & !MASK_TRAYLING_BYTE_PREFIX)
            };
            // Byte made of `prefix` (high nibble) and the 4 bits starting at
            // `shift` (low nibble).
            let surrogated = |prefix: u8, shift: u32| {
                prefix | ((code_point >> shift) as u8 & !MASK_SURROGATED_BYTE_PREFIX)
            };
            match code_point {
                0 => data.extend_from_slice(&[0b1100_0000, 0b1000_0000]),
                0x01..=0x7F => data.push(code_point as u8),
                0x80..=0x7FF => data.extend_from_slice(&[
                    0b1100_0000 | ((code_point >> 6) as u8 & 0b0001_1111),
                    trailing(0),
                ]),
                0x800..=0xFFFF => data.extend_from_slice(&[
                    0b1110_0000 | ((code_point >> 12) as u8 & 0b0000_1111),
                    trailing(6),
                    trailing(0),
                ]),
                0x1_0000..=0xF_FFFF => data.extend_from_slice(&[
                    SURROGATE_BYTE,
                    surrogated(VALUE_SURROGATED_BYTE_1_PREFIX, 16),
                    trailing(10),
                    SURROGATE_BYTE,
                    surrogated(VALUE_SURROGATED_BYTE_2_PREFIX, 6),
                    trailing(0),
                ]),
                _ => panic!("Code Point {code_point} for char {chr} is to big"),
            }
            size += 1;
        }
        Self {
            utf16_size: Uleb128(size),
            data,
        }
    }
}
impl StringDataItem {
fn get_string(&self) -> Result<String> {
let mut string = String::new();
let mut i = 0;
while i < self.data.len() {
if self.data[i] == TERMINATION_BYTE {
return Err(Error::InvalidStringEncoding(
"String should not contains null bytes".into(),
));
}
if self.data[i] == SURROGATE_BYTE {
let res = self.get_surrogate(i);
match res {
Ok(chr) => {
string.push(chr);
i += 6;
continue;
}
Err(err) if i + 3 > self.data.len() => {
return Err(err);
}
_ => (),
}
// Else, it may be a 3 bytes character...
}
let mut leading_bits = 0;
for i in 0..8 {
if (self.data[0] & (1 << (7 - i))) != 0 {
leading_bits += 1;
} else {
break;
}
}
if leading_bits == 1 || leading_bits > 3 {
let byte = self.data[i];
return Err(Error::InvalidStringEncoding(format!(
"data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
)));
}
string.push(self.get_non_surogated(i, leading_bits)?);
if leading_bits == 0 {
i += 1;
} else {
i += leading_bits;
}
}
Ok(string)
}
fn get_surrogate(&self, i: usize) -> Result<char> {
if i + 6 > self.data.len() {
return Err(Error::InvalidStringEncoding(
"Found surogate byte, but not enought bytes left to form a surogate pair".into(),
));
}
if self.data[i] != SURROGATE_BYTE {
let byte = self.data[i];
return Err(Error::InvalidStringEncoding(format!(
"data[{i}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b1)"
)));
}
if self.data[i + 3] != SURROGATE_BYTE {
let j = i + 3;
let byte = self.data[j];
return Err(Error::InvalidStringEncoding(format!(
"data[{j}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b4)"
)));
}
if (self.data[i + 1] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_1_PREFIX {
let j = i + 1;
let byte = self.data[j];
return Err(Error::InvalidStringEncoding(format!(
"data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_1_PREFIX:02x} (First surogate byte prefix, b2)"
)));
}
if (self.data[i + 4] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_2_PREFIX {
let j = i + 4;
let byte = self.data[j];
return Err(Error::InvalidStringEncoding(format!(
"data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_2_PREFIX:02x} (Second surogate byte prefix, b5)"
)));
}
if (self.data[i + 2] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
let j = i + 2;
let byte = self.data[j];
return Err(Error::InvalidStringEncoding(format!(
"data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailing byte prefix, b3)"
)));
}
if (self.data[i + 5] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
let j = i + 5;
let byte = self.data[j];
return Err(Error::InvalidStringEncoding(format!(
"data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailling byte prefix, b6)"
)));
}
let mut surogated_hight = ((self.data[i + 1] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
surogated_hight |= (self.data[i + 2] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
let mut surogated_low = ((self.data[i + 4] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
surogated_low |= (self.data[i + 5] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
let code_point = 0x10000 | (surogated_hight << 10) | (surogated_low);
let slice = &self.data[i..(i + 6)];
char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!(
"Invalide unicode code point in surrogated pair: {slice:02x?}: {code_point}"
)))
}
fn get_non_surogated(&self, i: usize, leading_bits: usize) -> Result<char> {
if leading_bits == 0 {
let byte = self.data[i];
char::from_u32(byte as u32).ok_or(Error::InvalidStringEncoding(format!(
"Invalide unicode code point: {byte}"
)))
} else {
if i + leading_bits > self.data.len() {
return Err(Error::InvalidStringEncoding(format!(
"Found a {leading_bits} long code point at {i} but not enought bytes"
)));
}
let mut code_point = (self.data[0] % (1 << (7 - leading_bits))) as u32;
let slice = &self.data[i..(i + leading_bits)];
for s in 1..leading_bits {
let j = i + s;
let byte = self.data[j];
if (byte & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
return Err(Error::InvalidStringEncoding(format!(
"Invalid byte {byte:02x} at {j} in {leading_bits} bytes long code point {slice:02x?}"
)));
}
code_point <<= 6;
code_point |= (byte & !MASK_TRAYLING_BYTE_PREFIX) as u32;
}
char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!(
"Invalide unicode code point: {code_point} ({slice:02x?})"
)))
}
}
}
#[cfg(test)]
mod test {
use super::*;
/// Regression tests for the bugs reported against <https://github.com/TkTech/mutf8>
/// (see <https://github.com/TkTech/mutf8/blob/master/tests/test_bugs.py>):
/// strings made of 3 byte characters must survive an encode/decode round trip.
#[test]
fn bug_tktech_mutf8() {
    let strings = [
        // Issue 1
        "[가 나 다 라 마 바 사 아 자 차 카 타 파 하]",
        // Issue 3
        "黑人抬棺組裝包",
    ];
    for &string in strings.iter() {
        let encoded: StringDataItem = string.into();
        let decoded: String = encoded.try_into().unwrap();
        assert_eq!(decoded, string);
    }
}
/// Malformed inputs taken from the <https://github.com/TkTech/mutf8> test suite:
/// a raw null byte and lone leading bytes of multi-byte sequences must all be
/// rejected by the decoder.
#[test]
fn test_tktech_bad_mutf8() {
    let lone_bytes: [u8; 4] = [0x00, 0xC2, 0xED, 0xE2];
    for byte in lone_bytes.iter() {
        let item = StringDataItem {
            utf16_size: Uleb128(0),
            data: vec![*byte],
        };
        assert!(TryInto::<String>::try_into(item).is_err());
    }
}
/// Two byte sequences taken from the <https://github.com/TkTech/mutf8> test suite.
///
/// Every `(code point, bytes)` pair is checked in both directions: the bytes
/// must decode to the code point, and the code point must encode to the bytes.
#[test]
fn test_tktech_2_bytes_mutf8() {
    const CASES: &[(u32, [u8; 2])] = &[
        (0x0080, [0xc2, 0x80]),
        (0x0081, [0xc2, 0x81]),
        (0x0082, [0xc2, 0x82]),
        (0x0084, [0xc2, 0x84]),
        (0x0088, [0xc2, 0x88]),
        (0x0090, [0xc2, 0x90]),
        (0x00a0, [0xc2, 0xa0]),
        (0x00c0, [0xc3, 0x80]),
        (0x0180, [0xc6, 0x80]),
        (0x0280, [0xca, 0x80]),
        (0x0480, [0xd2, 0x80]),
        (0x0481, [0xd2, 0x81]),
        (0x0483, [0xd2, 0x83]),
        (0x0487, [0xd2, 0x87]),
        (0x048f, [0xd2, 0x8f]),
        (0x049f, [0xd2, 0x9f]),
        (0x04af, [0xd2, 0xaf]),
        (0x04bf, [0xd2, 0xbf]),
        (0x04ff, [0xd3, 0xbf]),
        (0x05ff, [0xd7, 0xbf]),
        (0x05ff, [0xd7, 0xbf]),
        (0x07ff, [0xdf, 0xbf]),
    ];
    for (code_point, bytes) in CASES {
        let item = StringDataItem {
            utf16_size: Uleb128(1),
            data: bytes.to_vec(),
        };
        let text = String::from(char::from_u32(*code_point).unwrap());
        let decoded = TryInto::<String>::try_into(&item).unwrap();
        assert_eq!(decoded, text);
        assert_eq!(item, StringDataItem::from(text.as_str()));
    }
}
/// Three byte sequences taken from the <https://github.com/TkTech/mutf8> test suite.
///
/// Every `(code point, bytes)` pair is checked in both directions: the bytes
/// must decode to the code point, and the code point must encode to the bytes.
#[test]
fn test_tktech_3_bytes_mutf8() {
    const CASES: &[(u32, [u8; 3])] = &[
        (0x0800, [0xe0, 0xa0, 0x80]),
        (0x0801, [0xe0, 0xa0, 0x81]),
        (0x0802, [0xe0, 0xa0, 0x82]),
        (0x0804, [0xe0, 0xa0, 0x84]),
        (0x0808, [0xe0, 0xa0, 0x88]),
        (0x0810, [0xe0, 0xa0, 0x90]),
        (0x0820, [0xe0, 0xa0, 0xa0]),
        (0x0840, [0xe0, 0xa1, 0x80]),
        (0x0880, [0xe0, 0xa2, 0x80]),
        (0x0900, [0xe0, 0xa4, 0x80]),
        (0x0a00, [0xe0, 0xa8, 0x80]),
        (0x0c00, [0xe0, 0xb0, 0x80]),
        (0x1800, [0xe1, 0xa0, 0x80]),
        (0x2800, [0xe2, 0xa0, 0x80]),
        (0x4800, [0xe4, 0xa0, 0x80]),
        (0x8800, [0xe8, 0xa0, 0x80]),
        (0x8801, [0xe8, 0xa0, 0x81]),
        (0x8803, [0xe8, 0xa0, 0x83]),
        (0x8807, [0xe8, 0xa0, 0x87]),
        (0x880f, [0xe8, 0xa0, 0x8f]),
        (0x881f, [0xe8, 0xa0, 0x9f]),
        (0x883f, [0xe8, 0xa0, 0xbf]),
        (0x887f, [0xe8, 0xa1, 0xbf]),
        (0x88ff, [0xe8, 0xa3, 0xbf]),
        (0x89ff, [0xe8, 0xa7, 0xbf]),
        (0x8bff, [0xe8, 0xaf, 0xbf]),
        (0x8fff, [0xe8, 0xbf, 0xbf]),
        (0x9fff, [0xe9, 0xbf, 0xbf]),
        (0xbfff, [0xeb, 0xbf, 0xbf]),
        (0xffff, [0xef, 0xbf, 0xbf]),
    ];
    for (code_point, bytes) in CASES {
        let item = StringDataItem {
            utf16_size: Uleb128(1),
            data: bytes.to_vec(),
        };
        let text = String::from(char::from_u32(*code_point).unwrap());
        let decoded = TryInto::<String>::try_into(&item).unwrap();
        assert_eq!(decoded, text);
        assert_eq!(item, StringDataItem::from(text.as_str()));
    }
}
/// Six byte sequences taken from the <https://github.com/TkTech/mutf8> test suite.
///
/// Every `(code point, bytes)` pair is checked in both directions: the bytes
/// must decode to the code point, and the code point must encode to the bytes.
// NOTE(review): these expectations put bits 16..=19 of the code point directly
// in the second byte (U+10000 -> 0xA1) and use a `utf16_size` of 1. The JVM
// definition of the 6 byte form would presumably give 0xA0 and a size of 2 —
// confirm against the specification.
#[test]
fn test_tktech_6_bytes_mutf8() {
    const CASES: &[(u32, [u8; 6])] = &[
        (0x10000, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x80]),
        (0x10001, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x81]),
        (0x10002, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x82]),
        (0x10004, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x84]),
        (0x10008, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x88]),
        (0x10010, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x90]),
        (0x10020, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0xa0]),
        (0x10040, [0xed, 0xa1, 0x80, 0xed, 0xb1, 0x80]),
        (0x10080, [0xed, 0xa1, 0x80, 0xed, 0xb2, 0x80]),
        (0x10100, [0xed, 0xa1, 0x80, 0xed, 0xb4, 0x80]),
        (0x10200, [0xed, 0xa1, 0x80, 0xed, 0xb8, 0x80]),
        (0x10400, [0xed, 0xa1, 0x81, 0xed, 0xb0, 0x80]),
        (0x10800, [0xed, 0xa1, 0x82, 0xed, 0xb0, 0x80]),
        (0x11000, [0xed, 0xa1, 0x84, 0xed, 0xb0, 0x80]),
        (0x12000, [0xed, 0xa1, 0x88, 0xed, 0xb0, 0x80]),
        (0x14000, [0xed, 0xa1, 0x90, 0xed, 0xb0, 0x80]),
        (0x18000, [0xed, 0xa1, 0xa0, 0xed, 0xb0, 0x80]),
        (0x30000, [0xed, 0xa3, 0x80, 0xed, 0xb0, 0x80]),
        (0x50000, [0xed, 0xa5, 0x80, 0xed, 0xb0, 0x80]),
        (0x90000, [0xed, 0xa9, 0x80, 0xed, 0xb0, 0x80]),
        (0x10003, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x83]),
        (0x10007, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x87]),
        (0x1000f, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x8f]),
        (0x1001f, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0x9f]),
        (0x1003f, [0xed, 0xa1, 0x80, 0xed, 0xb0, 0xbf]),
        (0x1007f, [0xed, 0xa1, 0x80, 0xed, 0xb1, 0xbf]),
        (0x100ff, [0xed, 0xa1, 0x80, 0xed, 0xb3, 0xbf]),
        (0x101ff, [0xed, 0xa1, 0x80, 0xed, 0xb7, 0xbf]),
        (0x103ff, [0xed, 0xa1, 0x80, 0xed, 0xbf, 0xbf]),
        (0x107ff, [0xed, 0xa1, 0x81, 0xed, 0xbf, 0xbf]),
        (0x10fff, [0xed, 0xa1, 0x83, 0xed, 0xbf, 0xbf]),
        (0x11fff, [0xed, 0xa1, 0x87, 0xed, 0xbf, 0xbf]),
        (0x13fff, [0xed, 0xa1, 0x8f, 0xed, 0xbf, 0xbf]),
        (0x17fff, [0xed, 0xa1, 0x9f, 0xed, 0xbf, 0xbf]),
        (0x1ffff, [0xed, 0xa1, 0xbf, 0xed, 0xbf, 0xbf]),
        (0x3ffff, [0xed, 0xa3, 0xbf, 0xed, 0xbf, 0xbf]),
        (0x7ffff, [0xed, 0xa7, 0xbf, 0xed, 0xbf, 0xbf]),
        (0xfffff, [0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf]),
    ];
    for (code_point, bytes) in CASES {
        let item = StringDataItem {
            utf16_size: Uleb128(1),
            data: bytes.to_vec(),
        };
        let text = String::from(char::from_u32(*code_point).unwrap());
        let decoded = TryInto::<String>::try_into(&item).unwrap();
        assert_eq!(decoded, text);
        assert_eq!(item, StringDataItem::from(text.as_str()));
    }
}
}

View file

@ -78,12 +78,12 @@ pub fn derive_serializable(input: proc_macro::TokenStream) -> proc_macro::TokenS
let implem_size = get_implem_size(&input.data, &params); let implem_size = get_implem_size(&input.data, &params);
let expanded = quote! { let expanded = quote! {
impl androscalpel_serializer::Serializable for #name { impl androscalpel_serializer::Serializable for #name {
#[allow(clippy::single_element_loop)] #[allow(clippy::single_element_loop, clippy::let_and_return)]
fn serialize(&self, output: &mut dyn std::io::Write) -> androscalpel_serializer::Result<()> { fn serialize(&self, output: &mut dyn std::io::Write) -> androscalpel_serializer::Result<()> {
#implem_serialize #implem_serialize
} }
#[allow(clippy::single_element_loop)] #[allow(clippy::single_element_loop, clippy::let_and_return)]
fn deserialize(input: &mut dyn androscalpel_serializer::ReadSeek) -> androscalpel_serializer::Result<Self> { fn deserialize(input: &mut dyn androscalpel_serializer::ReadSeek) -> androscalpel_serializer::Result<Self> {
#implem_deserialize #implem_deserialize
} }