diff --git a/androscalpel_serializer/src/core/mod.rs b/androscalpel_serializer/src/core/mod.rs
index 89e9b3c..3729230 100644
--- a/androscalpel_serializer/src/core/mod.rs
+++ b/androscalpel_serializer/src/core/mod.rs
@@ -6,13 +6,16 @@ use std::io::{Cursor, Read, Seek, SeekFrom, Write};
 pub use androscalpel_serializer_derive::*;
 pub mod leb;
+pub mod string;
 
 pub use leb::*;
+pub use string::*;
 
 #[derive(Debug, PartialEq, Eq)]
 pub enum Error {
     InputTooSmall(String), // TODO: find a better name
     SerializationError(String),
     DeserializationError(String),
+    InvalidStringEncoding(String),
 }
 
 pub type Result<T> = core::result::Result<T, Error>;
@@ -26,6 +29,7 @@ impl std::fmt::Display for Error {
             Self::InputTooSmall(msg) => write!(f, "Error: {}", msg),
             Self::SerializationError(msg) => write!(f, "Error: {}", msg),
             Self::DeserializationError(msg) => write!(f, "Error: {}", msg),
+            Self::InvalidStringEncoding(msg) => write!(f, "Error: {}", msg),
         }
     }
 }
diff --git a/androscalpel_serializer/src/core/string.rs b/androscalpel_serializer/src/core/string.rs
new file mode 100644
index 0000000..a7cf187
--- /dev/null
+++ b/androscalpel_serializer/src/core/string.rs
@@ -0,0 +1,450 @@
+//! The string representation, encoded in MUTF-8
+//! https://source.android.com/docs/core/runtime/dex-format#mutf-8
+//!
+//! The encoding of code points in MUTF-8 is as follows:
+//!
+//! | Number of bytes | First code point | Last code point | Bits | Byte 1   | Byte 2   | Byte 3   | Byte 4   | Byte 5   | Byte 6   |
+//! |-----------------|------------------|-----------------|------|----------|----------|----------|----------|----------|----------|
+//! | 2               | U+0000           | U+0000          | -    | 11000000 | 10000000 |          |          |          |          |
+//! | 1               | U+0001           | U+007F          | 7    | 0xxxxxxx |          |          |          |          |          |
+//! | 2               | U+0080           | U+07FF          | 11   | 110xxxxx | 10xxxxxx |          |          |          |          |
+//! | 3               | U+0800           | U+FFFF          | 16   | 1110xxxx | 10xxxxxx | 10xxxxxx |          |          |          |
+//! | 6               | U+10000          | U+10FFFF        | 20   | 11101101 | 1010xxxx | 10xxxxxx | 11101101 | 1011xxxx | 10xxxxxx |
+//!
+//! For the 6 bytes form, the 20 bits are those of `code point - 0x10000`: that value is split
+//! into a UTF-16 surrogate pair and each surrogate is stored with the 3 bytes form.
+
+use crate as androscalpel_serializer;
+use crate::core::*;
+pub use androscalpel_serializer_derive::*;
+
+/// [string-data-item](https://source.android.com/docs/core/runtime/dex-format#string-data-item)
+///
+/// `utf16_size` is the size of the string in UTF-16 code units, `data` holds its MUTF-8 bytes
+/// (decoding rejects any `0x00` byte found in `data`).
+#[derive(Serializable, PartialEq, Eq, Debug)]
+pub struct StringDataItem {
+    pub utf16_size: Uleb128,
+    #[until(u8, u8, 0x00u8)]
+    pub data: Vec<u8>,
+}
+
+/// Null byte: it is rejected when found inside the encoded string.
+const TERMINATION_BYTE: u8 = 0;
+/// First byte of an encoded surrogate (also the first byte of U+D000..=U+D7FF).
+const SURROGATE_BYTE: u8 = 0b1110_1101;
+/// Mask selecting the prefix of the second byte of an encoded surrogate.
+const MASK_SURROGATED_BYTE_PREFIX: u8 = 0b1111_0000;
+/// Prefix of the second byte of the first (high) surrogate of a pair.
+const VALUE_SURROGATED_BYTE_1_PREFIX: u8 = 0b1010_0000;
+/// Prefix of the second byte of the second (low) surrogate of a pair.
+const VALUE_SURROGATED_BYTE_2_PREFIX: u8 = 0b1011_0000;
+/// Mask selecting the prefix of a trailing byte.
+const MASK_TRAYLING_BYTE_PREFIX: u8 = 0b1100_0000;
+/// Prefix of a trailing byte (`10xxxxxx`).
+const VALUE_TRAYLING_BYTE_PREFIX: u8 = 0b1000_0000;
+
+impl TryFrom<&StringDataItem> for String {
+    type Error = Error;
+    fn try_from(item: &StringDataItem) -> Result<Self> {
+        item.get_string()
+    }
+}
+impl TryFrom<StringDataItem> for String {
+    type Error = Error;
+    fn try_from(item: StringDataItem) -> Result<Self> {
+        item.get_string()
+    }
+}
+
+impl From<&str> for StringDataItem {
+    /// Encode `string` in MUTF-8 and count its UTF-16 code units.
+    fn from(string: &str) -> Self {
+        // Trailing byte (`10xxxxxx`) carrying the 6 low bits of `bits`.
+        let trailing_byte =
+            |bits: u32| VALUE_TRAYLING_BYTE_PREFIX | (bits as u8 & !MASK_TRAYLING_BYTE_PREFIX);
+        // Second byte of an encoded surrogate: `prefix` then bits 6 to 9 of `surrogate`.
+        let surrogated_byte = |prefix: u8, surrogate: u32| {
+            prefix | ((surrogate >> 6) as u8 & !MASK_SURROGATED_BYTE_PREFIX)
+        };
+
+        // A char never takes fewer bytes in MUTF-8 than in UTF-8.
+        let mut data = Vec::with_capacity(string.len());
+        let mut utf16_size = 0;
+        for chr in string.chars() {
+            let code_point: u32 = chr.into();
+            if code_point == 0 {
+                // U+0000 uses the 2 bytes form so that `data` never contains a null byte.
+                data.extend_from_slice(&[0b1100_0000, 0b1000_0000]);
+            } else if code_point <= 0x7F {
+                data.push(code_point as u8);
+            } else if code_point <= 0x7FF {
+                data.push(0b1100_0000 | (((code_point >> 6) & 0b0001_1111) as u8));
+                data.push(trailing_byte(code_point));
+            } else if code_point <= 0xFFFF {
+                data.push(0b1110_0000 | (((code_point >> 12) & 0b0000_1111) as u8));
+                data.push(trailing_byte(code_point >> 6));
+                data.push(trailing_byte(code_point));
+            } else {
+                // A `char` is at most U+10FFFF, so `offset` always fits in 20 bits:
+                // 10 bits for the high surrogate, 10 bits for the low one.
+                let offset = code_point - 0x1_0000;
+                let (high, low) = (offset >> 10, offset & 0x3FF);
+                data.push(SURROGATE_BYTE);
+                data.push(surrogated_byte(VALUE_SURROGATED_BYTE_1_PREFIX, high));
+                data.push(trailing_byte(high));
+                data.push(SURROGATE_BYTE);
+                data.push(surrogated_byte(VALUE_SURROGATED_BYTE_2_PREFIX, low));
+                data.push(trailing_byte(low));
+            }
+            // Code points above U+FFFF take two UTF-16 code units (a surrogate pair).
+            utf16_size += if code_point > 0xFFFF { 2 } else { 1 };
+        }
+        Self {
+            utf16_size: Uleb128(utf16_size),
+            data,
+        }
+    }
+}
+
+impl StringDataItem {
+    /// Decode the MUTF-8 bytes of `data` into a [`String`].
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InvalidStringEncoding`] when `data` contains a null byte, a truncated
+    /// or malformed sequence, or a sequence that does not decode to a valid `char`.
+    fn get_string(&self) -> Result<String> {
+        let mut string = String::new();
+        let mut i = 0;
+        while i < self.data.len() {
+            let byte = self.data[i];
+            if byte == TERMINATION_BYTE {
+                return Err(Error::InvalidStringEncoding(
+                    "String should not contains null bytes".into(),
+                ));
+            }
+            if byte == SURROGATE_BYTE {
+                match self.get_surrogate(i) {
+                    Ok(chr) => {
+                        string.push(chr);
+                        i += 6;
+                        continue;
+                    }
+                    Err(err) if i + 3 > self.data.len() => {
+                        return Err(err);
+                    }
+                    // Else, it may be a 3 bytes character...
+                    Err(_) => (),
+                }
+            }
+            // The number of leading ones of the first byte of a sequence is the length of
+            // the sequence (0 for a single byte character).
+            let leading_bits = byte.leading_ones() as usize;
+            if leading_bits == 1 || leading_bits > 3 {
+                return Err(Error::InvalidStringEncoding(format!(
+                    "data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
+                )));
+            }
+            string.push(self.get_non_surogated(i, leading_bits)?);
+            i += leading_bits.max(1);
+        }
+        Ok(string)
+    }
+
+    /// Decode the surrogate pair (6 bytes) starting at `data[i]`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InvalidStringEncoding`] when fewer than 6 bytes are left or when one
+    /// of them does not have the prefix expected in a surrogate pair.
+    fn get_surrogate(&self, i: usize) -> Result<char> {
+        if i + 6 > self.data.len() {
+            return Err(Error::InvalidStringEncoding(
+                "Found surogate byte, but not enought bytes left to form a surogate pair".into(),
+            ));
+        }
+        if self.data[i] != SURROGATE_BYTE {
+            let byte = self.data[i];
+            return Err(Error::InvalidStringEncoding(format!(
+                "data[{i}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b1)"
+            )));
+        }
+        if self.data[i + 3] != SURROGATE_BYTE {
+            let j = i + 3;
+            let byte = self.data[j];
+            return Err(Error::InvalidStringEncoding(format!(
+                "data[{j}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b4)"
+            )));
+        }
+        if (self.data[i + 1] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_1_PREFIX {
+            let j = i + 1;
+            let byte = self.data[j];
+            return Err(Error::InvalidStringEncoding(format!(
+                "data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_1_PREFIX:02x} (First surogate byte prefix, b2)"
+            )));
+        }
+        if (self.data[i + 4] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_2_PREFIX {
+            let j = i + 4;
+            let byte = self.data[j];
+            return Err(Error::InvalidStringEncoding(format!(
+                "data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_2_PREFIX:02x} (Second surogate byte prefix, b5)"
+            )));
+        }
+        if (self.data[i + 2] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
+            let j = i + 2;
+            let byte = self.data[j];
+            return Err(Error::InvalidStringEncoding(format!(
+                "data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailing byte prefix, b3)"
+            )));
+        }
+        if (self.data[i + 5] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
+            let j = i + 5;
+            let byte = self.data[j];
+            return Err(Error::InvalidStringEncoding(format!(
+                "data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailling byte prefix, b6)"
+            )));
+        }
+
+        let mut surogated_hight = ((self.data[i + 1] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
+        surogated_hight |= (self.data[i + 2] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
+        let mut surogated_low = ((self.data[i + 4] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
+        surogated_low |= (self.data[i + 5] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
+        // The pair holds the 20 bits of `code point - 0x10000`, hence the addition.
+        let code_point = 0x1_0000 + ((surogated_hight << 10) | surogated_low);
+
+        let slice = &self.data[i..(i + 6)];
+        char::from_u32(code_point).ok_or_else(|| {
+            Error::InvalidStringEncoding(format!(
+                "Invalide unicode code point in surrogated pair: {slice:02x?}: {code_point}"
+            ))
+        })
+    }
+
+    /// Decode the 1, 2 or 3 bytes long character starting at `data[i]`.
+    ///
+    /// `leading_bits` is the number of leading ones of `data[i]`: 0 for a single byte
+    /// character, else the number of bytes of the sequence.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InvalidStringEncoding`] when the sequence is truncated, when a trailing
+    /// byte does not start with `10` or when the value is not a valid `char`.
+    fn get_non_surogated(&self, i: usize, leading_bits: usize) -> Result<char> {
+        if leading_bits == 0 {
+            let byte = self.data[i];
+            char::from_u32(byte as u32).ok_or_else(|| {
+                Error::InvalidStringEncoding(format!("Invalide unicode code point: {byte}"))
+            })
+        } else {
+            if i + leading_bits > self.data.len() {
+                return Err(Error::InvalidStringEncoding(format!(
+                    "Found a {leading_bits} long code point at {i} but not enought bytes"
+                )));
+            }
+            // Payload of the first byte: the bits after the length marker and its closing 0.
+            let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32;
+            let slice = &self.data[i..(i + leading_bits)];
+            for s in 1..leading_bits {
+                let j = i + s;
+                let byte = self.data[j];
+                if (byte & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
+                    return Err(Error::InvalidStringEncoding(format!(
+                        "Invalid byte {byte:02x} at {j} in {leading_bits} bytes long code point {slice:02x?}"
+                    )));
+                }
+                code_point <<= 6;
+                code_point |= (byte & !MASK_TRAYLING_BYTE_PREFIX) as u32;
+            }
+            char::from_u32(code_point).ok_or_else(|| {
+                Error::InvalidStringEncoding(format!(
+                    "Invalide unicode code point: {code_point} ({slice:02x?})"
+                ))
+            })
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
+    /// https://github.com/TkTech/mutf8/blob/master/tests/test_bugs.py
+    #[test]
+    fn bug_tktech_mutf8() {
+        let strings = ["[가 나 다 라 마 바 사 아 자 차 카 타 파 하]", "黑人抬棺組裝包"];
+        for string in strings {
+            let encoded: StringDataItem = string.into();
+            let decoded: String = encoded.try_into().unwrap();
+            assert_eq!(decoded, string);
+        }
+    }
+
+    /// Strings mixing all the encoding lengths must survive a round trip, and `utf16_size`
+    /// must count two code units for each code point above U+FFFF.
+    #[test]
+    fn mixed_length_round_trip() {
+        let string = "é\u{0}a가\u{1F600}\u{10FFFF}";
+        let encoded: StringDataItem = string.into();
+        // One code unit for each of the 4 first chars, two for each of the 2 last ones.
+        assert_eq!(encoded.utf16_size, Uleb128(8));
+        let decoded: String = encoded.try_into().unwrap();
+        assert_eq!(decoded, string);
+    }
+
+    /// Test from https://github.com/TkTech/mutf8/tree/master, test for bad encoding
+    #[test]
+    fn test_tktech_bad_mutf8() {
+        for data in [vec![0x00], vec![0xC2], vec![0xED], vec![0xE2]] {
+            let encoded = StringDataItem {
+                utf16_size: Uleb128(0),
+                data,
+            };
+            assert!(TryInto::<String>::try_into(encoded).is_err());
+        }
+    }
+
+    /// Test from https://github.com/TkTech/mutf8/tree/master, test 2 bytes
+    #[test]
+    fn test_tktech_2_bytes_mutf8() {
+        let tests = vec![
+            (0x0080, vec![0xc2, 0x80]),
+            (0x0081, vec![0xc2, 0x81]),
+            (0x0082, vec![0xc2, 0x82]),
+            (0x0084, vec![0xc2, 0x84]),
+            (0x0088, vec![0xc2, 0x88]),
+            (0x0090, vec![0xc2, 0x90]),
+            (0x00a0, vec![0xc2, 0xa0]),
+            (0x00c0, vec![0xc3, 0x80]),
+            (0x0180, vec![0xc6, 0x80]),
+            (0x0280, vec![0xca, 0x80]),
+            (0x0480, vec![0xd2, 0x80]),
+            (0x0481, vec![0xd2, 0x81]),
+            (0x0483, vec![0xd2, 0x83]),
+            (0x0487, vec![0xd2, 0x87]),
+            (0x048f, vec![0xd2, 0x8f]),
+            (0x049f, vec![0xd2, 0x9f]),
+            (0x04af, vec![0xd2, 0xaf]),
+            (0x04bf, vec![0xd2, 0xbf]),
+            (0x04ff, vec![0xd3, 0xbf]),
+            (0x05ff, vec![0xd7, 0xbf]),
+            (0x05ff, vec![0xd7, 0xbf]),
+            (0x07ff, vec![0xdf, 0xbf]),
+        ];
+        for (code_point, data) in tests {
+            let encoded = StringDataItem {
+                utf16_size: Uleb128(1),
+                data,
+            };
+            let expected = char::from_u32(code_point).unwrap().to_string();
+            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
+            assert_eq!(encoded, expected.as_str().into());
+        }
+    }
+
+    /// Test from https://github.com/TkTech/mutf8/tree/master, test 3 bytes
+    #[test]
+    fn test_tktech_3_bytes_mutf8() {
+        let tests = vec![
+            (0x0800, vec![0xe0, 0xa0, 0x80]),
+            (0x0801, vec![0xe0, 0xa0, 0x81]),
+            (0x0802, vec![0xe0, 0xa0, 0x82]),
+            (0x0804, vec![0xe0, 0xa0, 0x84]),
+            (0x0808, vec![0xe0, 0xa0, 0x88]),
+            (0x0810, vec![0xe0, 0xa0, 0x90]),
+            (0x0820, vec![0xe0, 0xa0, 0xa0]),
+            (0x0840, vec![0xe0, 0xa1, 0x80]),
+            (0x0880, vec![0xe0, 0xa2, 0x80]),
+            (0x0900, vec![0xe0, 0xa4, 0x80]),
+            (0x0a00, vec![0xe0, 0xa8, 0x80]),
+            (0x0c00, vec![0xe0, 0xb0, 0x80]),
+            (0x1800, vec![0xe1, 0xa0, 0x80]),
+            (0x2800, vec![0xe2, 0xa0, 0x80]),
+            (0x4800, vec![0xe4, 0xa0, 0x80]),
+            (0x8800, vec![0xe8, 0xa0, 0x80]),
+            (0x8801, vec![0xe8, 0xa0, 0x81]),
+            (0x8803, vec![0xe8, 0xa0, 0x83]),
+            (0x8807, vec![0xe8, 0xa0, 0x87]),
+            (0x880f, vec![0xe8, 0xa0, 0x8f]),
+            (0x881f, vec![0xe8, 0xa0, 0x9f]),
+            (0x883f, vec![0xe8, 0xa0, 0xbf]),
+            (0x887f, vec![0xe8, 0xa1, 0xbf]),
+            (0x88ff, vec![0xe8, 0xa3, 0xbf]),
+            (0x89ff, vec![0xe8, 0xa7, 0xbf]),
+            (0x8bff, vec![0xe8, 0xaf, 0xbf]),
+            (0x8fff, vec![0xe8, 0xbf, 0xbf]),
+            (0x9fff, vec![0xe9, 0xbf, 0xbf]),
+            (0xbfff, vec![0xeb, 0xbf, 0xbf]),
+            (0xffff, vec![0xef, 0xbf, 0xbf]),
+        ];
+        for (code_point, data) in tests {
+            let encoded = StringDataItem {
+                utf16_size: Uleb128(1),
+                data,
+            };
+            let expected = char::from_u32(code_point).unwrap().to_string();
+            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
+            assert_eq!(encoded, expected.as_str().into());
+        }
+    }
+
+    /// Test adapted from https://github.com/TkTech/mutf8/tree/master, test 6 bytes: the expected
+    /// bytes follow the surrogate pair encoding (`0xed 0xa0 0x80 0xed 0xb0 0x80` for U+10000).
+    #[test]
+    fn test_tktech_6_bytes_mutf8() {
+        let tests = vec![
+            (0x10000, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80]),
+            (0x10001, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x81]),
+            (0x10002, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x82]),
+            (0x10004, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x84]),
+            (0x10008, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x88]),
+            (0x10010, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x90]),
+            (0x10020, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0xa0]),
+            (0x10040, vec![0xed, 0xa0, 0x80, 0xed, 0xb1, 0x80]),
+            (0x10080, vec![0xed, 0xa0, 0x80, 0xed, 0xb2, 0x80]),
+            (0x10100, vec![0xed, 0xa0, 0x80, 0xed, 0xb4, 0x80]),
+            (0x10200, vec![0xed, 0xa0, 0x80, 0xed, 0xb8, 0x80]),
+            (0x10400, vec![0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80]),
+            (0x10800, vec![0xed, 0xa0, 0x82, 0xed, 0xb0, 0x80]),
+            (0x11000, vec![0xed, 0xa0, 0x84, 0xed, 0xb0, 0x80]),
+            (0x12000, vec![0xed, 0xa0, 0x88, 0xed, 0xb0, 0x80]),
+            (0x14000, vec![0xed, 0xa0, 0x90, 0xed, 0xb0, 0x80]),
+            (0x18000, vec![0xed, 0xa0, 0xa0, 0xed, 0xb0, 0x80]),
+            (0x30000, vec![0xed, 0xa2, 0x80, 0xed, 0xb0, 0x80]),
+            (0x50000, vec![0xed, 0xa4, 0x80, 0xed, 0xb0, 0x80]),
+            (0x90000, vec![0xed, 0xa8, 0x80, 0xed, 0xb0, 0x80]),
+            (0x10003, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x83]),
+            (0x10007, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x87]),
+            (0x1000f, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x8f]),
+            (0x1001f, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x9f]),
+            (0x1003f, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0xbf]),
+            (0x1007f, vec![0xed, 0xa0, 0x80, 0xed, 0xb1, 0xbf]),
+            (0x100ff, vec![0xed, 0xa0, 0x80, 0xed, 0xb3, 0xbf]),
+            (0x101ff, vec![0xed, 0xa0, 0x80, 0xed, 0xb7, 0xbf]),
+            (0x103ff, vec![0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf]),
+            (0x107ff, vec![0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbf]),
+            (0x10fff, vec![0xed, 0xa0, 0x83, 0xed, 0xbf, 0xbf]),
+            (0x11fff, vec![0xed, 0xa0, 0x87, 0xed, 0xbf, 0xbf]),
+            (0x13fff, vec![0xed, 0xa0, 0x8f, 0xed, 0xbf, 0xbf]),
+            (0x17fff, vec![0xed, 0xa0, 0x9f, 0xed, 0xbf, 0xbf]),
+            (0x1ffff, vec![0xed, 0xa0, 0xbf, 0xed, 0xbf, 0xbf]),
+            (0x3ffff, vec![0xed, 0xa2, 0xbf, 0xed, 0xbf, 0xbf]),
+            (0x7ffff, vec![0xed, 0xa6, 0xbf, 0xed, 0xbf, 0xbf]),
+            (0xfffff, vec![0xed, 0xae, 0xbf, 0xed, 0xbf, 0xbf]),
+            (0x1f600, vec![0xed, 0xa0, 0xbd, 0xed, 0xb8, 0x80]),
+            (0x100000, vec![0xed, 0xaf, 0x80, 0xed, 0xb0, 0x80]),
+            (0x10ffff, vec![0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf]),
+        ];
+        for (code_point, data) in tests {
+            let encoded = StringDataItem {
+                // Each of these code points takes a surrogate pair in UTF-16.
+                utf16_size: Uleb128(2),
+                data,
+            };
+            let expected = char::from_u32(code_point).unwrap().to_string();
+            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
+            assert_eq!(encoded, expected.as_str().into());
+        }
+    }
+}
diff --git a/androscalpel_serializer_derive/src/lib.rs b/androscalpel_serializer_derive/src/lib.rs
index bab4b98..26ea1a4 100644
--- a/androscalpel_serializer_derive/src/lib.rs
+++ b/androscalpel_serializer_derive/src/lib.rs
@@ -78,12 +78,12 @@ pub fn derive_serializable(input: proc_macro::TokenStream) -> proc_macro::TokenS
     let implem_size = get_implem_size(&input.data, &params);
     let expanded = quote! {
         impl androscalpel_serializer::Serializable for #name {
-            #[allow(clippy::single_element_loop)]
+            #[allow(clippy::single_element_loop, clippy::let_and_return)]
             fn serialize(&self, output: &mut dyn std::io::Write) -> androscalpel_serializer::Result<()> {
                 #implem_serialize
             }
 
-            #[allow(clippy::single_element_loop)]
+            #[allow(clippy::single_element_loop, clippy::let_and_return)]
             fn deserialize(input: &mut dyn androscalpel_serializer::ReadSeek) -> androscalpel_serializer::Result<Self> {
                 #implem_deserialize
             }