//! The string representation, encoded in MUTF-8 //! //! //! The encoding of codepoint in MUTF-8 is as following (table from //! ): //! //! | Number of bytes | First code point | Last code point | Bits | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Byte 6 | //! |-----------------|------------------|-----------------|------|----------|----------|----------|----------|----------|----------| //! | 2 | U+0000 | U+0000 | - | 11000000 | 10000000 | | | | | //! | 1 | U+0001 | U+007F | 7 | 0xxxxxxx | | | | | | //! | 2 | U+0080 | U+07FF | 11 | 110xxxxx | 10xxxxxx | | | | | //! | 3 | U+0800 | U+FFFF | 16 | 1110xxxx | 10xxxxxx | 10xxxxxx | | | | //! | 6 | U+10000 | U+FFFFF | 20 | 11101101 | 1010xxxx | 10xxxxxx | 11101101 | 1011xxxx | 10xxxxxx | use std::cmp::{Ord, Ordering, PartialOrd}; use crate as androscalpel_serializer; use crate::core::*; pub use androscalpel_serializer_derive::*; /// #[derive(Serializable, Clone, PartialEq, Eq, Debug)] pub struct StringDataItem { pub utf16_size: Uleb128, #[until(u8, u8, 0x00u8)] pub data: Vec, } const TERMINATION_BYTE: u8 = 0; const SURROGATE_BYTE: u8 = 0b1110_1101; const MASK_SURROGATED_BYTE_PREFIX: u8 = 0b1111_0000; const VALUE_SURROGATED_BYTE_1_PREFIX: u8 = 0b1010_0000; const VALUE_SURROGATED_BYTE_2_PREFIX: u8 = 0b1011_0000; const MASK_TRAYLING_BYTE_PREFIX: u8 = 0b1100_0000; const VALUE_TRAYLING_BYTE_PREFIX: u8 = 0b1000_0000; impl Ord for StringDataItem { fn cmp(&self, other: &Self) -> Ordering { self.get_aosp_utf16() .unwrap() .cmp(&other.get_aosp_utf16().unwrap()) .then(self.utf16_size.cmp(&other.utf16_size)) } } impl PartialOrd for StringDataItem { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl TryFrom<&StringDataItem> for String { type Error = Error; fn try_from(item: &StringDataItem) -> Result { item.get_string() } } impl TryFrom for String { type Error = Error; fn try_from(item: StringDataItem) -> Result { item.get_string() } } impl From<&str> for StringDataItem { fn from(string: &str) -> Self { let mut ret_data = vec![]; let mut data = vec![]; let mut size = 0; for chr in string.chars() { let code_point: u32 = chr.into(); if code_point == 0 { data.push(0b1100_0000); data.push(0b1000_0000); } else if code_point <= 0x7F { data.push(code_point as u8); } else if code_point <= 0x7FF { data.push(0b1100_0000 | (((code_point >> 6) & 0b0001_1111) as u8)); data.push( VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX), ); } else if code_point <= 0xFFFF { data.push(0b1110_0000 | (((code_point >> 12) & 0b0000_1111) as u8)); data.push( VALUE_TRAYLING_BYTE_PREFIX | (((code_point >> 6) & !(MASK_TRAYLING_BYTE_PREFIX as u32)) as u8), ); data.push( VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX), ); } else if code_point <= 0xFFFFF { data.push(SURROGATE_BYTE); data.push( VALUE_SURROGATED_BYTE_1_PREFIX | (((code_point >> 16) & !(MASK_SURROGATED_BYTE_PREFIX as u32)) as u8), ); data.push( VALUE_TRAYLING_BYTE_PREFIX | (((code_point >> 10) & !(MASK_TRAYLING_BYTE_PREFIX as u32)) as u8), ); data.push(SURROGATE_BYTE); data.push( VALUE_SURROGATED_BYTE_2_PREFIX | (((code_point >> 6) & !(MASK_SURROGATED_BYTE_PREFIX as u32)) as u8), ); data.push( VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX), ); } else { panic!("Code Point {code_point} for char {chr} is to big"); } for b in data { ret_data.push(b); } data = vec![]; size += 1; } Self { utf16_size: Uleb128(size), data: ret_data, } } } impl StringDataItem { /// Return the utf-16 string used by google in the aosp for comparaison & other. fn get_aosp_utf16(&self) -> Result> { let mut utf16_string = vec![]; let mut i = 0; while i < self.data.len() { let one = self.data[i] as u32; i += 1; if one & 0x80 == 0 { utf16_string.push(one); continue; } if i >= self.data.len() { return Err(Error::InvalidStringEncoding( "String contains invalid caracters".into(), )); } let two = self.data[i] as u32; i += 1; if one & 0x20 == 0 { utf16_string.push((one & 0x1f) << 6 | (two & 0x3f)); continue; } if i >= self.data.len() { return Err(Error::InvalidStringEncoding( "String contains invalid caracters".into(), )); } let three = self.data[i] as u32; i += 1; if one & 0x10 == 0 { utf16_string.push((one & 0x0f) << 12 | (two & 0x3f) << 6 | (three & 0x3f)); continue; } if i >= self.data.len() { return Err(Error::InvalidStringEncoding( "String contains invalid caracters".into(), )); } let four = self.data[i] as u32; i += 1; let code_point = (one & 0x0f) << 18 | (two & 0x3f) << 12 | (three & 0x3f) << 6 | (four & 0x3f); let mut pair = ((code_point >> 10) + 0xd7c0) & 0xffff; pair |= ((code_point & 0x03ff) + 0xdc00) << 16; utf16_string.push(pair); } Ok(utf16_string) } fn get_string(&self) -> Result { let mut string = String::new(); let mut i = 0; while i < self.data.len() { if self.data[i] == TERMINATION_BYTE { return Err(Error::InvalidStringEncoding( "String should not contains null bytes".into(), )); } if self.data[i] == SURROGATE_BYTE { let res = self.get_surrogate(i); match res { Ok(chr) => { string.push(chr); i += 6; continue; } Err(err) if i + 3 > self.data.len() => { return Err(err); } Err(_) => (), } // Else, it may be a 3 bytes character... } let mut leading_bits = 0; for j in 0..8 { if (self.data[i] & (1 << (7 - j))) != 0 { leading_bits += 1; } else { break; } } if leading_bits == 1 || leading_bits > 3 { let byte = self.data[i]; return Err(Error::InvalidStringEncoding(format!( "data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character" ))); } string.push(self.get_non_surogated(i, leading_bits)?); if leading_bits == 0 { i += 1; } else { i += leading_bits; } } Ok(string) } fn get_surrogate(&self, i: usize) -> Result { if i + 6 > self.data.len() { return Err(Error::InvalidStringEncoding( "Found surogate byte, but not enought bytes left to form a surogate pair".into(), )); } if self.data[i] != SURROGATE_BYTE { let byte = self.data[i]; return Err(Error::InvalidStringEncoding(format!( "data[{i}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b1)" ))); } if self.data[i + 3] != SURROGATE_BYTE { let j = i + 3; let byte = self.data[j]; return Err(Error::InvalidStringEncoding(format!( "data[{j}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b4)" ))); } if (self.data[i + 1] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_1_PREFIX { let j = i + 1; let byte = self.data[j]; return Err(Error::InvalidStringEncoding(format!( "data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_1_PREFIX:02x} (First surogate byte prefix, b2)" ))); } if (self.data[i + 4] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_2_PREFIX { let j = i + 4; let byte = self.data[j]; return Err(Error::InvalidStringEncoding(format!( "data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_2_PREFIX:02x} (Second surogate byte prefix, b5)" ))); } if (self.data[i + 2] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX { let j = i + 2; let byte = self.data[j]; return Err(Error::InvalidStringEncoding(format!( "data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailing byte prefix, b3)" ))); } if (self.data[i + 5] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX { let j = i + 5; let byte = self.data[j]; return Err(Error::InvalidStringEncoding(format!( "data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailling byte prefix, b6)" ))); } let mut surogated_hight = ((self.data[i + 1] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6; surogated_hight |= (self.data[i + 2] & !MASK_TRAYLING_BYTE_PREFIX) as u32; let mut surogated_low = ((self.data[i + 4] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6; surogated_low |= (self.data[i + 5] & !MASK_TRAYLING_BYTE_PREFIX) as u32; let code_point = 0x10000 | (surogated_hight << 10) | (surogated_low); let slice = &self.data[i..(i + 6)]; char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!( "Invalide unicode code point in surrogated pair: {slice:02x?}: {code_point}" ))) } fn get_non_surogated(&self, i: usize, leading_bits: usize) -> Result { if leading_bits == 0 { let byte = self.data[i]; char::from_u32(byte as u32).ok_or(Error::InvalidStringEncoding(format!( "Invalide unicode code point: {byte}" ))) } else { if i + leading_bits > self.data.len() { return Err(Error::InvalidStringEncoding(format!( "Found a {leading_bits} long code point at {i} but not enought bytes" ))); } let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32; let slice = &self.data[i..(i + leading_bits)]; for s in 1..leading_bits { let j = i + s; let byte = self.data[j]; if (byte & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX { return Err(Error::InvalidStringEncoding(format!( "Invalid byte {byte:02x} at {j} in {leading_bits} bytes long code point {slice:02x?}" ))); } code_point <<= 6; code_point |= (byte & !MASK_TRAYLING_BYTE_PREFIX) as u32; } char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!( "Invalide unicode code point: {code_point} ({slice:02x?})" ))) } } } #[cfg(test)] mod test { use super::*; /// Test for bug found in : /// #[test] fn bug_tktech_mutf8() { let string_issue_1 = "[가 나 다 라 마 바 사 아 자 차 카 타 파 하]"; let encoded: StringDataItem = string_issue_1.into(); let expected_encoded = StringDataItem { utf16_size: Uleb128(32), data: vec![ 0x5b, 0xea, 0xb0, 0x80, 0x20, 0xeb, 0x82, 0x98, 0x20, 0xeb, 0x8b, 0xa4, 0x20, 0xeb, 0x9d, 0xbc, 0x20, 0xeb, 0xa7, 0x88, 0x20, 0xeb, 0xb0, 0x94, 0x20, 0x20, 0xec, 0x82, 0xac, 0x20, 0x20, 0xec, 0x95, 0x84, 0x20, 0xec, 0x9e, 0x90, 0x20, 0x20, 0xec, 0xb0, 0xa8, 0x20, 0xec, 0xb9, 0xb4, 0x20, 0xed, 0x83, 0x80, 0x20, 0xed, 0x8c, 0x8c, 0x20, 0xed, 0x95, 0x98, 0x5d, ], }; assert_eq!(encoded, expected_encoded); let decoded: String = encoded.try_into().unwrap(); assert_eq!(&decoded, string_issue_1); let string_issue_3 = "黑人抬棺組裝包"; let expected_encoded = StringDataItem { utf16_size: Uleb128(7), data: vec![ 0xe9, 0xbb, 0x91, 0xe4, 0xba, 0xba, 0xe6, 0x8a, 0xac, 0xe6, 0xa3, 0xba, 0xe7, 0xb5, 0x84, 0xe8, 0xa3, 0x9d, 0xe5, 0x8c, 0x85, ], }; let encoded: StringDataItem = string_issue_3.into(); assert_eq!(encoded, expected_encoded); let decoded: String = encoded.try_into().unwrap(); assert_eq!(&decoded, string_issue_3); } /// Test from , test for bad encoding /// Test for bug found in : #[test] fn test_tktech_bad_mutf8() { assert!(TryInto::::try_into(StringDataItem { utf16_size: Uleb128(0), data: vec![0x00] }) .is_err()); assert!(TryInto::::try_into(StringDataItem { utf16_size: Uleb128(0), data: vec![0xC2] }) .is_err()); assert!(TryInto::::try_into(StringDataItem { utf16_size: Uleb128(0), data: vec![0xED] }) .is_err()); assert!(TryInto::::try_into(StringDataItem { utf16_size: Uleb128(0), data: vec![0xE2] }) .is_err()); } /// Test from , test 2 bytes /// Test for bug found in : #[test] fn test_tktech_2_bytes_mutf8() { let tests = vec![ (0x0080, vec![0xc2, 0x80]), (0x0081, vec![0xc2, 0x81]), (0x0082, vec![0xc2, 0x82]), (0x0084, vec![0xc2, 0x84]), (0x0088, vec![0xc2, 0x88]), (0x0090, vec![0xc2, 0x90]), (0x00a0, vec![0xc2, 0xa0]), (0x00c0, vec![0xc3, 0x80]), (0x0180, vec![0xc6, 0x80]), (0x0280, vec![0xca, 0x80]), (0x0480, vec![0xd2, 0x80]), (0x0481, vec![0xd2, 0x81]), (0x0483, vec![0xd2, 0x83]), (0x0487, vec![0xd2, 0x87]), (0x048f, vec![0xd2, 0x8f]), (0x049f, vec![0xd2, 0x9f]), (0x04af, vec![0xd2, 0xaf]), (0x04bf, vec![0xd2, 0xbf]), (0x04ff, vec![0xd3, 0xbf]), (0x05ff, vec![0xd7, 0xbf]), (0x05ff, vec![0xd7, 0xbf]), (0x07ff, vec![0xdf, 0xbf]), ]; for (code_point, data) in tests { let encoded = StringDataItem { utf16_size: Uleb128(1), data, }; let expected = char::from_u32(code_point).unwrap().to_string(); assert_eq!(TryInto::::try_into(&encoded).unwrap(), expected); assert_eq!(encoded, expected.as_str().into()); } } /// Test from , test 3 bytes /// Test for bug found in : #[test] fn test_tktech_3_bytes_mutf8() { let tests = vec![ (0x0800, vec![0xe0, 0xa0, 0x80]), (0x0801, vec![0xe0, 0xa0, 0x81]), (0x0802, vec![0xe0, 0xa0, 0x82]), (0x0804, vec![0xe0, 0xa0, 0x84]), (0x0808, vec![0xe0, 0xa0, 0x88]), (0x0810, vec![0xe0, 0xa0, 0x90]), (0x0820, vec![0xe0, 0xa0, 0xa0]), (0x0840, vec![0xe0, 0xa1, 0x80]), (0x0880, vec![0xe0, 0xa2, 0x80]), (0x0900, vec![0xe0, 0xa4, 0x80]), (0x0a00, vec![0xe0, 0xa8, 0x80]), (0x0c00, vec![0xe0, 0xb0, 0x80]), (0x1800, vec![0xe1, 0xa0, 0x80]), (0x2800, vec![0xe2, 0xa0, 0x80]), (0x4800, vec![0xe4, 0xa0, 0x80]), (0x8800, vec![0xe8, 0xa0, 0x80]), (0x8801, vec![0xe8, 0xa0, 0x81]), (0x8803, vec![0xe8, 0xa0, 0x83]), (0x8807, vec![0xe8, 0xa0, 0x87]), (0x880f, vec![0xe8, 0xa0, 0x8f]), (0x881f, vec![0xe8, 0xa0, 0x9f]), (0x883f, vec![0xe8, 0xa0, 0xbf]), (0x887f, vec![0xe8, 0xa1, 0xbf]), (0x88ff, vec![0xe8, 0xa3, 0xbf]), (0x89ff, vec![0xe8, 0xa7, 0xbf]), (0x8bff, vec![0xe8, 0xaf, 0xbf]), (0x8fff, vec![0xe8, 0xbf, 0xbf]), (0x9fff, vec![0xe9, 0xbf, 0xbf]), (0xbfff, vec![0xeb, 0xbf, 0xbf]), (0xffff, vec![0xef, 0xbf, 0xbf]), ]; for (code_point, data) in tests { let encoded = StringDataItem { utf16_size: Uleb128(1), data, }; let expected = char::from_u32(code_point).unwrap().to_string(); assert_eq!(TryInto::::try_into(&encoded).unwrap(), expected); assert_eq!(encoded, expected.as_str().into()); } } #[test] fn test_3_bytes_mutf8() { let tests = vec![(0xac00, vec![0xea, 0xb0, 0x80])]; for (code_point, data) in tests { let encoded = StringDataItem { utf16_size: Uleb128(1), data, }; let expected = char::from_u32(code_point).unwrap().to_string(); assert_eq!(TryInto::::try_into(&encoded).unwrap(), expected); assert_eq!(encoded, expected.as_str().into()); } } /// Test from , test 6 bytes /// Test for bug found in : #[test] fn test_tktech_6_bytes_mutf8() { let tests = vec![ (0x10000, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x80]), (0x10001, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x81]), (0x10002, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x82]), (0x10004, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x84]), (0x10008, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x88]), (0x10010, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x90]), (0x10020, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0xa0]), (0x10040, vec![0xed, 0xa1, 0x80, 0xed, 0xb1, 0x80]), (0x10080, vec![0xed, 0xa1, 0x80, 0xed, 0xb2, 0x80]), (0x10100, vec![0xed, 0xa1, 0x80, 0xed, 0xb4, 0x80]), (0x10200, vec![0xed, 0xa1, 0x80, 0xed, 0xb8, 0x80]), (0x10400, vec![0xed, 0xa1, 0x81, 0xed, 0xb0, 0x80]), (0x10800, vec![0xed, 0xa1, 0x82, 0xed, 0xb0, 0x80]), (0x11000, vec![0xed, 0xa1, 0x84, 0xed, 0xb0, 0x80]), (0x12000, vec![0xed, 0xa1, 0x88, 0xed, 0xb0, 0x80]), (0x14000, vec![0xed, 0xa1, 0x90, 0xed, 0xb0, 0x80]), (0x18000, vec![0xed, 0xa1, 0xa0, 0xed, 0xb0, 0x80]), (0x30000, vec![0xed, 0xa3, 0x80, 0xed, 0xb0, 0x80]), (0x50000, vec![0xed, 0xa5, 0x80, 0xed, 0xb0, 0x80]), (0x90000, vec![0xed, 0xa9, 0x80, 0xed, 0xb0, 0x80]), (0x10003, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x83]), (0x10007, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x87]), (0x1000f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x8f]), (0x1001f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x9f]), (0x1003f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0xbf]), (0x1007f, vec![0xed, 0xa1, 0x80, 0xed, 0xb1, 0xbf]), (0x100ff, vec![0xed, 0xa1, 0x80, 0xed, 0xb3, 0xbf]), (0x101ff, vec![0xed, 0xa1, 0x80, 0xed, 0xb7, 0xbf]), (0x103ff, vec![0xed, 0xa1, 0x80, 0xed, 0xbf, 0xbf]), (0x107ff, vec![0xed, 0xa1, 0x81, 0xed, 0xbf, 0xbf]), (0x10fff, vec![0xed, 0xa1, 0x83, 0xed, 0xbf, 0xbf]), (0x11fff, vec![0xed, 0xa1, 0x87, 0xed, 0xbf, 0xbf]), (0x13fff, vec![0xed, 0xa1, 0x8f, 0xed, 0xbf, 0xbf]), (0x17fff, vec![0xed, 0xa1, 0x9f, 0xed, 0xbf, 0xbf]), (0x1ffff, vec![0xed, 0xa1, 0xbf, 0xed, 0xbf, 0xbf]), (0x3ffff, vec![0xed, 0xa3, 0xbf, 0xed, 0xbf, 0xbf]), (0x7ffff, vec![0xed, 0xa7, 0xbf, 0xed, 0xbf, 0xbf]), (0xfffff, vec![0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf]), ]; for (code_point, data) in tests { let encoded = StringDataItem { utf16_size: Uleb128(1), data, }; let expected = char::from_u32(code_point).unwrap().to_string(); assert_eq!(TryInto::::try_into(&encoded).unwrap(), expected); assert_eq!(encoded, expected.as_str().into()); } } /// Apparently I don't know how to code an order relation so here it is... #[test] fn test_ord_relation() { let s1: StringDataItem = "Landroidx/lifecycle/WithLifecycleStateKt$suspendWithStateAtLeastUnchecked$2$observer$1;".into(); let s2: StringDataItem = "Lcom/google/android/material/search/SearchBarAnimationHelper$$ExternalSyntheticLambda4;".into(); let s1_utf16 = s1.get_aosp_utf16().unwrap(); let s2_utf16 = s2.get_aosp_utf16().unwrap(); assert_eq!(s1_utf16 < s2_utf16, true); assert_eq!(s1_utf16 == s2_utf16, false); assert_eq!(s1_utf16 > s2_utf16, false); assert_eq!(s2_utf16 < s1_utf16, false); assert_eq!(s2_utf16 == s1_utf16, false); assert_eq!(s2_utf16 > s1_utf16, true); assert_eq!(s1 < s2, true); assert_eq!(s1 == s2, false); assert_eq!(s1 > s2, false); assert_eq!(s2 < s1, false); assert_eq!(s2 == s1, false); assert_eq!(s2 > s1, true); } }