557 lines
23 KiB
Rust
557 lines
23 KiB
Rust
//! The string representation, encoded in MUTF-8
|
|
//! <https://source.android.com/docs/core/runtime/dex-format#mutf-8>
|
|
//!
|
|
//! The encoding of code points in MUTF-8 is as follows (table from
|
|
//! <https://py2jdbc.readthedocs.io/en/latest/mutf8.html>):
|
|
//!
|
|
//! | Number of bytes | First code point | Last code point | Bits | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Byte 6 |
|
|
//! |-----------------|------------------|-----------------|------|----------|----------|----------|----------|----------|----------|
|
|
//! | 2 | U+0000 | U+0000 | - | 11000000 | 10000000 | | | | |
|
|
//! | 1 | U+0001 | U+007F | 7 | 0xxxxxxx | | | | | |
|
|
//! | 2 | U+0080 | U+07FF | 11 | 110xxxxx | 10xxxxxx | | | | |
|
|
//! | 3 | U+0800 | U+FFFF | 16 | 1110xxxx | 10xxxxxx | 10xxxxxx | | | |
|
|
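//! | 6 | U+10000 | U+10FFFF | 20 | 11101101 | 1010xxxx | 10xxxxxx | 11101101 | 1011xxxx | 10xxxxxx |
//!
//! In the 6-byte form, the 20 `x` bits hold the code point **minus `0x10000`**: the two
//! 3-byte groups are the UTF-16 high and low surrogates of the character, each encoded
//! like a regular 3-byte value.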
|
|
|
|
use std::cmp::{Ord, Ordering, PartialOrd};
|
|
|
|
use crate as androscalpel_serializer;
|
|
use crate::core::*;
|
|
pub use androscalpel_serializer_derive::*;
|
|
|
|
/// <https://source.android.com/docs/core/runtime/dex-format#string-data-item>
|
|
/// One string of a dex file: a declared UTF-16 length followed by the MUTF-8
/// encoded bytes of the string.
#[derive(Serializable, Clone, PartialEq, Eq, Debug)]
pub struct StringDataItem {
    /// Length of the string as stored in the dex file. The dex format
    /// documentation linked above defines it as a number of UTF-16 code units.
    pub utf16_size: Uleb128,
    /// MUTF-8 encoded content of the string.
    ///
    /// NOTE(review): the `until` attribute presumably makes the derived
    /// (de)serializer stop at / emit the terminating `0x00` byte, so that it is
    /// not stored here -- confirm against the derive macro. `get_string`
    /// treats any `0x00` found in this vector as an encoding error.
    #[until(u8, u8, 0x00u8)]
    pub data: Vec<u8>,
}
|
|
|
|
/// Raw `0x00` byte. `get_string` rejects it when it appears inside the string
/// data (U+0000 is encoded on two bytes in MUTF-8, see the module table).
const TERMINATION_BYTE: u8 = 0;
/// First byte (`0xED`, `11101101`) of both 3-byte halves of a 6-byte encoded
/// character (bytes 1 and 4 in the module table).
const SURROGATE_BYTE: u8 = 0b1110_1101;
/// Mask selecting the 4 prefix bits of the second byte of each 3-byte half of
/// a 6-byte encoded character (bytes 2 and 5).
const MASK_SURROGATED_BYTE_PREFIX: u8 = 0b1111_0000;
/// Expected prefix (`1010xxxx`) of byte 2 of a 6-byte encoded character.
const VALUE_SURROGATED_BYTE_1_PREFIX: u8 = 0b1010_0000;
/// Expected prefix (`1011xxxx`) of byte 5 of a 6-byte encoded character.
const VALUE_SURROGATED_BYTE_2_PREFIX: u8 = 0b1011_0000;
/// Mask selecting the 2 prefix bits of a trailing (continuation) byte.
const MASK_TRAYLING_BYTE_PREFIX: u8 = 0b1100_0000;
/// Expected prefix (`10xxxxxx`) of a trailing (continuation) byte.
const VALUE_TRAYLING_BYTE_PREFIX: u8 = 0b1000_0000;
|
|
|
|
impl Ord for StringDataItem {
|
|
fn cmp(&self, other: &Self) -> Ordering {
|
|
self.get_aosp_utf16()
|
|
.unwrap()
|
|
.cmp(&other.get_aosp_utf16().unwrap())
|
|
.then(self.utf16_size.cmp(&other.utf16_size))
|
|
}
|
|
}
|
|
|
|
impl PartialOrd for StringDataItem {
|
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
Some(self.cmp(other))
|
|
}
|
|
}
|
|
|
|
impl TryFrom<&StringDataItem> for String {
|
|
type Error = Error;
|
|
fn try_from(item: &StringDataItem) -> Result<String> {
|
|
item.get_string()
|
|
}
|
|
}
|
|
impl TryFrom<StringDataItem> for String {
|
|
type Error = Error;
|
|
fn try_from(item: StringDataItem) -> Result<String> {
|
|
item.get_string()
|
|
}
|
|
}
|
|
|
|
impl From<&str> for StringDataItem {
|
|
fn from(string: &str) -> Self {
|
|
let mut ret_data = vec![];
|
|
let mut data = vec![];
|
|
let mut size = 0;
|
|
for chr in string.chars() {
|
|
let code_point: u32 = chr.into();
|
|
if code_point == 0 {
|
|
data.push(0b1100_0000);
|
|
data.push(0b1000_0000);
|
|
} else if code_point <= 0x7F {
|
|
data.push(code_point as u8);
|
|
} else if code_point <= 0x7FF {
|
|
data.push(0b1100_0000 | (((code_point >> 6) & 0b0001_1111) as u8));
|
|
data.push(
|
|
VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX),
|
|
);
|
|
} else if code_point <= 0xFFFF {
|
|
data.push(0b1110_0000 | (((code_point >> 12) & 0b0000_1111) as u8));
|
|
data.push(
|
|
VALUE_TRAYLING_BYTE_PREFIX
|
|
| (((code_point >> 6) & !(MASK_TRAYLING_BYTE_PREFIX as u32)) as u8),
|
|
);
|
|
data.push(
|
|
VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX),
|
|
);
|
|
} else if code_point <= 0xFFFFF {
|
|
data.push(SURROGATE_BYTE);
|
|
data.push(
|
|
VALUE_SURROGATED_BYTE_1_PREFIX
|
|
| (((code_point >> 16) & !(MASK_SURROGATED_BYTE_PREFIX as u32)) as u8),
|
|
);
|
|
data.push(
|
|
VALUE_TRAYLING_BYTE_PREFIX
|
|
| (((code_point >> 10) & !(MASK_TRAYLING_BYTE_PREFIX as u32)) as u8),
|
|
);
|
|
data.push(SURROGATE_BYTE);
|
|
data.push(
|
|
VALUE_SURROGATED_BYTE_2_PREFIX
|
|
| (((code_point >> 6) & !(MASK_SURROGATED_BYTE_PREFIX as u32)) as u8),
|
|
);
|
|
data.push(
|
|
VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX),
|
|
);
|
|
} else {
|
|
panic!("Code Point {code_point} for char {chr} is to big");
|
|
}
|
|
for b in data {
|
|
ret_data.push(b);
|
|
}
|
|
data = vec![];
|
|
size += 1;
|
|
}
|
|
Self {
|
|
utf16_size: Uleb128(size),
|
|
data: ret_data,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl StringDataItem {
|
|
/// Return the utf-16 string used by google in the aosp for comparaison & other.
|
|
fn get_aosp_utf16(&self) -> Result<Vec<u32>> {
|
|
let mut utf16_string = vec![];
|
|
let mut i = 0;
|
|
while i < self.data.len() {
|
|
let one = self.data[i] as u32;
|
|
i += 1;
|
|
if one & 0x80 == 0 {
|
|
utf16_string.push(one);
|
|
continue;
|
|
}
|
|
|
|
if i >= self.data.len() {
|
|
return Err(Error::InvalidStringEncoding(
|
|
"String contains invalid caracters".into(),
|
|
));
|
|
}
|
|
let two = self.data[i] as u32;
|
|
i += 1;
|
|
if one & 0x20 == 0 {
|
|
utf16_string.push((one & 0x1f) << 6 | (two & 0x3f));
|
|
continue;
|
|
}
|
|
|
|
if i >= self.data.len() {
|
|
return Err(Error::InvalidStringEncoding(
|
|
"String contains invalid caracters".into(),
|
|
));
|
|
}
|
|
let three = self.data[i] as u32;
|
|
i += 1;
|
|
if one & 0x10 == 0 {
|
|
utf16_string.push((one & 0x0f) << 12 | (two & 0x3f) << 6 | (three & 0x3f));
|
|
continue;
|
|
}
|
|
|
|
if i >= self.data.len() {
|
|
return Err(Error::InvalidStringEncoding(
|
|
"String contains invalid caracters".into(),
|
|
));
|
|
}
|
|
let four = self.data[i] as u32;
|
|
i += 1;
|
|
let code_point =
|
|
(one & 0x0f) << 18 | (two & 0x3f) << 12 | (three & 0x3f) << 6 | (four & 0x3f);
|
|
let mut pair = ((code_point >> 10) + 0xd7c0) & 0xffff;
|
|
pair |= ((code_point & 0x03ff) + 0xdc00) << 16;
|
|
utf16_string.push(pair);
|
|
}
|
|
Ok(utf16_string)
|
|
}
|
|
fn get_string(&self) -> Result<String> {
|
|
let mut string = String::new();
|
|
let mut i = 0;
|
|
while i < self.data.len() {
|
|
if self.data[i] == TERMINATION_BYTE {
|
|
return Err(Error::InvalidStringEncoding(
|
|
"String should not contains null bytes".into(),
|
|
));
|
|
}
|
|
if self.data[i] == SURROGATE_BYTE {
|
|
let res = self.get_surrogate(i);
|
|
match res {
|
|
Ok(chr) => {
|
|
string.push(chr);
|
|
i += 6;
|
|
continue;
|
|
}
|
|
Err(err) if i + 3 > self.data.len() => {
|
|
return Err(err);
|
|
}
|
|
Err(_) => (),
|
|
}
|
|
// Else, it may be a 3 bytes character...
|
|
}
|
|
let mut leading_bits = 0;
|
|
for j in 0..8 {
|
|
if (self.data[i] & (1 << (7 - j))) != 0 {
|
|
leading_bits += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if leading_bits == 1 || leading_bits > 3 {
|
|
let byte = self.data[i];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
|
|
)));
|
|
}
|
|
|
|
string.push(self.get_non_surogated(i, leading_bits)?);
|
|
if leading_bits == 0 {
|
|
i += 1;
|
|
} else {
|
|
i += leading_bits;
|
|
}
|
|
}
|
|
Ok(string)
|
|
}
|
|
|
|
fn get_surrogate(&self, i: usize) -> Result<char> {
|
|
if i + 6 > self.data.len() {
|
|
return Err(Error::InvalidStringEncoding(
|
|
"Found surogate byte, but not enought bytes left to form a surogate pair".into(),
|
|
));
|
|
}
|
|
if self.data[i] != SURROGATE_BYTE {
|
|
let byte = self.data[i];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{i}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b1)"
|
|
)));
|
|
}
|
|
if self.data[i + 3] != SURROGATE_BYTE {
|
|
let j = i + 3;
|
|
let byte = self.data[j];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{j}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b4)"
|
|
)));
|
|
}
|
|
if (self.data[i + 1] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_1_PREFIX {
|
|
let j = i + 1;
|
|
let byte = self.data[j];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_1_PREFIX:02x} (First surogate byte prefix, b2)"
|
|
)));
|
|
}
|
|
if (self.data[i + 4] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_2_PREFIX {
|
|
let j = i + 4;
|
|
let byte = self.data[j];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_2_PREFIX:02x} (Second surogate byte prefix, b5)"
|
|
)));
|
|
}
|
|
if (self.data[i + 2] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
|
|
let j = i + 2;
|
|
let byte = self.data[j];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailing byte prefix, b3)"
|
|
)));
|
|
}
|
|
if (self.data[i + 5] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
|
|
let j = i + 5;
|
|
let byte = self.data[j];
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailling byte prefix, b6)"
|
|
)));
|
|
}
|
|
|
|
let mut surogated_hight = ((self.data[i + 1] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
|
|
surogated_hight |= (self.data[i + 2] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
|
|
let mut surogated_low = ((self.data[i + 4] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
|
|
surogated_low |= (self.data[i + 5] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
|
|
let code_point = 0x10000 | (surogated_hight << 10) | (surogated_low);
|
|
|
|
let slice = &self.data[i..(i + 6)];
|
|
char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!(
|
|
"Invalide unicode code point in surrogated pair: {slice:02x?}: {code_point}"
|
|
)))
|
|
}
|
|
|
|
fn get_non_surogated(&self, i: usize, leading_bits: usize) -> Result<char> {
|
|
if leading_bits == 0 {
|
|
let byte = self.data[i];
|
|
char::from_u32(byte as u32).ok_or(Error::InvalidStringEncoding(format!(
|
|
"Invalide unicode code point: {byte}"
|
|
)))
|
|
} else {
|
|
if i + leading_bits > self.data.len() {
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"Found a {leading_bits} long code point at {i} but not enought bytes"
|
|
)));
|
|
}
|
|
let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32;
|
|
let slice = &self.data[i..(i + leading_bits)];
|
|
for s in 1..leading_bits {
|
|
let j = i + s;
|
|
let byte = self.data[j];
|
|
if (byte & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
|
|
return Err(Error::InvalidStringEncoding(format!(
|
|
"Invalid byte {byte:02x} at {j} in {leading_bits} bytes long code point {slice:02x?}"
|
|
)));
|
|
}
|
|
code_point <<= 6;
|
|
code_point |= (byte & !MASK_TRAYLING_BYTE_PREFIX) as u32;
|
|
}
|
|
char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!(
|
|
"Invalide unicode code point: {code_point} ({slice:02x?})"
|
|
)))
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod test {
|
|
use super::*;
|
|
|
|
/// Test for bug found in <https://github.com/TkTech/mutf8/tree/master>:
|
|
/// <https://github.com/TkTech/mutf8/blob/master/tests/test_bugs.py>
|
|
#[test]
|
|
fn bug_tktech_mutf8() {
|
|
let string_issue_1 = "[가 나 다 라 마 바 사 아 자 차 카 타 파 하]";
|
|
let encoded: StringDataItem = string_issue_1.into();
|
|
let expected_encoded = StringDataItem {
|
|
utf16_size: Uleb128(32),
|
|
data: vec![
|
|
0x5b, 0xea, 0xb0, 0x80, 0x20, 0xeb, 0x82, 0x98, 0x20, 0xeb, 0x8b, 0xa4, 0x20, 0xeb,
|
|
0x9d, 0xbc, 0x20, 0xeb, 0xa7, 0x88, 0x20, 0xeb, 0xb0, 0x94, 0x20, 0x20, 0xec, 0x82,
|
|
0xac, 0x20, 0x20, 0xec, 0x95, 0x84, 0x20, 0xec, 0x9e, 0x90, 0x20, 0x20, 0xec, 0xb0,
|
|
0xa8, 0x20, 0xec, 0xb9, 0xb4, 0x20, 0xed, 0x83, 0x80, 0x20, 0xed, 0x8c, 0x8c, 0x20,
|
|
0xed, 0x95, 0x98, 0x5d,
|
|
],
|
|
};
|
|
assert_eq!(encoded, expected_encoded);
|
|
let decoded: String = encoded.try_into().unwrap();
|
|
assert_eq!(&decoded, string_issue_1);
|
|
|
|
let string_issue_3 = "黑人抬棺組裝包";
|
|
let expected_encoded = StringDataItem {
|
|
utf16_size: Uleb128(7),
|
|
data: vec![
|
|
0xe9, 0xbb, 0x91, 0xe4, 0xba, 0xba, 0xe6, 0x8a, 0xac, 0xe6, 0xa3, 0xba, 0xe7, 0xb5,
|
|
0x84, 0xe8, 0xa3, 0x9d, 0xe5, 0x8c, 0x85,
|
|
],
|
|
};
|
|
let encoded: StringDataItem = string_issue_3.into();
|
|
assert_eq!(encoded, expected_encoded);
|
|
let decoded: String = encoded.try_into().unwrap();
|
|
assert_eq!(&decoded, string_issue_3);
|
|
}
|
|
|
|
/// Test from <https://github.com/TkTech/mutf8/tree/master>, test for bad encoding
|
|
/// Test for bug found in <https://github.com/TkTech/mutf8/tree/master>:
|
|
#[test]
|
|
fn test_tktech_bad_mutf8() {
|
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
|
utf16_size: Uleb128(0),
|
|
data: vec![0x00]
|
|
})
|
|
.is_err());
|
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
|
utf16_size: Uleb128(0),
|
|
data: vec![0xC2]
|
|
})
|
|
.is_err());
|
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
|
utf16_size: Uleb128(0),
|
|
data: vec![0xED]
|
|
})
|
|
.is_err());
|
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
|
utf16_size: Uleb128(0),
|
|
data: vec![0xE2]
|
|
})
|
|
.is_err());
|
|
}
|
|
|
|
/// Test from <https://github.com/TkTech/mutf8/tree/master>, test 2 bytes
|
|
/// Test for bug found in <https://github.com/TkTech/mutf8/tree/master>:
|
|
#[test]
|
|
fn test_tktech_2_bytes_mutf8() {
|
|
let tests = vec![
|
|
(0x0080, vec![0xc2, 0x80]),
|
|
(0x0081, vec![0xc2, 0x81]),
|
|
(0x0082, vec![0xc2, 0x82]),
|
|
(0x0084, vec![0xc2, 0x84]),
|
|
(0x0088, vec![0xc2, 0x88]),
|
|
(0x0090, vec![0xc2, 0x90]),
|
|
(0x00a0, vec![0xc2, 0xa0]),
|
|
(0x00c0, vec![0xc3, 0x80]),
|
|
(0x0180, vec![0xc6, 0x80]),
|
|
(0x0280, vec![0xca, 0x80]),
|
|
(0x0480, vec![0xd2, 0x80]),
|
|
(0x0481, vec![0xd2, 0x81]),
|
|
(0x0483, vec![0xd2, 0x83]),
|
|
(0x0487, vec![0xd2, 0x87]),
|
|
(0x048f, vec![0xd2, 0x8f]),
|
|
(0x049f, vec![0xd2, 0x9f]),
|
|
(0x04af, vec![0xd2, 0xaf]),
|
|
(0x04bf, vec![0xd2, 0xbf]),
|
|
(0x04ff, vec![0xd3, 0xbf]),
|
|
(0x05ff, vec![0xd7, 0xbf]),
|
|
(0x05ff, vec![0xd7, 0xbf]),
|
|
(0x07ff, vec![0xdf, 0xbf]),
|
|
];
|
|
for (code_point, data) in tests {
|
|
let encoded = StringDataItem {
|
|
utf16_size: Uleb128(1),
|
|
data,
|
|
};
|
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
|
assert_eq!(encoded, expected.as_str().into());
|
|
}
|
|
}
|
|
|
|
/// Test from <https://github.com/TkTech/mutf8/tree/master>, test 3 bytes
|
|
/// Test for bug found in <https://github.com/TkTech/mutf8/tree/master>:
|
|
#[test]
|
|
fn test_tktech_3_bytes_mutf8() {
|
|
let tests = vec![
|
|
(0x0800, vec![0xe0, 0xa0, 0x80]),
|
|
(0x0801, vec![0xe0, 0xa0, 0x81]),
|
|
(0x0802, vec![0xe0, 0xa0, 0x82]),
|
|
(0x0804, vec![0xe0, 0xa0, 0x84]),
|
|
(0x0808, vec![0xe0, 0xa0, 0x88]),
|
|
(0x0810, vec![0xe0, 0xa0, 0x90]),
|
|
(0x0820, vec![0xe0, 0xa0, 0xa0]),
|
|
(0x0840, vec![0xe0, 0xa1, 0x80]),
|
|
(0x0880, vec![0xe0, 0xa2, 0x80]),
|
|
(0x0900, vec![0xe0, 0xa4, 0x80]),
|
|
(0x0a00, vec![0xe0, 0xa8, 0x80]),
|
|
(0x0c00, vec![0xe0, 0xb0, 0x80]),
|
|
(0x1800, vec![0xe1, 0xa0, 0x80]),
|
|
(0x2800, vec![0xe2, 0xa0, 0x80]),
|
|
(0x4800, vec![0xe4, 0xa0, 0x80]),
|
|
(0x8800, vec![0xe8, 0xa0, 0x80]),
|
|
(0x8801, vec![0xe8, 0xa0, 0x81]),
|
|
(0x8803, vec![0xe8, 0xa0, 0x83]),
|
|
(0x8807, vec![0xe8, 0xa0, 0x87]),
|
|
(0x880f, vec![0xe8, 0xa0, 0x8f]),
|
|
(0x881f, vec![0xe8, 0xa0, 0x9f]),
|
|
(0x883f, vec![0xe8, 0xa0, 0xbf]),
|
|
(0x887f, vec![0xe8, 0xa1, 0xbf]),
|
|
(0x88ff, vec![0xe8, 0xa3, 0xbf]),
|
|
(0x89ff, vec![0xe8, 0xa7, 0xbf]),
|
|
(0x8bff, vec![0xe8, 0xaf, 0xbf]),
|
|
(0x8fff, vec![0xe8, 0xbf, 0xbf]),
|
|
(0x9fff, vec![0xe9, 0xbf, 0xbf]),
|
|
(0xbfff, vec![0xeb, 0xbf, 0xbf]),
|
|
(0xffff, vec![0xef, 0xbf, 0xbf]),
|
|
];
|
|
for (code_point, data) in tests {
|
|
let encoded = StringDataItem {
|
|
utf16_size: Uleb128(1),
|
|
data,
|
|
};
|
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
|
assert_eq!(encoded, expected.as_str().into());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_3_bytes_mutf8() {
|
|
let tests = vec![(0xac00, vec![0xea, 0xb0, 0x80])];
|
|
for (code_point, data) in tests {
|
|
let encoded = StringDataItem {
|
|
utf16_size: Uleb128(1),
|
|
data,
|
|
};
|
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
|
assert_eq!(encoded, expected.as_str().into());
|
|
}
|
|
}
|
|
|
|
/// Test from <https://github.com/TkTech/mutf8/tree/master>, test 6 bytes
|
|
/// Test for bug found in <https://github.com/TkTech/mutf8/tree/master>:
|
|
#[test]
|
|
fn test_tktech_6_bytes_mutf8() {
|
|
let tests = vec![
|
|
(0x10000, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x80]),
|
|
(0x10001, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x81]),
|
|
(0x10002, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x82]),
|
|
(0x10004, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x84]),
|
|
(0x10008, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x88]),
|
|
(0x10010, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x90]),
|
|
(0x10020, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0xa0]),
|
|
(0x10040, vec![0xed, 0xa1, 0x80, 0xed, 0xb1, 0x80]),
|
|
(0x10080, vec![0xed, 0xa1, 0x80, 0xed, 0xb2, 0x80]),
|
|
(0x10100, vec![0xed, 0xa1, 0x80, 0xed, 0xb4, 0x80]),
|
|
(0x10200, vec![0xed, 0xa1, 0x80, 0xed, 0xb8, 0x80]),
|
|
(0x10400, vec![0xed, 0xa1, 0x81, 0xed, 0xb0, 0x80]),
|
|
(0x10800, vec![0xed, 0xa1, 0x82, 0xed, 0xb0, 0x80]),
|
|
(0x11000, vec![0xed, 0xa1, 0x84, 0xed, 0xb0, 0x80]),
|
|
(0x12000, vec![0xed, 0xa1, 0x88, 0xed, 0xb0, 0x80]),
|
|
(0x14000, vec![0xed, 0xa1, 0x90, 0xed, 0xb0, 0x80]),
|
|
(0x18000, vec![0xed, 0xa1, 0xa0, 0xed, 0xb0, 0x80]),
|
|
(0x30000, vec![0xed, 0xa3, 0x80, 0xed, 0xb0, 0x80]),
|
|
(0x50000, vec![0xed, 0xa5, 0x80, 0xed, 0xb0, 0x80]),
|
|
(0x90000, vec![0xed, 0xa9, 0x80, 0xed, 0xb0, 0x80]),
|
|
(0x10003, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x83]),
|
|
(0x10007, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x87]),
|
|
(0x1000f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x8f]),
|
|
(0x1001f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x9f]),
|
|
(0x1003f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0xbf]),
|
|
(0x1007f, vec![0xed, 0xa1, 0x80, 0xed, 0xb1, 0xbf]),
|
|
(0x100ff, vec![0xed, 0xa1, 0x80, 0xed, 0xb3, 0xbf]),
|
|
(0x101ff, vec![0xed, 0xa1, 0x80, 0xed, 0xb7, 0xbf]),
|
|
(0x103ff, vec![0xed, 0xa1, 0x80, 0xed, 0xbf, 0xbf]),
|
|
(0x107ff, vec![0xed, 0xa1, 0x81, 0xed, 0xbf, 0xbf]),
|
|
(0x10fff, vec![0xed, 0xa1, 0x83, 0xed, 0xbf, 0xbf]),
|
|
(0x11fff, vec![0xed, 0xa1, 0x87, 0xed, 0xbf, 0xbf]),
|
|
(0x13fff, vec![0xed, 0xa1, 0x8f, 0xed, 0xbf, 0xbf]),
|
|
(0x17fff, vec![0xed, 0xa1, 0x9f, 0xed, 0xbf, 0xbf]),
|
|
(0x1ffff, vec![0xed, 0xa1, 0xbf, 0xed, 0xbf, 0xbf]),
|
|
(0x3ffff, vec![0xed, 0xa3, 0xbf, 0xed, 0xbf, 0xbf]),
|
|
(0x7ffff, vec![0xed, 0xa7, 0xbf, 0xed, 0xbf, 0xbf]),
|
|
(0xfffff, vec![0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf]),
|
|
];
|
|
for (code_point, data) in tests {
|
|
let encoded = StringDataItem {
|
|
utf16_size: Uleb128(1),
|
|
data,
|
|
};
|
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
|
assert_eq!(encoded, expected.as_str().into());
|
|
}
|
|
}
|
|
|
|
/// Apparently I don't know how to code an order relation so here it is...
|
|
#[test]
|
|
fn test_ord_relation() {
|
|
let s1: StringDataItem = "Landroidx/lifecycle/WithLifecycleStateKt$suspendWithStateAtLeastUnchecked$2$observer$1;".into();
|
|
let s2: StringDataItem = "Lcom/google/android/material/search/SearchBarAnimationHelper$$ExternalSyntheticLambda4;".into();
|
|
let s1_utf16 = s1.get_aosp_utf16().unwrap();
|
|
let s2_utf16 = s2.get_aosp_utf16().unwrap();
|
|
assert_eq!(s1_utf16 < s2_utf16, true);
|
|
assert_eq!(s1_utf16 == s2_utf16, false);
|
|
assert_eq!(s1_utf16 > s2_utf16, false);
|
|
assert_eq!(s2_utf16 < s1_utf16, false);
|
|
assert_eq!(s2_utf16 == s1_utf16, false);
|
|
assert_eq!(s2_utf16 > s1_utf16, true);
|
|
assert_eq!(s1 < s2, true);
|
|
assert_eq!(s1 == s2, false);
|
|
assert_eq!(s1 > s2, false);
|
|
assert_eq!(s2 < s1, false);
|
|
assert_eq!(s2 == s1, false);
|
|
assert_eq!(s2 > s1, true);
|
|
}
|
|
}
|