WIP mutf8
This commit is contained in:
parent
d44e2b624b
commit
24f4b0b46d
3 changed files with 459 additions and 2 deletions
|
|
@ -6,13 +6,16 @@ use std::io::{Cursor, Read, Seek, SeekFrom, Write};
|
||||||
pub use androscalpel_serializer_derive::*;
|
pub use androscalpel_serializer_derive::*;
|
||||||
|
|
||||||
pub mod leb;
|
pub mod leb;
|
||||||
|
pub mod string;
|
||||||
pub use leb::*;
|
pub use leb::*;
|
||||||
|
pub use string::*;
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
InputTooSmall(String), // TODO: find a better name
|
InputTooSmall(String), // TODO: find a better name
|
||||||
SerializationError(String),
|
SerializationError(String),
|
||||||
DeserializationError(String),
|
DeserializationError(String),
|
||||||
|
InvalidStringEncoding(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type Result<T> = core::result::Result<T, Error>;
|
pub type Result<T> = core::result::Result<T, Error>;
|
||||||
|
|
@ -26,6 +29,7 @@ impl std::fmt::Display for Error {
|
||||||
Self::InputTooSmall(msg) => write!(f, "Error: {}", msg),
|
Self::InputTooSmall(msg) => write!(f, "Error: {}", msg),
|
||||||
Self::SerializationError(msg) => write!(f, "Error: {}", msg),
|
Self::SerializationError(msg) => write!(f, "Error: {}", msg),
|
||||||
Self::DeserializationError(msg) => write!(f, "Error: {}", msg),
|
Self::DeserializationError(msg) => write!(f, "Error: {}", msg),
|
||||||
|
Self::InvalidStringEncoding(msg) => write!(f, "Error: {}", msg),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
453
androscalpel_serializer/src/core/string.rs
Normal file
453
androscalpel_serializer/src/core/string.rs
Normal file
|
|
@ -0,0 +1,453 @@
|
||||||
|
//! The string representation, encoded in MUTF-8
|
||||||
|
//! <https://source.android.com/docs/core/runtime/dex-format#mutf-8>
|
||||||
|
//!
|
||||||
|
//! Code points are encoded in MUTF-8 as follows (table adapted from
|
||||||
|
//! <https://py2jdbc.readthedocs.io/en/latest/mutf8.html>):
|
||||||
|
//!
|
||||||
|
//! | Number of bytes | First code point | Last code point | Bits | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Byte 6 |
|
||||||
|
//! |-----------------|------------------|-----------------|------|----------|----------|----------|----------|----------|----------|
|
||||||
|
//! | 2 | U+0000 | U+0000 | - | 11000000 | 10000000 | | | | |
|
||||||
|
//! | 1 | U+0001 | U+007F | 7 | 0xxxxxxx | | | | | |
|
||||||
|
//! | 2 | U+0080 | U+07FF | 11 | 110xxxxx | 10xxxxxx | | | | |
|
||||||
|
//! | 3 | U+0800 | U+FFFF | 16 | 1110xxxx | 10xxxxxx | 10xxxxxx | | | |
|
||||||
|
//! | 6 | U+10000 | U+10FFFF | 20 | 11101101 | 1010xxxx | 10xxxxxx | 11101101 | 1011xxxx | 10xxxxxx |
|
||||||
|
|
||||||
|
use crate as androscalpel_serializer;
|
||||||
|
use crate::core::*;
|
||||||
|
pub use androscalpel_serializer_derive::*;
|
||||||
|
|
||||||
|
/// [string-data-item](https://source.android.com/docs/core/runtime/dex-format#string-data-item)
///
/// Raw representation of a string: a declared length followed by the
/// MUTF-8 encoded bytes of the string.
#[derive(Serializable, PartialEq, Eq, Debug)]
pub struct StringDataItem {
    /// Declared size of the string. The linked dex-format documentation
    /// defines it as the size in UTF-16 code units.
    pub utf16_size: Uleb128,
    // NOTE(review): `until` is handled by the `Serializable` derive macro;
    // presumably it (de)serializes `u8` items until the `0x00` terminator is
    // met, and the terminator itself is not stored in `data` -- confirm
    // against the macro implementation.
    /// MUTF-8 encoded content of the string.
    #[until(u8, u8, 0x00u8)]
    pub data: Vec<u8>,
}
|
||||||
|
|
||||||
|
/// Byte that must never appear inside the encoded data of a string
/// (U+0000 is encoded on two bytes instead).
const TERMINATION_BYTE: u8 = 0;
/// First byte (`11101101`) of each 3-byte half of a 6-byte encoded code point
/// (bytes 1 and 4 in the MUTF-8 table).
const SURROGATE_BYTE: u8 = 0b1110_1101;
/// Mask selecting the 4 high bits of the byte following a `SURROGATE_BYTE`
/// (bytes 2 and 5 of a 6-byte encoded code point).
const MASK_SURROGATED_BYTE_PREFIX: u8 = 0b1111_0000;
/// Expected masked value of byte 2 of a 6-byte encoded code point (`1010xxxx`).
const VALUE_SURROGATED_BYTE_1_PREFIX: u8 = 0b1010_0000;
/// Expected masked value of byte 5 of a 6-byte encoded code point (`1011xxxx`).
const VALUE_SURROGATED_BYTE_2_PREFIX: u8 = 0b1011_0000;
/// Mask selecting the 2 high bits of a trailing (continuation) byte.
const MASK_TRAYLING_BYTE_PREFIX: u8 = 0b1100_0000;
/// Expected masked value of a trailing (continuation) byte (`10xxxxxx`).
const VALUE_TRAYLING_BYTE_PREFIX: u8 = 0b1000_0000;
|
||||||
|
|
||||||
|
impl TryFrom<&StringDataItem> for String {
|
||||||
|
type Error = Error;
|
||||||
|
fn try_from(item: &StringDataItem) -> Result<String> {
|
||||||
|
item.get_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl TryFrom<StringDataItem> for String {
|
||||||
|
type Error = Error;
|
||||||
|
fn try_from(item: StringDataItem) -> Result<String> {
|
||||||
|
item.get_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&str> for StringDataItem {
|
||||||
|
fn from(string: &str) -> Self {
|
||||||
|
let mut ret_data = vec![];
|
||||||
|
let mut data = vec![];
|
||||||
|
let mut size = 0;
|
||||||
|
for chr in string.chars() {
|
||||||
|
let code_point: u32 = chr.into();
|
||||||
|
if code_point == 0 {
|
||||||
|
data.push(0b1100_0000);
|
||||||
|
data.push(0b1000_0000);
|
||||||
|
} else if code_point <= 0x7F {
|
||||||
|
data.push(code_point as u8);
|
||||||
|
} else if code_point <= 0x7FF {
|
||||||
|
data.push(0b1100_0000 | (((code_point >> 6) & 0b0001_1111) as u8));
|
||||||
|
data.push(
|
||||||
|
VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX),
|
||||||
|
);
|
||||||
|
} else if code_point <= 0xFFFF {
|
||||||
|
data.push(0b1110_0000 | (((code_point >> 12) & 0b0000_1111) as u8));
|
||||||
|
data.push(
|
||||||
|
VALUE_TRAYLING_BYTE_PREFIX
|
||||||
|
| (((code_point >> 6) & !(MASK_TRAYLING_BYTE_PREFIX as u32)) as u8),
|
||||||
|
);
|
||||||
|
data.push(
|
||||||
|
VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX),
|
||||||
|
);
|
||||||
|
} else if code_point <= 0xFFFFF {
|
||||||
|
data.push(SURROGATE_BYTE);
|
||||||
|
data.push(
|
||||||
|
VALUE_SURROGATED_BYTE_1_PREFIX
|
||||||
|
| (((code_point >> 16) & !(MASK_SURROGATED_BYTE_PREFIX as u32)) as u8),
|
||||||
|
);
|
||||||
|
data.push(
|
||||||
|
VALUE_TRAYLING_BYTE_PREFIX
|
||||||
|
| (((code_point >> 10) & !(MASK_TRAYLING_BYTE_PREFIX as u32)) as u8),
|
||||||
|
);
|
||||||
|
data.push(SURROGATE_BYTE);
|
||||||
|
data.push(
|
||||||
|
VALUE_SURROGATED_BYTE_2_PREFIX
|
||||||
|
| (((code_point >> 6) & !(MASK_SURROGATED_BYTE_PREFIX as u32)) as u8),
|
||||||
|
);
|
||||||
|
data.push(
|
||||||
|
VALUE_TRAYLING_BYTE_PREFIX | (code_point as u8 & !MASK_TRAYLING_BYTE_PREFIX),
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
panic!("Code Point {code_point} for char {chr} is to big");
|
||||||
|
}
|
||||||
|
let slice = &data[..];
|
||||||
|
println!("0x{code_point:x} -> {slice:x?}");
|
||||||
|
for b in data {
|
||||||
|
ret_data.push(b);
|
||||||
|
}
|
||||||
|
data = vec![];
|
||||||
|
size += 1;
|
||||||
|
}
|
||||||
|
Self {
|
||||||
|
utf16_size: Uleb128(size),
|
||||||
|
data: ret_data,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StringDataItem {
|
||||||
|
fn get_string(&self) -> Result<String> {
|
||||||
|
let mut string = String::new();
|
||||||
|
let mut i = 0;
|
||||||
|
while i < self.data.len() {
|
||||||
|
if self.data[i] == TERMINATION_BYTE {
|
||||||
|
return Err(Error::InvalidStringEncoding(
|
||||||
|
"String should not contains null bytes".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
if self.data[i] == SURROGATE_BYTE {
|
||||||
|
let res = self.get_surrogate(i);
|
||||||
|
match res {
|
||||||
|
Ok(chr) => {
|
||||||
|
string.push(chr);
|
||||||
|
i += 6;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Err(err) if i + 3 > self.data.len() => {
|
||||||
|
return Err(err);
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
// Else, it may be a 3 bytes character...
|
||||||
|
}
|
||||||
|
let mut leading_bits = 0;
|
||||||
|
for i in 0..8 {
|
||||||
|
if (self.data[0] & (1 << (7 - i))) != 0 {
|
||||||
|
leading_bits += 1;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if leading_bits == 1 || leading_bits > 3 {
|
||||||
|
let byte = self.data[i];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
string.push(self.get_non_surogated(i, leading_bits)?);
|
||||||
|
if leading_bits == 0 {
|
||||||
|
i += 1;
|
||||||
|
} else {
|
||||||
|
i += leading_bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(string)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_surrogate(&self, i: usize) -> Result<char> {
|
||||||
|
if i + 6 > self.data.len() {
|
||||||
|
return Err(Error::InvalidStringEncoding(
|
||||||
|
"Found surogate byte, but not enought bytes left to form a surogate pair".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
if self.data[i] != SURROGATE_BYTE {
|
||||||
|
let byte = self.data[i];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{i}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b1)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if self.data[i + 3] != SURROGATE_BYTE {
|
||||||
|
let j = i + 3;
|
||||||
|
let byte = self.data[j];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{j}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b4)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if (self.data[i + 1] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_1_PREFIX {
|
||||||
|
let j = i + 1;
|
||||||
|
let byte = self.data[j];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_1_PREFIX:02x} (First surogate byte prefix, b2)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if (self.data[i + 4] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_2_PREFIX {
|
||||||
|
let j = i + 4;
|
||||||
|
let byte = self.data[j];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_2_PREFIX:02x} (Second surogate byte prefix, b5)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if (self.data[i + 2] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
|
||||||
|
let j = i + 2;
|
||||||
|
let byte = self.data[j];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailing byte prefix, b3)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if (self.data[i + 5] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
|
||||||
|
let j = i + 5;
|
||||||
|
let byte = self.data[j];
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailling byte prefix, b6)"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut surogated_hight = ((self.data[i + 1] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
|
||||||
|
surogated_hight |= (self.data[i + 2] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
|
||||||
|
let mut surogated_low = ((self.data[i + 4] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
|
||||||
|
surogated_low |= (self.data[i + 5] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
|
||||||
|
let code_point = 0x10000 | (surogated_hight << 10) | (surogated_low);
|
||||||
|
|
||||||
|
let slice = &self.data[i..(i + 6)];
|
||||||
|
char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!(
|
||||||
|
"Invalide unicode code point in surrogated pair: {slice:02x?}: {code_point}"
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_non_surogated(&self, i: usize, leading_bits: usize) -> Result<char> {
|
||||||
|
if leading_bits == 0 {
|
||||||
|
let byte = self.data[i];
|
||||||
|
char::from_u32(byte as u32).ok_or(Error::InvalidStringEncoding(format!(
|
||||||
|
"Invalide unicode code point: {byte}"
|
||||||
|
)))
|
||||||
|
} else {
|
||||||
|
if i + leading_bits > self.data.len() {
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"Found a {leading_bits} long code point at {i} but not enought bytes"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
let mut code_point = (self.data[0] % (1 << (7 - leading_bits))) as u32;
|
||||||
|
let slice = &self.data[i..(i + leading_bits)];
|
||||||
|
for s in 1..leading_bits {
|
||||||
|
let j = i + s;
|
||||||
|
let byte = self.data[j];
|
||||||
|
if (byte & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
|
||||||
|
return Err(Error::InvalidStringEncoding(format!(
|
||||||
|
"Invalid byte {byte:02x} at {j} in {leading_bits} bytes long code point {slice:02x?}"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
code_point <<= 6;
|
||||||
|
code_point |= (byte & !MASK_TRAYLING_BYTE_PREFIX) as u32;
|
||||||
|
}
|
||||||
|
char::from_u32(code_point).ok_or(Error::InvalidStringEncoding(format!(
|
||||||
|
"Invalide unicode code point: {code_point} ({slice:02x?})"
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
|
||||||
|
/// https://github.com/TkTech/mutf8/blob/master/tests/test_bugs.py
|
||||||
|
#[test]
|
||||||
|
fn bug_tktech_mutf8() {
|
||||||
|
let string_issue_1 = "[가 나 다 라 마 바 사 아 자 차 카 타 파 하]";
|
||||||
|
let encoded: StringDataItem = string_issue_1.into();
|
||||||
|
println!("encoded:");
|
||||||
|
for c in &encoded.data {
|
||||||
|
print!("0x{c:02x} ");
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
let decoded: String = encoded.try_into().unwrap();
|
||||||
|
assert_eq!(&decoded, string_issue_1);
|
||||||
|
/*
|
||||||
|
let string_issue_3 = "黑人抬棺組裝包";
|
||||||
|
let encoded: StringDataItem = string_issue_2.into();
|
||||||
|
let decoded: String = encoded.try_into().unwrap();
|
||||||
|
println!("encoded:");
|
||||||
|
for c in decoded.chars() {
|
||||||
|
let val: u32 = c.into();
|
||||||
|
print!("0x{val:08x} ");
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
for c in string_issue_2.chars() {
|
||||||
|
let val: u32 = c.into();
|
||||||
|
print!("0x{val:08x} ");
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
assert_eq!(&decoded, string_issue_2);
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test from https://github.com/TkTech/mutf8/tree/master, test for bad encoding
|
||||||
|
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
|
||||||
|
#[test]
|
||||||
|
fn test_tktech_bad_mutf8() {
|
||||||
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
||||||
|
utf16_size: Uleb128(0),
|
||||||
|
data: vec![0x00]
|
||||||
|
})
|
||||||
|
.is_err());
|
||||||
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
||||||
|
utf16_size: Uleb128(0),
|
||||||
|
data: vec![0xC2]
|
||||||
|
})
|
||||||
|
.is_err());
|
||||||
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
||||||
|
utf16_size: Uleb128(0),
|
||||||
|
data: vec![0xED]
|
||||||
|
})
|
||||||
|
.is_err());
|
||||||
|
assert!(TryInto::<String>::try_into(StringDataItem {
|
||||||
|
utf16_size: Uleb128(0),
|
||||||
|
data: vec![0xE2]
|
||||||
|
})
|
||||||
|
.is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test from https://github.com/TkTech/mutf8/tree/master, test 2 bytes
|
||||||
|
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
|
||||||
|
#[test]
|
||||||
|
fn test_tktech_2_bytes_mutf8() {
|
||||||
|
let tests = vec![
|
||||||
|
(0x0080, vec![0xc2, 0x80]),
|
||||||
|
(0x0081, vec![0xc2, 0x81]),
|
||||||
|
(0x0082, vec![0xc2, 0x82]),
|
||||||
|
(0x0084, vec![0xc2, 0x84]),
|
||||||
|
(0x0088, vec![0xc2, 0x88]),
|
||||||
|
(0x0090, vec![0xc2, 0x90]),
|
||||||
|
(0x00a0, vec![0xc2, 0xa0]),
|
||||||
|
(0x00c0, vec![0xc3, 0x80]),
|
||||||
|
(0x0180, vec![0xc6, 0x80]),
|
||||||
|
(0x0280, vec![0xca, 0x80]),
|
||||||
|
(0x0480, vec![0xd2, 0x80]),
|
||||||
|
(0x0481, vec![0xd2, 0x81]),
|
||||||
|
(0x0483, vec![0xd2, 0x83]),
|
||||||
|
(0x0487, vec![0xd2, 0x87]),
|
||||||
|
(0x048f, vec![0xd2, 0x8f]),
|
||||||
|
(0x049f, vec![0xd2, 0x9f]),
|
||||||
|
(0x04af, vec![0xd2, 0xaf]),
|
||||||
|
(0x04bf, vec![0xd2, 0xbf]),
|
||||||
|
(0x04ff, vec![0xd3, 0xbf]),
|
||||||
|
(0x05ff, vec![0xd7, 0xbf]),
|
||||||
|
(0x05ff, vec![0xd7, 0xbf]),
|
||||||
|
(0x07ff, vec![0xdf, 0xbf]),
|
||||||
|
];
|
||||||
|
for (code_point, data) in tests {
|
||||||
|
let encoded = StringDataItem {
|
||||||
|
utf16_size: Uleb128(1),
|
||||||
|
data,
|
||||||
|
};
|
||||||
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
||||||
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
||||||
|
assert_eq!(encoded, expected.as_str().into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test from https://github.com/TkTech/mutf8/tree/master, test 3 bytes
|
||||||
|
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
|
||||||
|
#[test]
|
||||||
|
fn test_tktech_3_bytes_mutf8() {
|
||||||
|
let tests = vec![
|
||||||
|
(0x0800, vec![0xe0, 0xa0, 0x80]),
|
||||||
|
(0x0801, vec![0xe0, 0xa0, 0x81]),
|
||||||
|
(0x0802, vec![0xe0, 0xa0, 0x82]),
|
||||||
|
(0x0804, vec![0xe0, 0xa0, 0x84]),
|
||||||
|
(0x0808, vec![0xe0, 0xa0, 0x88]),
|
||||||
|
(0x0810, vec![0xe0, 0xa0, 0x90]),
|
||||||
|
(0x0820, vec![0xe0, 0xa0, 0xa0]),
|
||||||
|
(0x0840, vec![0xe0, 0xa1, 0x80]),
|
||||||
|
(0x0880, vec![0xe0, 0xa2, 0x80]),
|
||||||
|
(0x0900, vec![0xe0, 0xa4, 0x80]),
|
||||||
|
(0x0a00, vec![0xe0, 0xa8, 0x80]),
|
||||||
|
(0x0c00, vec![0xe0, 0xb0, 0x80]),
|
||||||
|
(0x1800, vec![0xe1, 0xa0, 0x80]),
|
||||||
|
(0x2800, vec![0xe2, 0xa0, 0x80]),
|
||||||
|
(0x4800, vec![0xe4, 0xa0, 0x80]),
|
||||||
|
(0x8800, vec![0xe8, 0xa0, 0x80]),
|
||||||
|
(0x8801, vec![0xe8, 0xa0, 0x81]),
|
||||||
|
(0x8803, vec![0xe8, 0xa0, 0x83]),
|
||||||
|
(0x8807, vec![0xe8, 0xa0, 0x87]),
|
||||||
|
(0x880f, vec![0xe8, 0xa0, 0x8f]),
|
||||||
|
(0x881f, vec![0xe8, 0xa0, 0x9f]),
|
||||||
|
(0x883f, vec![0xe8, 0xa0, 0xbf]),
|
||||||
|
(0x887f, vec![0xe8, 0xa1, 0xbf]),
|
||||||
|
(0x88ff, vec![0xe8, 0xa3, 0xbf]),
|
||||||
|
(0x89ff, vec![0xe8, 0xa7, 0xbf]),
|
||||||
|
(0x8bff, vec![0xe8, 0xaf, 0xbf]),
|
||||||
|
(0x8fff, vec![0xe8, 0xbf, 0xbf]),
|
||||||
|
(0x9fff, vec![0xe9, 0xbf, 0xbf]),
|
||||||
|
(0xbfff, vec![0xeb, 0xbf, 0xbf]),
|
||||||
|
(0xffff, vec![0xef, 0xbf, 0xbf]),
|
||||||
|
];
|
||||||
|
for (code_point, data) in tests {
|
||||||
|
let encoded = StringDataItem {
|
||||||
|
utf16_size: Uleb128(1),
|
||||||
|
data,
|
||||||
|
};
|
||||||
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
||||||
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
||||||
|
assert_eq!(encoded, expected.as_str().into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test from https://github.com/TkTech/mutf8/tree/master, test 6 bytes
|
||||||
|
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
|
||||||
|
#[test]
|
||||||
|
fn test_tktech_6_bytes_mutf8() {
|
||||||
|
let tests = vec![
|
||||||
|
(0x10000, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x10001, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x81]),
|
||||||
|
(0x10002, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x82]),
|
||||||
|
(0x10004, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x84]),
|
||||||
|
(0x10008, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x88]),
|
||||||
|
(0x10010, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x90]),
|
||||||
|
(0x10020, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0xa0]),
|
||||||
|
(0x10040, vec![0xed, 0xa1, 0x80, 0xed, 0xb1, 0x80]),
|
||||||
|
(0x10080, vec![0xed, 0xa1, 0x80, 0xed, 0xb2, 0x80]),
|
||||||
|
(0x10100, vec![0xed, 0xa1, 0x80, 0xed, 0xb4, 0x80]),
|
||||||
|
(0x10200, vec![0xed, 0xa1, 0x80, 0xed, 0xb8, 0x80]),
|
||||||
|
(0x10400, vec![0xed, 0xa1, 0x81, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x10800, vec![0xed, 0xa1, 0x82, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x11000, vec![0xed, 0xa1, 0x84, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x12000, vec![0xed, 0xa1, 0x88, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x14000, vec![0xed, 0xa1, 0x90, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x18000, vec![0xed, 0xa1, 0xa0, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x30000, vec![0xed, 0xa3, 0x80, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x50000, vec![0xed, 0xa5, 0x80, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x90000, vec![0xed, 0xa9, 0x80, 0xed, 0xb0, 0x80]),
|
||||||
|
(0x10003, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x83]),
|
||||||
|
(0x10007, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x87]),
|
||||||
|
(0x1000f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x8f]),
|
||||||
|
(0x1001f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x9f]),
|
||||||
|
(0x1003f, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0xbf]),
|
||||||
|
(0x1007f, vec![0xed, 0xa1, 0x80, 0xed, 0xb1, 0xbf]),
|
||||||
|
(0x100ff, vec![0xed, 0xa1, 0x80, 0xed, 0xb3, 0xbf]),
|
||||||
|
(0x101ff, vec![0xed, 0xa1, 0x80, 0xed, 0xb7, 0xbf]),
|
||||||
|
(0x103ff, vec![0xed, 0xa1, 0x80, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x107ff, vec![0xed, 0xa1, 0x81, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x10fff, vec![0xed, 0xa1, 0x83, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x11fff, vec![0xed, 0xa1, 0x87, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x13fff, vec![0xed, 0xa1, 0x8f, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x17fff, vec![0xed, 0xa1, 0x9f, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x1ffff, vec![0xed, 0xa1, 0xbf, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x3ffff, vec![0xed, 0xa3, 0xbf, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0x7ffff, vec![0xed, 0xa7, 0xbf, 0xed, 0xbf, 0xbf]),
|
||||||
|
(0xfffff, vec![0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf]),
|
||||||
|
];
|
||||||
|
for (code_point, data) in tests {
|
||||||
|
let encoded = StringDataItem {
|
||||||
|
utf16_size: Uleb128(1),
|
||||||
|
data,
|
||||||
|
};
|
||||||
|
let expected = char::from_u32(code_point).unwrap().to_string();
|
||||||
|
assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
|
||||||
|
assert_eq!(encoded, expected.as_str().into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -78,12 +78,12 @@ pub fn derive_serializable(input: proc_macro::TokenStream) -> proc_macro::TokenS
|
||||||
let implem_size = get_implem_size(&input.data, ¶ms);
|
let implem_size = get_implem_size(&input.data, ¶ms);
|
||||||
let expanded = quote! {
|
let expanded = quote! {
|
||||||
impl androscalpel_serializer::Serializable for #name {
|
impl androscalpel_serializer::Serializable for #name {
|
||||||
#[allow(clippy::single_element_loop)]
|
#[allow(clippy::single_element_loop, clippy::let_and_return)]
|
||||||
fn serialize(&self, output: &mut dyn std::io::Write) -> androscalpel_serializer::Result<()> {
|
fn serialize(&self, output: &mut dyn std::io::Write) -> androscalpel_serializer::Result<()> {
|
||||||
#implem_serialize
|
#implem_serialize
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::single_element_loop)]
|
#[allow(clippy::single_element_loop, clippy::let_and_return)]
|
||||||
fn deserialize(input: &mut dyn androscalpel_serializer::ReadSeek) -> androscalpel_serializer::Result<Self> {
|
fn deserialize(input: &mut dyn androscalpel_serializer::ReadSeek) -> androscalpel_serializer::Result<Self> {
|
||||||
#implem_deserialize
|
#implem_deserialize
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue