androscalpel/androscalpel_serializer/src/core/string.rs

//! The string representation, encoded in MUTF-8
//! <https://source.android.com/docs/core/runtime/dex-format#mutf-8>
//!
//! The encoding of code points in MUTF-8 is as follows (table adapted from
//! <https://py2jdbc.readthedocs.io/en/latest/mutf8.html>):
//!
//! | Number of bytes | First code point | Last code point | Bits | Byte 1   | Byte 2   | Byte 3   | Byte 4   | Byte 5   | Byte 6   |
//! |-----------------|------------------|-----------------|------|----------|----------|----------|----------|----------|----------|
//! | 2               | U+0000           | U+0000          | -    | 11000000 | 10000000 |          |          |          |          |
//! | 1               | U+0001           | U+007F          | 7    | 0xxxxxxx |          |          |          |          |          |
//! | 2               | U+0080           | U+07FF          | 11   | 110xxxxx | 10xxxxxx |          |          |          |          |
//! | 3               | U+0800           | U+FFFF          | 16   | 1110xxxx | 10xxxxxx | 10xxxxxx |          |          |          |
//! | 6               | U+10000          | U+10FFFF        | 20   | 11101101 | 1010xxxx | 10xxxxxx | 11101101 | 1011xxxx | 10xxxxxx |
//!
//! In the 6 bytes form, the 20 bits are those of `code point - 0x10000`: the code point is
//! first split into a UTF-16 surrogate pair, then each surrogate is encoded on 3 bytes.

use std::cmp::{Ord, Ordering, PartialOrd};

use crate as androscalpel_serializer;
use crate::core::*;
pub use androscalpel_serializer_derive::*;

/// A string as stored in a DEX file: a size followed by the MUTF-8 encoded bytes.
///
/// <https://source.android.com/docs/core/runtime/dex-format#string-data-item>
#[derive(Serializable, Clone, PartialEq, Eq, Debug)]
pub struct StringDataItem {
    /// Size of the string in UTF-16 code units, according to the format linked above.
    pub utf16_size: Uleb128,
    /// MUTF-8 encoded bytes of the string.
    // NOTE(review): the `until` attribute presumably makes the (de)serializer stop at the
    // terminating 0x00 byte without storing it in `data` -- confirm against the derive macro.
    #[until(u8, u8, 0x00u8)]
    pub data: Vec<u8>,
}

/// Null byte: it is rejected when found inside the data of a string being decoded.
const TERMINATION_BYTE: u8 = 0;
/// First byte (`1110_1101`) of each 3 bytes half of a 6 bytes surrogate pair.
const SURROGATE_BYTE: u8 = 0b1110_1101;
/// Mask selecting the prefix (high nibble) of the second byte of a surrogate pair half.
const MASK_SURROGATED_BYTE_PREFIX: u8 = 0b1111_0000;
/// Expected prefix of the second byte of the first half of a surrogate pair (`1010xxxx`).
const VALUE_SURROGATED_BYTE_1_PREFIX: u8 = 0b1010_0000;
/// Expected prefix of the second byte of the second half of a surrogate pair (`1011xxxx`).
const VALUE_SURROGATED_BYTE_2_PREFIX: u8 = 0b1011_0000;
/// Mask selecting the two prefix bits of a trailing byte.
const MASK_TRAYLING_BYTE_PREFIX: u8 = 0b1100_0000;
/// Expected prefix of a trailing byte (`10xxxxxx`).
const VALUE_TRAYLING_BYTE_PREFIX: u8 = 0b1000_0000;

impl Ord for StringDataItem {
    /// Order strings by the UTF-16 representation the AOSP uses for comparison.
    ///
    /// Items whose data cannot be decoded sort before every decodable item instead of making
    /// the comparison panic (a panic here would abort any sort or ordered-collection
    /// operation on strings read from a malformed file).
    ///
    /// Ties are broken with `utf16_size` and then with the raw bytes, so that `cmp` returns
    /// [`Ordering::Equal`] only for items that are also equal with `==`, as required by the
    /// [`Ord`] contract.
    fn cmp(&self, other: &Self) -> Ordering {
        // `Option` orders `None` (undecodable data) before any `Some(_)`.
        let lhs = self.get_aosp_utf16().ok();
        let rhs = other.get_aosp_utf16().ok();
        lhs.cmp(&rhs)
            .then_with(|| self.utf16_size.cmp(&other.utf16_size))
            .then_with(|| self.data.cmp(&other.data))
    }
}

impl PartialOrd for StringDataItem {
    /// Delegate to [`Ord::cmp`]: the order on `StringDataItem` is total, so this never
    /// returns `None`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl TryFrom<&StringDataItem> for String {
    type Error = Error;
    fn try_from(item: &StringDataItem) -> Result<String> {
        item.get_string()
    }
}
impl TryFrom<StringDataItem> for String {
    type Error = Error;

    /// Decode the MUTF-8 data of an owned `item` into a Rust [`String`].
    ///
    /// Same as the conversion from `&StringDataItem`, to which it delegates.
    fn try_from(item: StringDataItem) -> Result<Self> {
        Self::try_from(&item)
    }
}

impl From<&str> for StringDataItem {
    /// Encode `string` in MUTF-8.
    ///
    /// MUTF-8 is defined in terms of UTF-16 code units: each code unit is encoded on 1, 2 or
    /// 3 bytes. A code point above U+FFFF is therefore written as a surrogate pair taking
    /// 6 bytes, and U+0000 uses the 2 bytes form so that the data never contains a null byte.
    ///
    /// `utf16_size` is set to the number of UTF-16 code units of `string` (a code point above
    /// U+FFFF counts for 2). This conversion never panics: every `char` can be encoded.
    fn from(string: &str) -> Self {
        // Build a trailing byte (`10xxxxxx`) from the 6 low bits of `bits`.
        let trailing =
            |bits: u16| VALUE_TRAYLING_BYTE_PREFIX | (bits as u8 & !MASK_TRAYLING_BYTE_PREFIX);

        // The UTF-8 length is only a lower bound (null and surrogate pairs take more room).
        let mut data = Vec::with_capacity(string.len());
        let mut size = 0;
        for unit in string.encode_utf16() {
            match unit {
                0x0001..=0x007F => data.push(unit as u8),
                // U+0000 is deliberately routed here: it becomes `0xC0 0x80`.
                0x0000 | 0x0080..=0x07FF => {
                    data.push(0b1100_0000 | (unit >> 6) as u8);
                    data.push(trailing(unit));
                }
                // Includes the surrogates (0xD800..=0xDFFF): their first byte is
                // `SURROGATE_BYTE` and their second byte starts with `1010` or `1011`.
                _ => {
                    data.push(0b1110_0000 | (unit >> 12) as u8);
                    data.push(trailing(unit >> 6));
                    data.push(trailing(unit));
                }
            }
            size += 1;
        }
        Self {
            utf16_size: Uleb128(size),
            data,
        }
    }
}

impl StringDataItem {
    /// Return the UTF-16 string used by Google in the AOSP for comparison and other operations.
    ///
    /// The decoding is lenient: the prefix of the trailing bytes is not checked. A 4 bytes
    /// sequence is converted to a surrogate pair packed in a single `u32` (leading surrogate
    /// in the 16 low bits, trailing surrogate in the 16 high bits).
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidStringEncoding`] when the data ends in the middle of a
    /// multi-byte sequence.
    fn get_aosp_utf16(&self) -> Result<Vec<u32>> {
        let mut utf16_string = vec![];
        let mut i = 0;
        while i < self.data.len() {
            // 1 byte sequence: 0xxxxxxx
            let one = self.data[i] as u32;
            i += 1;
            if one & 0x80 == 0 {
                utf16_string.push(one);
                continue;
            }

            // 2 bytes sequence: 110xxxxx 10xxxxxx
            if i >= self.data.len() {
                return Err(Error::InvalidStringEncoding(
                    "String contains invalid caracters".into(),
                ));
            }
            let two = self.data[i] as u32;
            i += 1;
            if one & 0x20 == 0 {
                utf16_string.push((one & 0x1f) << 6 | (two & 0x3f));
                continue;
            }

            // 3 bytes sequence: 1110xxxx 10xxxxxx 10xxxxxx
            if i >= self.data.len() {
                return Err(Error::InvalidStringEncoding(
                    "String contains invalid caracters".into(),
                ));
            }
            let three = self.data[i] as u32;
            i += 1;
            if one & 0x10 == 0 {
                utf16_string.push((one & 0x0f) << 12 | (two & 0x3f) << 6 | (three & 0x3f));
                continue;
            }

            // 4 bytes sequence, converted to a surrogate pair packed in one `u32`.
            if i >= self.data.len() {
                return Err(Error::InvalidStringEncoding(
                    "String contains invalid caracters".into(),
                ));
            }
            let four = self.data[i] as u32;
            i += 1;
            let code_point =
                (one & 0x0f) << 18 | (two & 0x3f) << 12 | (three & 0x3f) << 6 | (four & 0x3f);
            let mut pair = ((code_point >> 10) + 0xd7c0) & 0xffff;
            pair |= ((code_point & 0x03ff) + 0xdc00) << 16;
            utf16_string.push(pair);
        }
        Ok(utf16_string)
    }

    /// Decode the MUTF-8 data into a Rust [`String`].
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidStringEncoding`] when the data contains a null byte, an
    /// invalid leading or trailing byte, a truncated sequence, or a surrogate that is not
    /// part of a well-formed pair.
    fn get_string(&self) -> Result<String> {
        let mut string = String::new();
        let mut i = 0;
        while i < self.data.len() {
            if self.data[i] == TERMINATION_BYTE {
                return Err(Error::InvalidStringEncoding(
                    "String should not contains null bytes".into(),
                ));
            }
            if self.data[i] == SURROGATE_BYTE {
                match self.get_surrogate(i) {
                    Ok(chr) => {
                        string.push(chr);
                        i += 6;
                        continue;
                    }
                    // Not even enough bytes left for a 3 bytes character.
                    Err(err) if i + 3 > self.data.len() => {
                        return Err(err);
                    }
                    // Else, it may be a 3 bytes character (U+D000 to U+D7FF also start
                    // with `SURROGATE_BYTE`).
                    Err(_) => (),
                }
            }
            // The number of leading 1 bits is the length of the sequence (0 for a single
            // byte character).
            let leading_bits = self.data[i].leading_ones() as usize;
            if leading_bits == 1 || leading_bits > 3 {
                let byte = self.data[i];
                return Err(Error::InvalidStringEncoding(format!(
                    "data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
                )));
            }

            string.push(self.get_non_surogated(i, leading_bits)?);
            i += leading_bits.max(1);
        }
        Ok(string)
    }

    /// Decode the 6 bytes surrogate pair starting at `data[i]` into a code point in the
    /// range U+10000 to U+10FFFF.
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidStringEncoding`] when fewer than 6 bytes are left or when one
    /// of the 6 bytes does not have the prefix expected in a surrogate pair.
    fn get_surrogate(&self, i: usize) -> Result<char> {
        if i + 6 > self.data.len() {
            return Err(Error::InvalidStringEncoding(
                "Found surogate byte, but not enought bytes left to form a surogate pair".into(),
            ));
        }
        if self.data[i] != SURROGATE_BYTE {
            let byte = self.data[i];
            return Err(Error::InvalidStringEncoding(format!(
                "data[{i}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b1)"
            )));
        }
        if self.data[i + 3] != SURROGATE_BYTE {
            let j = i + 3;
            let byte = self.data[j];
            return Err(Error::InvalidStringEncoding(format!(
                "data[{j}]: 0x{byte:02x} != 0x{SURROGATE_BYTE:02x} (Surrogate byte, b4)"
            )));
        }
        if (self.data[i + 1] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_1_PREFIX {
            let j = i + 1;
            let byte = self.data[j];
            return Err(Error::InvalidStringEncoding(format!(
                "data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_1_PREFIX:02x} (First surogate byte prefix, b2)"
            )));
        }
        if (self.data[i + 4] & MASK_SURROGATED_BYTE_PREFIX) != VALUE_SURROGATED_BYTE_2_PREFIX {
            let j = i + 4;
            let byte = self.data[j];
            return Err(Error::InvalidStringEncoding(format!(
                "data[{j}]: 0x{byte:02x} & 0x{MASK_SURROGATED_BYTE_PREFIX:02x} != 0x{VALUE_SURROGATED_BYTE_2_PREFIX:02x} (Second surogate byte prefix, b5)"
            )));
        }
        if (self.data[i + 2] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
            let j = i + 2;
            let byte = self.data[j];
            return Err(Error::InvalidStringEncoding(format!(
                "data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailing byte prefix, b3)"
            )));
        }
        if (self.data[i + 5] & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
            let j = i + 5;
            let byte = self.data[j];
            return Err(Error::InvalidStringEncoding(format!(
                "data[{j}]: 0x{byte:02x} & 0x{MASK_TRAYLING_BYTE_PREFIX:02x} != 0x{VALUE_TRAYLING_BYTE_PREFIX:02x} (Surogate trailling byte prefix, b6)"
            )));
        }

        // Each half of the pair carries 10 bits: 4 in its second byte, 6 in its third byte.
        let mut surogated_hight = ((self.data[i + 1] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
        surogated_hight |= (self.data[i + 2] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
        let mut surogated_low = ((self.data[i + 4] & !MASK_SURROGATED_BYTE_PREFIX) as u32) << 6;
        surogated_low |= (self.data[i + 5] & !MASK_TRAYLING_BYTE_PREFIX) as u32;
        // The 20 bits encode `code point - 0x10000`: the offset must be added, not or-ed,
        // because bit 16 of the payload is set for every code point from U+20000.
        let code_point = 0x10000 + ((surogated_hight << 10) | surogated_low);

        let slice = &self.data[i..(i + 6)];
        char::from_u32(code_point).ok_or_else(|| {
            Error::InvalidStringEncoding(format!(
                "Invalide unicode code point in surrogated pair: {slice:02x?}: {code_point}"
            ))
        })
    }

    /// Decode the 1, 2 or 3 bytes character starting at `data[i]`.
    ///
    /// `leading_bits` is the number of leading 1 bits of `data[i]`: 0 for a single byte
    /// character, else the number of bytes of the sequence.
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidStringEncoding`] when the sequence is truncated, when a
    /// trailing byte does not start with `10`, or when the decoded value is not a valid
    /// `char` (e.g. a lone surrogate).
    fn get_non_surogated(&self, i: usize, leading_bits: usize) -> Result<char> {
        if leading_bits == 0 {
            let byte = self.data[i];
            char::from_u32(byte as u32).ok_or_else(|| {
                Error::InvalidStringEncoding(format!("Invalide unicode code point: {byte}"))
            })
        } else {
            if i + leading_bits > self.data.len() {
                return Err(Error::InvalidStringEncoding(format!(
                    "Found a {leading_bits} long code point at {i} but not enought bytes"
                )));
            }
            // Payload of the first byte: the bits after the leading 1s and the 0 ending them.
            let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32;
            let slice = &self.data[i..(i + leading_bits)];
            for s in 1..leading_bits {
                let j = i + s;
                let byte = self.data[j];
                if (byte & MASK_TRAYLING_BYTE_PREFIX) != VALUE_TRAYLING_BYTE_PREFIX {
                    return Err(Error::InvalidStringEncoding(format!(
                        "Invalid byte {byte:02x} at {j} in {leading_bits} bytes long code point {slice:02x?}"
                    )));
                }
                code_point <<= 6;
                code_point |= (byte & !MASK_TRAYLING_BYTE_PREFIX) as u32;
            }
            char::from_u32(code_point).ok_or_else(|| {
                Error::InvalidStringEncoding(format!(
                    "Invalide unicode code point: {code_point} ({slice:02x?})"
                ))
            })
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Test for bug found in <https://github.com/TkTech/mutf8/tree/master>:
    /// <https://github.com/TkTech/mutf8/blob/master/tests/test_bugs.py>
    #[test]
    fn bug_tktech_mutf8() {
        let string_issue_1 = "[가 나 다 라 마 바  사  아 자  차 카 타 파 하]";
        let encoded: StringDataItem = string_issue_1.into();
        let expected_encoded = StringDataItem {
            utf16_size: Uleb128(32),
            data: vec![
                0x5b, 0xea, 0xb0, 0x80, 0x20, 0xeb, 0x82, 0x98, 0x20, 0xeb, 0x8b, 0xa4, 0x20, 0xeb,
                0x9d, 0xbc, 0x20, 0xeb, 0xa7, 0x88, 0x20, 0xeb, 0xb0, 0x94, 0x20, 0x20, 0xec, 0x82,
                0xac, 0x20, 0x20, 0xec, 0x95, 0x84, 0x20, 0xec, 0x9e, 0x90, 0x20, 0x20, 0xec, 0xb0,
                0xa8, 0x20, 0xec, 0xb9, 0xb4, 0x20, 0xed, 0x83, 0x80, 0x20, 0xed, 0x8c, 0x8c, 0x20,
                0xed, 0x95, 0x98, 0x5d,
            ],
        };
        assert_eq!(encoded, expected_encoded);
        let decoded: String = encoded.try_into().unwrap();
        assert_eq!(&decoded, string_issue_1);

        let string_issue_3 = "黑人抬棺組裝包";
        let expected_encoded = StringDataItem {
            utf16_size: Uleb128(7),
            data: vec![
                0xe9, 0xbb, 0x91, 0xe4, 0xba, 0xba, 0xe6, 0x8a, 0xac, 0xe6, 0xa3, 0xba, 0xe7, 0xb5,
                0x84, 0xe8, 0xa3, 0x9d, 0xe5, 0x8c, 0x85,
            ],
        };
        let encoded: StringDataItem = string_issue_3.into();
        assert_eq!(encoded, expected_encoded);
        let decoded: String = encoded.try_into().unwrap();
        assert_eq!(&decoded, string_issue_3);
    }

    /// Test from <https://github.com/TkTech/mutf8/tree/master>, test for bad encoding
    #[test]
    fn test_tktech_bad_mutf8() {
        assert!(TryInto::<String>::try_into(StringDataItem {
            utf16_size: Uleb128(0),
            data: vec![0x00]
        })
        .is_err());
        assert!(TryInto::<String>::try_into(StringDataItem {
            utf16_size: Uleb128(0),
            data: vec![0xC2]
        })
        .is_err());
        assert!(TryInto::<String>::try_into(StringDataItem {
            utf16_size: Uleb128(0),
            data: vec![0xED]
        })
        .is_err());
        assert!(TryInto::<String>::try_into(StringDataItem {
            utf16_size: Uleb128(0),
            data: vec![0xE2]
        })
        .is_err());
    }

    /// Test from <https://github.com/TkTech/mutf8/tree/master>, test 2 bytes
    #[test]
    fn test_tktech_2_bytes_mutf8() {
        let tests = vec![
            (0x0080, vec![0xc2, 0x80]),
            (0x0081, vec![0xc2, 0x81]),
            (0x0082, vec![0xc2, 0x82]),
            (0x0084, vec![0xc2, 0x84]),
            (0x0088, vec![0xc2, 0x88]),
            (0x0090, vec![0xc2, 0x90]),
            (0x00a0, vec![0xc2, 0xa0]),
            (0x00c0, vec![0xc3, 0x80]),
            (0x0180, vec![0xc6, 0x80]),
            (0x0280, vec![0xca, 0x80]),
            (0x0480, vec![0xd2, 0x80]),
            (0x0481, vec![0xd2, 0x81]),
            (0x0483, vec![0xd2, 0x83]),
            (0x0487, vec![0xd2, 0x87]),
            (0x048f, vec![0xd2, 0x8f]),
            (0x049f, vec![0xd2, 0x9f]),
            (0x04af, vec![0xd2, 0xaf]),
            (0x04bf, vec![0xd2, 0xbf]),
            (0x04ff, vec![0xd3, 0xbf]),
            (0x05ff, vec![0xd7, 0xbf]),
            (0x05ff, vec![0xd7, 0xbf]),
            (0x07ff, vec![0xdf, 0xbf]),
        ];
        for (code_point, data) in tests {
            let encoded = StringDataItem {
                utf16_size: Uleb128(1),
                data,
            };
            let expected = char::from_u32(code_point).unwrap().to_string();
            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
            assert_eq!(encoded, expected.as_str().into());
        }
    }

    /// Test from <https://github.com/TkTech/mutf8/tree/master>, test 3 bytes
    #[test]
    fn test_tktech_3_bytes_mutf8() {
        let tests = vec![
            (0x0800, vec![0xe0, 0xa0, 0x80]),
            (0x0801, vec![0xe0, 0xa0, 0x81]),
            (0x0802, vec![0xe0, 0xa0, 0x82]),
            (0x0804, vec![0xe0, 0xa0, 0x84]),
            (0x0808, vec![0xe0, 0xa0, 0x88]),
            (0x0810, vec![0xe0, 0xa0, 0x90]),
            (0x0820, vec![0xe0, 0xa0, 0xa0]),
            (0x0840, vec![0xe0, 0xa1, 0x80]),
            (0x0880, vec![0xe0, 0xa2, 0x80]),
            (0x0900, vec![0xe0, 0xa4, 0x80]),
            (0x0a00, vec![0xe0, 0xa8, 0x80]),
            (0x0c00, vec![0xe0, 0xb0, 0x80]),
            (0x1800, vec![0xe1, 0xa0, 0x80]),
            (0x2800, vec![0xe2, 0xa0, 0x80]),
            (0x4800, vec![0xe4, 0xa0, 0x80]),
            (0x8800, vec![0xe8, 0xa0, 0x80]),
            (0x8801, vec![0xe8, 0xa0, 0x81]),
            (0x8803, vec![0xe8, 0xa0, 0x83]),
            (0x8807, vec![0xe8, 0xa0, 0x87]),
            (0x880f, vec![0xe8, 0xa0, 0x8f]),
            (0x881f, vec![0xe8, 0xa0, 0x9f]),
            (0x883f, vec![0xe8, 0xa0, 0xbf]),
            (0x887f, vec![0xe8, 0xa1, 0xbf]),
            (0x88ff, vec![0xe8, 0xa3, 0xbf]),
            (0x89ff, vec![0xe8, 0xa7, 0xbf]),
            (0x8bff, vec![0xe8, 0xaf, 0xbf]),
            (0x8fff, vec![0xe8, 0xbf, 0xbf]),
            (0x9fff, vec![0xe9, 0xbf, 0xbf]),
            (0xbfff, vec![0xeb, 0xbf, 0xbf]),
            (0xffff, vec![0xef, 0xbf, 0xbf]),
        ];
        for (code_point, data) in tests {
            let encoded = StringDataItem {
                utf16_size: Uleb128(1),
                data,
            };
            let expected = char::from_u32(code_point).unwrap().to_string();
            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
            assert_eq!(encoded, expected.as_str().into());
        }
    }

    #[test]
    fn test_3_bytes_mutf8() {
        let tests = vec![(0xac00, vec![0xea, 0xb0, 0x80])];
        for (code_point, data) in tests {
            let encoded = StringDataItem {
                utf16_size: Uleb128(1),
                data,
            };
            let expected = char::from_u32(code_point).unwrap().to_string();
            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
            assert_eq!(encoded, expected.as_str().into());
        }
    }

    /// Test 6 bytes (surrogate pairs).
    ///
    /// Most code points come from the 6 bytes test of
    /// <https://github.com/TkTech/mutf8/tree/master>, but the expected bytes are the ones
    /// mandated by the DEX format: the code point is split into a UTF-16 surrogate pair
    /// (after subtracting 0x10000) and each surrogate is encoded on 3 bytes, so U+10000 is
    /// `ed a0 80 ed b0 80`. Such a code point counts for 2 UTF-16 code units.
    #[test]
    fn test_6_bytes_mutf8() {
        let tests = vec![
            (0x10000, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80]),
            (0x10001, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x81]),
            (0x10002, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x82]),
            (0x10004, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x84]),
            (0x10008, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x88]),
            (0x10010, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x90]),
            (0x10020, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0xa0]),
            (0x10040, vec![0xed, 0xa0, 0x80, 0xed, 0xb1, 0x80]),
            (0x10080, vec![0xed, 0xa0, 0x80, 0xed, 0xb2, 0x80]),
            (0x10100, vec![0xed, 0xa0, 0x80, 0xed, 0xb4, 0x80]),
            (0x10200, vec![0xed, 0xa0, 0x80, 0xed, 0xb8, 0x80]),
            (0x10400, vec![0xed, 0xa0, 0x81, 0xed, 0xb0, 0x80]),
            (0x10800, vec![0xed, 0xa0, 0x82, 0xed, 0xb0, 0x80]),
            (0x11000, vec![0xed, 0xa0, 0x84, 0xed, 0xb0, 0x80]),
            (0x12000, vec![0xed, 0xa0, 0x88, 0xed, 0xb0, 0x80]),
            (0x14000, vec![0xed, 0xa0, 0x90, 0xed, 0xb0, 0x80]),
            (0x18000, vec![0xed, 0xa0, 0xa0, 0xed, 0xb0, 0x80]),
            (0x30000, vec![0xed, 0xa2, 0x80, 0xed, 0xb0, 0x80]),
            (0x50000, vec![0xed, 0xa4, 0x80, 0xed, 0xb0, 0x80]),
            (0x90000, vec![0xed, 0xa8, 0x80, 0xed, 0xb0, 0x80]),
            (0x10003, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x83]),
            (0x10007, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x87]),
            (0x1000f, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x8f]),
            (0x1001f, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0x9f]),
            (0x1003f, vec![0xed, 0xa0, 0x80, 0xed, 0xb0, 0xbf]),
            (0x1007f, vec![0xed, 0xa0, 0x80, 0xed, 0xb1, 0xbf]),
            (0x100ff, vec![0xed, 0xa0, 0x80, 0xed, 0xb3, 0xbf]),
            (0x101ff, vec![0xed, 0xa0, 0x80, 0xed, 0xb7, 0xbf]),
            (0x103ff, vec![0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf]),
            (0x107ff, vec![0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbf]),
            (0x10fff, vec![0xed, 0xa0, 0x83, 0xed, 0xbf, 0xbf]),
            (0x11fff, vec![0xed, 0xa0, 0x87, 0xed, 0xbf, 0xbf]),
            (0x13fff, vec![0xed, 0xa0, 0x8f, 0xed, 0xbf, 0xbf]),
            (0x17fff, vec![0xed, 0xa0, 0x9f, 0xed, 0xbf, 0xbf]),
            (0x1ffff, vec![0xed, 0xa0, 0xbf, 0xed, 0xbf, 0xbf]),
            (0x3ffff, vec![0xed, 0xa2, 0xbf, 0xed, 0xbf, 0xbf]),
            (0x7ffff, vec![0xed, 0xa6, 0xbf, 0xed, 0xbf, 0xbf]),
            (0xfffff, vec![0xed, 0xae, 0xbf, 0xed, 0xbf, 0xbf]),
            // Emoji, first code point of plane 2 and both ends of the last plane.
            (0x1f600, vec![0xed, 0xa0, 0xbd, 0xed, 0xb8, 0x80]),
            (0x20000, vec![0xed, 0xa1, 0x80, 0xed, 0xb0, 0x80]),
            (0x100000, vec![0xed, 0xaf, 0x80, 0xed, 0xb0, 0x80]),
            (0x10ffff, vec![0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf]),
        ];
        for (code_point, data) in tests {
            let encoded = StringDataItem {
                utf16_size: Uleb128(2),
                data,
            };
            let expected = char::from_u32(code_point).unwrap().to_string();
            assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected);
            assert_eq!(encoded, expected.as_str().into());
        }
    }

    /// Apparently I don't know how to code an order relation so here it is...
    #[test]
    fn test_ord_relation() {
        let s1: StringDataItem = "Landroidx/lifecycle/WithLifecycleStateKt$suspendWithStateAtLeastUnchecked$2$observer$1;".into();
        let s2: StringDataItem = "Lcom/google/android/material/search/SearchBarAnimationHelper$$ExternalSyntheticLambda4;".into();
        let s1_utf16 = s1.get_aosp_utf16().unwrap();
        let s2_utf16 = s2.get_aosp_utf16().unwrap();
        assert_eq!(s1_utf16 < s2_utf16, true);
        assert_eq!(s1_utf16 == s2_utf16, false);
        assert_eq!(s1_utf16 > s2_utf16, false);
        assert_eq!(s2_utf16 < s1_utf16, false);
        assert_eq!(s2_utf16 == s1_utf16, false);
        assert_eq!(s2_utf16 > s1_utf16, true);
        assert_eq!(s1 < s2, true);
        assert_eq!(s1 == s2, false);
        assert_eq!(s1 > s2, false);
        assert_eq!(s2 < s1, false);
        assert_eq!(s2 == s1, false);
        assert_eq!(s2 > s1, true);
    }
}