diff --git a/androscalpel_serializer/src/core/string.rs b/androscalpel_serializer/src/core/string.rs index a7cf187..85ebf83 100644 --- a/androscalpel_serializer/src/core/string.rs +++ b/androscalpel_serializer/src/core/string.rs @@ -92,8 +92,6 @@ impl From<&str> for StringDataItem { } else { panic!("Code Point {code_point} for char {chr} is to big"); } - let slice = &data[..]; - println!("0x{code_point:x} -> {slice:x?}"); for b in data { ret_data.push(b); } @@ -128,13 +126,13 @@ impl StringDataItem { Err(err) if i + 3 > self.data.len() => { return Err(err); } - _ => (), + Err(_) => (), } // Else, it may be a 3 bytes character... } let mut leading_bits = 0; - for i in 0..8 { - if (self.data[0] & (1 << (7 - i))) != 0 { + for j in 0..8 { + if (self.data[i] & (1 << (7 - j))) != 0 { leading_bits += 1; } else { break; @@ -146,6 +144,7 @@ impl StringDataItem { "data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character" ))); } + string.push(self.get_non_surogated(i, leading_bits)?); if leading_bits == 0 { i += 1; @@ -228,7 +227,7 @@ impl StringDataItem { "Found a {leading_bits} long code point at {i} but not enought bytes" ))); } - let mut code_point = (self.data[0] % (1 << (7 - leading_bits))) as u32; + let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32; let slice = &self.data[i..(i + leading_bits)]; for s in 1..leading_bits { let j = i + s; @@ -258,30 +257,32 @@ mod test { fn bug_tktech_mutf8() { let string_issue_1 = "[가 나 다 라 마 바  사  아 자  차 카 타 파 하]"; let encoded: StringDataItem = string_issue_1.into(); - println!("encoded:"); - for c in &encoded.data { - print!("0x{c:02x} "); - } - println!(); + let expected_encoded = StringDataItem { + utf16_size: Uleb128(32), + data: vec![ + 0x5b, 0xea, 0xb0, 0x80, 0x20, 0xeb, 0x82, 0x98, 0x20, 0xeb, 0x8b, 0xa4, 0x20, 0xeb, + 0x9d, 0xbc, 0x20, 0xeb, 0xa7, 0x88, 0x20, 0xeb, 0xb0, 0x94, 0x20, 0x20, 0xec, 0x82, + 0xac, 0x20, 0x20, 0xec, 0x95, 0x84, 0x20, 0xec, 0x9e, 0x90, 0x20, 0x20, 0xec, 0xb0, + 0xa8, 0x20, 0xec, 0xb9,
0xb4, 0x20, 0xed, 0x83, 0x80, 0x20, 0xed, 0x8c, 0x8c, 0x20, + 0xed, 0x95, 0x98, 0x5d, + ], + }; + assert_eq!(encoded, expected_encoded); let decoded: String = encoded.try_into().unwrap(); assert_eq!(&decoded, string_issue_1); - /* + let string_issue_3 = "黑人抬棺組裝包"; - let encoded: StringDataItem = string_issue_2.into(); + let expected_encoded = StringDataItem { + utf16_size: Uleb128(7), + data: vec![ + 0xe9, 0xbb, 0x91, 0xe4, 0xba, 0xba, 0xe6, 0x8a, 0xac, 0xe6, 0xa3, 0xba, 0xe7, 0xb5, + 0x84, 0xe8, 0xa3, 0x9d, 0xe5, 0x8c, 0x85, + ], + }; + let encoded: StringDataItem = string_issue_3.into(); + assert_eq!(encoded, expected_encoded); let decoded: String = encoded.try_into().unwrap(); - println!("encoded:"); - for c in decoded.chars() { - let val: u32 = c.into(); - print!("0x{val:08x} "); - } - println!(); - for c in string_issue_2.chars() { - let val: u32 = c.into(); - print!("0x{val:08x} "); - } - println!(); - assert_eq!(&decoded, string_issue_2); - */ + assert_eq!(&decoded, string_issue_3); } /// Test from https://github.com/TkTech/mutf8/tree/master, test for bad encoding @@ -396,6 +397,20 @@ mod test { } } + #[test] + fn test_3_bytes_mutf8() { + let tests = vec![(0xac00, vec![0xea, 0xb0, 0x80])]; + for (code_point, data) in tests { + let encoded = StringDataItem { + utf16_size: Uleb128(1), + data, + }; + let expected = char::from_u32(code_point).unwrap().to_string(); + assert_eq!(TryInto::<String>::try_into(&encoded).unwrap(), expected); + assert_eq!(encoded, expected.as_str().into()); + } + } + /// Test from https://github.com/TkTech/mutf8/tree/master, test 6 bytes /// Test for bug found in https://github.com/TkTech/mutf8/tree/master: #[test]