finish implementing mutf8

This commit is contained in:
Jean-Marie Mineau 2023-08-24 11:22:29 +02:00
parent 24f4b0b46d
commit 49d6094d6f
Signed by: histausse
GPG key ID: B66AEEDA9B645AD2

View file

@ -92,8 +92,6 @@ impl From<&str> for StringDataItem {
} else {
panic!("Code Point {code_point} for char {chr} is to big");
}
let slice = &data[..];
println!("0x{code_point:x} -> {slice:x?}");
for b in data {
ret_data.push(b);
}
@ -128,13 +126,13 @@ impl StringDataItem {
Err(err) if i + 3 > self.data.len() => {
return Err(err);
}
_ => (),
Err(_) => (),
}
// Else, it may be a 3 bytes character...
}
let mut leading_bits = 0;
for i in 0..8 {
if (self.data[0] & (1 << (7 - i))) != 0 {
for j in 0..8 {
if (self.data[i] & (1 << (7 - j))) != 0 {
leading_bits += 1;
} else {
break;
@ -146,6 +144,7 @@ impl StringDataItem {
"data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
)));
}
string.push(self.get_non_surogated(i, leading_bits)?);
if leading_bits == 0 {
i += 1;
@ -228,7 +227,7 @@ impl StringDataItem {
"Found a {leading_bits} long code point at {i} but not enought bytes"
)));
}
let mut code_point = (self.data[0] % (1 << (7 - leading_bits))) as u32;
let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32;
let slice = &self.data[i..(i + leading_bits)];
for s in 1..leading_bits {
let j = i + s;
@ -258,30 +257,32 @@ mod test {
fn bug_tktech_mutf8() {
let string_issue_1 = "[가 나 다 라 마 바 사 아 자 차 카 타 파 하]";
let encoded: StringDataItem = string_issue_1.into();
println!("encoded:");
for c in &encoded.data {
print!("0x{c:02x} ");
}
println!();
let expected_encoded = StringDataItem {
utf16_size: Uleb128(32),
data: vec![
0x5b, 0xea, 0xb0, 0x80, 0x20, 0xeb, 0x82, 0x98, 0x20, 0xeb, 0x8b, 0xa4, 0x20, 0xeb,
0x9d, 0xbc, 0x20, 0xeb, 0xa7, 0x88, 0x20, 0xeb, 0xb0, 0x94, 0x20, 0x20, 0xec, 0x82,
0xac, 0x20, 0x20, 0xec, 0x95, 0x84, 0x20, 0xec, 0x9e, 0x90, 0x20, 0x20, 0xec, 0xb0,
0xa8, 0x20, 0xec, 0xb9, 0xb4, 0x20, 0xed, 0x83, 0x80, 0x20, 0xed, 0x8c, 0x8c, 0x20,
0xed, 0x95, 0x98, 0x5d,
],
};
assert_eq!(encoded, expected_encoded);
let decoded: String = encoded.try_into().unwrap();
assert_eq!(&decoded, string_issue_1);
/*
let string_issue_3 = "黑人抬棺組裝包";
let encoded: StringDataItem = string_issue_2.into();
let expected_encoded = StringDataItem {
utf16_size: Uleb128(7),
data: vec![
0xe9, 0xbb, 0x91, 0xe4, 0xba, 0xba, 0xe6, 0x8a, 0xac, 0xe6, 0xa3, 0xba, 0xe7, 0xb5,
0x84, 0xe8, 0xa3, 0x9d, 0xe5, 0x8c, 0x85,
],
};
let encoded: StringDataItem = string_issue_3.into();
assert_eq!(encoded, expected_encoded);
let decoded: String = encoded.try_into().unwrap();
println!("encoded:");
for c in decoded.chars() {
let val: u32 = c.into();
print!("0x{val:08x} ");
}
println!();
for c in string_issue_2.chars() {
let val: u32 = c.into();
print!("0x{val:08x} ");
}
println!();
assert_eq!(&decoded, string_issue_2);
*/
assert_eq!(&decoded, string_issue_3);
}
/// Test from https://github.com/TkTech/mutf8/tree/master, test for bad encoding
@ -396,6 +397,20 @@ mod test {
}
}
/// Round-trip check for a code point whose encoded form is three bytes long.
///
/// Every case pairs a code point with the bytes expected in `data`: the bytes
/// must decode to the matching one-character string, and that string must
/// encode back to an identical `StringDataItem`.
#[test]
fn test_3_bytes_mutf8() {
    // (code point, expected encoded bytes)
    let cases = [(0xac00, vec![0xea, 0xb0, 0x80])];
    for (scalar, bytes) in cases {
        let item = StringDataItem {
            utf16_size: Uleb128(1),
            data: bytes,
        };
        let want = char::from_u32(scalar).unwrap().to_string();

        // Decoding direction: raw bytes -> String.
        let decoded = TryInto::<String>::try_into(&item).unwrap();
        assert_eq!(decoded, want);

        // Encoding direction: &str -> StringDataItem.
        assert_eq!(item, want.as_str().into());
    }
}
/// Test from https://github.com/TkTech/mutf8/tree/master, test 6 bytes
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
#[test]