finish implementing mutf8
This commit is contained in:
parent
24f4b0b46d
commit
49d6094d6f
1 changed files with 41 additions and 26 deletions
|
|
@ -92,8 +92,6 @@ impl From<&str> for StringDataItem {
|
|||
} else {
|
||||
panic!("Code Point {code_point} for char {chr} is to big");
|
||||
}
|
||||
let slice = &data[..];
|
||||
println!("0x{code_point:x} -> {slice:x?}");
|
||||
for b in data {
|
||||
ret_data.push(b);
|
||||
}
|
||||
|
|
@ -128,13 +126,13 @@ impl StringDataItem {
|
|||
Err(err) if i + 3 > self.data.len() => {
|
||||
return Err(err);
|
||||
}
|
||||
_ => (),
|
||||
Err(_) => (),
|
||||
}
|
||||
// Else, it may be a 3 bytes character...
|
||||
}
|
||||
let mut leading_bits = 0;
|
||||
for i in 0..8 {
|
||||
if (self.data[0] & (1 << (7 - i))) != 0 {
|
||||
for j in 0..8 {
|
||||
if (self.data[i] & (1 << (7 - j))) != 0 {
|
||||
leading_bits += 1;
|
||||
} else {
|
||||
break;
|
||||
|
|
@ -146,6 +144,7 @@ impl StringDataItem {
|
|||
"data[{i}]: 0x{byte:02x} is an invalid MUTF-8 character"
|
||||
)));
|
||||
}
|
||||
|
||||
string.push(self.get_non_surogated(i, leading_bits)?);
|
||||
if leading_bits == 0 {
|
||||
i += 1;
|
||||
|
|
@ -228,7 +227,7 @@ impl StringDataItem {
|
|||
"Found a {leading_bits} long code point at {i} but not enought bytes"
|
||||
)));
|
||||
}
|
||||
let mut code_point = (self.data[0] % (1 << (7 - leading_bits))) as u32;
|
||||
let mut code_point = (self.data[i] % (1 << (7 - leading_bits))) as u32;
|
||||
let slice = &self.data[i..(i + leading_bits)];
|
||||
for s in 1..leading_bits {
|
||||
let j = i + s;
|
||||
|
|
@ -258,30 +257,32 @@ mod test {
|
|||
fn bug_tktech_mutf8() {
|
||||
let string_issue_1 = "[가 나 다 라 마 바 사 아 자 차 카 타 파 하]";
|
||||
let encoded: StringDataItem = string_issue_1.into();
|
||||
println!("encoded:");
|
||||
for c in &encoded.data {
|
||||
print!("0x{c:02x} ");
|
||||
}
|
||||
println!();
|
||||
let expected_encoded = StringDataItem {
|
||||
utf16_size: Uleb128(32),
|
||||
data: vec![
|
||||
0x5b, 0xea, 0xb0, 0x80, 0x20, 0xeb, 0x82, 0x98, 0x20, 0xeb, 0x8b, 0xa4, 0x20, 0xeb,
|
||||
0x9d, 0xbc, 0x20, 0xeb, 0xa7, 0x88, 0x20, 0xeb, 0xb0, 0x94, 0x20, 0x20, 0xec, 0x82,
|
||||
0xac, 0x20, 0x20, 0xec, 0x95, 0x84, 0x20, 0xec, 0x9e, 0x90, 0x20, 0x20, 0xec, 0xb0,
|
||||
0xa8, 0x20, 0xec, 0xb9, 0xb4, 0x20, 0xed, 0x83, 0x80, 0x20, 0xed, 0x8c, 0x8c, 0x20,
|
||||
0xed, 0x95, 0x98, 0x5d,
|
||||
],
|
||||
};
|
||||
assert_eq!(encoded, expected_encoded);
|
||||
let decoded: String = encoded.try_into().unwrap();
|
||||
assert_eq!(&decoded, string_issue_1);
|
||||
/*
|
||||
|
||||
let string_issue_3 = "黑人抬棺組裝包";
|
||||
let encoded: StringDataItem = string_issue_2.into();
|
||||
let expected_encoded = StringDataItem {
|
||||
utf16_size: Uleb128(7),
|
||||
data: vec![
|
||||
0xe9, 0xbb, 0x91, 0xe4, 0xba, 0xba, 0xe6, 0x8a, 0xac, 0xe6, 0xa3, 0xba, 0xe7, 0xb5,
|
||||
0x84, 0xe8, 0xa3, 0x9d, 0xe5, 0x8c, 0x85,
|
||||
],
|
||||
};
|
||||
let encoded: StringDataItem = string_issue_3.into();
|
||||
assert_eq!(encoded, expected_encoded);
|
||||
let decoded: String = encoded.try_into().unwrap();
|
||||
println!("encoded:");
|
||||
for c in decoded.chars() {
|
||||
let val: u32 = c.into();
|
||||
print!("0x{val:08x} ");
|
||||
}
|
||||
println!();
|
||||
for c in string_issue_2.chars() {
|
||||
let val: u32 = c.into();
|
||||
print!("0x{val:08x} ");
|
||||
}
|
||||
println!();
|
||||
assert_eq!(&decoded, string_issue_2);
|
||||
*/
|
||||
assert_eq!(&decoded, string_issue_3);
|
||||
}
|
||||
|
||||
/// Test from https://github.com/TkTech/mutf8/tree/master, test for bad encoding
|
||||
|
|
@ -396,6 +397,20 @@ mod test {
|
|||
}
|
||||
}
|
||||
|
||||
/// Round-trip check for 3-byte MUTF-8 sequences.
///
/// Every case pairs a Unicode code point with the bytes it is expected to
/// occupy inside a `StringDataItem`. The test verifies both directions:
/// decoding the raw bytes must yield the character, and encoding the
/// character must yield an item equal to the hand-built one.
#[test]
fn test_3_bytes_mutf8() {
    // (code point, expected encoded bytes); 0xac00 is the first char of the
    // string used in `bug_tktech_mutf8`.
    let cases = vec![(0xac00, vec![0xea, 0xb0, 0x80])];
    for (scalar, bytes) in cases {
        // Each case is a single character, hence a UTF-16 size of 1.
        let item = StringDataItem {
            utf16_size: Uleb128(1),
            data: bytes,
        };
        let text = char::from_u32(scalar).unwrap().to_string();

        // Bytes -> String.
        let decoded: String = TryInto::<String>::try_into(&item).unwrap();
        assert_eq!(decoded, text);

        // String -> bytes (and utf16_size).
        let reencoded: StringDataItem = text.as_str().into();
        assert_eq!(item, reencoded);
    }
}
|
||||
|
||||
/// Test from https://github.com/TkTech/mutf8/tree/master, test 6 bytes
|
||||
/// Test for bug found in https://github.com/TkTech/mutf8/tree/master:
|
||||
#[test]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue