Skip to content

Commit

Permalink
bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (G…
Browse files Browse the repository at this point in the history
…H-1958)

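The Hangul composition boundary checks in unicodedata.normalize were off by one.
The second (vowel) character was tested against the closed range [0x1161, 0x1176]
instead of the half-open range [0x1161, 0x1176), and the third (trailing consonant)
character was tested against [0x11A7, 0x11C3] instead of the open range (0x11A7, 0x11C3).
As a result U+1176, U+11A7 and U+11C3 were wrongly treated as composable.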
  • Loading branch information
Pusnow authored and zhangyangyu committed Jun 15, 2018
1 parent ceeef10 commit d134809
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 3 deletions.
13 changes: 13 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,19 @@ def test_issue10254(self):
b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b)

def test_issue29456(self):
# Regression test for bpo-29456: NFC Hangul composition must not treat
# U+1176, U+11A7 or U+11C3 as composable jamo.
#
# U+1176 is one past the last modern vowel (U+1175), so the L + V pair must
# not be composed and the whole sequence is expected to come back unchanged.
u1176_str_a = '\u1100\u1176\u11a8'
u1176_str_b = '\u1100\u1176\u11a8'
# U+11A7 is one before the first modern trailing consonant (U+11A8): only
# the leading L + V pair composes (to U+AE30); U+11A7 is kept as is.
u11a7_str_a = '\u1100\u1175\u11a7'
u11a7_str_b = '\uae30\u11a7'
# U+11C3 is one past the last modern trailing consonant (U+11C2): again
# only the L + V pair composes and U+11C3 is kept as is.
u11c3_str_a = '\u1100\u1175\u11c3'
u11c3_str_b = '\uae30\u11c3'
# Each *_a input must normalize under NFC to its matching *_b value.
self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)


def test_east_asian_width(self):
eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, b'a')
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -1800,6 +1800,7 @@ Jason Yeo
EungJun Yi
Bob Yodlowski
Danny Yoo
Wonsup Yoon
Rory Yorke
George Yoshida
Kazuhiro Yoshida
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix bugs in Hangul normalization of U+1176, U+11A7 and U+11C3.
10 changes: 7 additions & 3 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
if (LBase <= code && code < (LBase+LCount) &&
i + 1 < len &&
VBase <= PyUnicode_READ(kind, data, i+1) &&
PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
/* check L character is a modern leading consonant (0x1100 ~ 0x1112)
and V character is a modern vowel (0x1161 ~ 0x1175). */
int LIndex, VIndex;
LIndex = code - LBase;
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
code = SBase + (LIndex*VCount+VIndex)*TCount;
i+=2;
if (i < len &&
TBase <= PyUnicode_READ(kind, data, i) &&
PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
TBase < PyUnicode_READ(kind, data, i) &&
PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
/* check T character is a modern trailing consonant
(0x11A8 ~ 0x11C2). */
code += PyUnicode_READ(kind, data, i)-TBase;
i++;
}
Expand Down

0 comments on commit d134809

Please sign in to comment.