From 431abbaa89d737c22097fa1c967775e4b81f217c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 15:28:11 +0200 Subject: [PATCH 01/12] gh-76535: Make `PyUnicode_ToLowerFull` and friends public Make `PyUnicode_ToLowerFull`, `PyUnicode_ToUpperFull` and `PyUnicode_ToTitleFull` public and rename them to `PyUnicode_ToLower` etc. --- Doc/c-api/unicode.rst | 30 ++++++++++++++++++ Include/cpython/unicodeobject.h | 15 +++++++++ Include/internal/pycore_unicodeobject.h | 3 -- Objects/unicodectype.c | 42 +++++++++++++++++-------- Objects/unicodeobject.c | 10 +++--- 5 files changed, 79 insertions(+), 21 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 84fee05cb4ce20..90e6a382ea3078 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,36 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. 
+ Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 86c502730f478c..d52d86105e7d84 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,21 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(int) PyUnicode_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + +PyAPI_FUNC(int) PyUnicode_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + +PyAPI_FUNC(int) PyUnicode_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 3791b913c17546..cc1368fb63d4ae 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,9 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..9f10c02f67fd1a 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) { 
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -206,15 +206,21 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->lower; + + if (res != NULL) { + res[0] = ch + ctype->lower; + } return 1; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -222,15 +228,20 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->title; + if (res != NULL) { + res[0] = ch + ctype->title; + } return 1; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -238,11 +249,16 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->upper; + if (res != NULL) { + res[0] = ch + ctype->upper; + } return 1; } @@ -258,7 +274,7 @@ int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } - return _PyUnicode_ToLowerFull(ch, res); + return PyUnicode_ToLowerFull(ch, res); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 
5c2308a012142a..6c9c3ccfca4dea 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10046,7 +10046,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUnicode_ToLower(c, mapped); } static Py_ssize_t @@ -10057,7 +10057,7 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10084,7 +10084,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 n_res = lower_ucs4(kind, data, length, i, c, mapped); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped); } else { n_res = 1; @@ -10110,7 +10110,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10163,7 +10163,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped); else - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From d604fc8ed0edfcc06c20da42489cd130523eea1b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 16:52:21 +0200 Subject: [PATCH 02/12] Address feedback; add size parameter and do PyUnicode_ToFolded as well --- Doc/c-api/unicode.rst | 43 +++++++++++++++-------- 
Include/cpython/unicodeobject.h | 16 +++++++-- Include/internal/pycore_unicodeobject.h | 1 - Objects/unicodectype.c | 45 +++++++++++++++++++++---- Objects/unicodeobject.c | 22 ++++++------ 5 files changed, 91 insertions(+), 36 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 90e6a382ea3078..1e3d0c0b1ec1ff 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,36 +307,51 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + able to hold as many characters needed for *ch* to be lower cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + Convert *ch* to upper case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be upper cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. 
If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + Convert *ch* to title case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be title cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) + + Foldcase *ch*, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be foldcased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next + These APIs can be used to work with surrogates: .. 
c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index d52d86105e7d84..f9142af0057b78 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -735,19 +735,29 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( PyAPI_FUNC(int) PyUnicode_ToLower( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); PyAPI_FUNC(int) PyUnicode_ToUpper( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); PyAPI_FUNC(int) PyUnicode_ToTitle( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); +PyAPI_FUNC(int) PyUnicode_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ + ); + + // Helper array used by Py_UNICODE_ISSPACE(). 
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index cc1368fb63d4ae..c83a221bb6a3a1 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,7 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 9f10c02f67fd1a..2ef667c30a1690 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -208,6 +208,10 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } @@ -215,12 +219,16 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->lower; } return 1; } -int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } return n; } if (res != NULL) { + if (0 >= size) { + 
PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->title; } return 1; } -int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -251,18 +267,26 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } return n; } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->upper; } return 1; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -270,11 +294,18 @@ int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - return PyUnicode_ToLowerFull(ch, res); + return PyUnicode_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6c9c3ccfca4dea..4a3b77b727657a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10039,14 +10039,14 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i static int lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, int mapped_size) { /* Obscure special case. 
*/ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return PyUnicode_ToLower(c, mapped); + return PyUnicode_ToLower(c, mapped, mapped_size); } static Py_ssize_t @@ -10057,14 +10057,14 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10081,10 +10081,10 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, 3); } else { n_res = 1; @@ -10108,9 +10108,9 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); else - n_res = PyUnicode_ToUpper(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10139,7 +10139,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, 
mapped); + int j, n_res = PyUnicode_ToFolded(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10161,9 +10161,9 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m int n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); else - n_res = PyUnicode_ToTitle(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From fbbf8412011cb56092d2f90300f00b8096a725fe Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:56:46 +0000 Subject: [PATCH 03/12] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..65b5c45a33a895 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make :c:func:`PyUnicode_ToLower`, :c:func:`PyUnicode_ToUpper`, :c:func:`PyUnicode_ToTitle` and :c:func:`PyUnicode_ToFolded` public. 
From f17aa0c55e0014fc2f1e19aa041accdaf755a051 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 17:29:35 +0200 Subject: [PATCH 04/12] Address more feedback; assert return value and raise ValueError --- Doc/c-api/unicode.rst | 8 ++++---- Objects/unicodectype.c | 14 +++++++------- Objects/unicodeobject.c | 25 +++++++++++++++---------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 1e3d0c0b1ec1ff..879b76b770f8fa 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -313,7 +313,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be lower cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -324,7 +324,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be upper cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -335,7 +335,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be title cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. 
versionadded:: next @@ -346,7 +346,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be foldcased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 2ef667c30a1690..66a7d9d85e67cd 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -209,7 +209,7 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -220,7 +220,7 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->lower; @@ -239,7 +239,7 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -249,7 +249,7 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) } if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->title; @@ -268,7 +268,7 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { 
if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -278,7 +278,7 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) } if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->upper; @@ -297,7 +297,7 @@ int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4a3b77b727657a..05e9dbf7d3fa51 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10057,14 +10057,16 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped, 3); + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10081,15 +10083,16 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = 
lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped, 3); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10108,9 +10111,10 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToUpper(c, mapped, 3); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10139,7 +10143,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = PyUnicode_ToFolded(c, mapped, 3); + int j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10161,10 +10166,10 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m int n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToTitle(c, mapped, 3); - + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; From 4a704898102c0bc94d1490732cc6fa70c0f01515 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 18:32:04 +0200 
Subject: [PATCH 05/12] Add tests --- Lib/test/test_capi/test_unicode.py | 49 +++++++++++++ Modules/_testcapi/unicode.c | 110 +++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..2f9a2e0b8b5b51 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,55 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + from _testcapi import unicode_toupper + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + 
class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..01c4caef6e2a01 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,112 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_tolower only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 lower[3]; + int chars = PyUnicode_ToLower(c, lower, Py_ARRAY_LENGTH(lower)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, lower, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + +/* Test PyUnicode_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_toupper only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 upper[3]; + int chars = PyUnicode_ToUpper(c, upper, Py_ARRAY_LENGTH(upper)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, upper, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_totitle only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = 
PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 title[3]; + int chars = PyUnicode_ToTitle(c, title, Py_ARRAY_LENGTH(title)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, title, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_tofolded only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 folded[3]; + int chars = PyUnicode_ToFolded(c, folded, Py_ARRAY_LENGTH(folded)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, folded, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +683,10 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; From 61afd9a5f7a9906b5f38175833ada9a7b1993a0d Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 18:38:41 +0200 Subject: [PATCH 06/12] Document the maximum numbers of characters needed in the buffer --- Doc/c-api/unicode.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 879b76b770f8fa..9021b6142ed199 
100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -310,7 +310,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased, and + able to hold as many characters needed for *ch* to be lower cased + (e.g. a maximum of two character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -321,7 +322,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to upper case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be upper cased, and + able to hold as many characters needed for *ch* to be upper cased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -332,7 +334,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to title case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be title cased, and + able to hold as many characters needed for *ch* to be title cased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. 
@@ -343,7 +346,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) Foldcase *ch*, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be foldcased, and + able to hold as many characters needed for *ch* to be foldcased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. From 7885b17116451e3bc5e59e6fe281db0be573e195 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 13:56:02 +0200 Subject: [PATCH 07/12] Address feedback; test more characters and refactor _testcapi functions --- Lib/test/test_capi/test_unicode.py | 6 ++ Modules/_testcapi/unicode.c | 90 +++++++----------------------- 2 files changed, 27 insertions(+), 69 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 2f9a2e0b8b5b51..931ce47ed2911e 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1765,6 +1765,7 @@ def test_tolower(self): # Test unicode character self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1778,6 +1779,8 @@ def test_toupper(self): # Test unicode character self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1788,6 +1791,8 @@ def test_totitle(self): # Test unicode character self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") @support.cpython_only 
@unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1798,6 +1803,7 @@ def test_tofolded(self): # Test unicode character self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") # Test case-ignorable character self.assertEqual(unicode_tofolded("👍"), "👍") diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 01c4caef6e2a01..9959a7c613da48 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,56 +220,46 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } -/* Test PyUnicode_ToLower() */ static PyObject * -unicode_tolower(PyObject *self, PyObject *arg) +unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), const char *name) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_tolower only accepts 1-character strings"); + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_Format(PyExc_ValueError, "%s only accepts 1-character strings", name); return NULL; } - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); - Py_UCS4 lower[3]; - int chars = PyUnicode_ToLower(c, lower, Py_ARRAY_LENGTH(lower)); - assert(chars >= 1); + Py_UCS4 buf[3]; + int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + if (chars <= 0) { + PyErr_BadInternalCall(); + return NULL; + } PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); if (writer == NULL) { return NULL; } - if (PyUnicodeWriter_WriteUCS4(writer, lower, chars) < 0) { + if (PyUnicodeWriter_WriteUCS4(writer, buf, chars) < 0) { PyUnicodeWriter_Discard(writer); return NULL; } return PyUnicodeWriter_Finish(writer); } +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToLower, "unicode_tolower"); +} + /* Test PyUnicode_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - if 
(PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_toupper only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 upper[3]; - int chars = PyUnicode_ToUpper(c, upper, Py_ARRAY_LENGTH(upper)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, upper, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToUpper, "unicode_toupper"); } @@ -277,52 +267,14 @@ unicode_toupper(PyObject *self, PyObject *arg) static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_totitle only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 title[3]; - int chars = PyUnicode_ToTitle(c, title, Py_ARRAY_LENGTH(title)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, title, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToTitle, "unicode_totitle"); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_tofolded only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 folded[3]; - int chars = PyUnicode_ToFolded(c, folded, Py_ARRAY_LENGTH(folded)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, folded, chars) < 0) { - 
PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToFolded, "unicode_tofolded"); } From 6f9cb9572009d1cd7e55e229b15992490e146a19 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:14:03 +0200 Subject: [PATCH 08/12] Address more review comments --- Modules/_testcapi/unicode.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 9959a7c613da48..057bc3b7a6f1c9 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,10 +221,15 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), const char *name) +unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) { + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + if (PyUnicode_GET_LENGTH(str) != 1) { - PyErr_Format(PyExc_ValueError, "%s only accepts 1-character strings", name); + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); return NULL; } @@ -233,33 +238,24 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), Py_UCS4 buf[3]; int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); if (chars <= 0) { - PyErr_BadInternalCall(); return NULL; } - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, buf, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tolower(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToLower, "unicode_tolower"); + return 
unicode_case_operation(arg, PyUnicode_ToLower); } /* Test PyUnicode_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToUpper, "unicode_toupper"); + return unicode_case_operation(arg, PyUnicode_ToUpper); } @@ -267,14 +263,14 @@ unicode_toupper(PyObject *self, PyObject *arg) static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToTitle, "unicode_totitle"); + return unicode_case_operation(arg, PyUnicode_ToTitle); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToFolded, "unicode_tofolded"); + return unicode_case_operation(arg, PyUnicode_ToFolded); } From 6a974c44b5767aa95319e412de0091bae02fea18 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:19:05 +0200 Subject: [PATCH 09/12] Disallow passing NULL --- Doc/c-api/unicode.rst | 20 +++++------- Objects/unicodectype.c | 73 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 54 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 9021b6142ed199..11dd600c669243 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -312,9 +312,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased (e.g. a maximum of two character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. ..
versionadded:: next @@ -324,9 +323,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -336,9 +334,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -348,9 +345,8 @@ These APIs can be used for fast direct character conversions: Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. ..
versionadded:: next diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 66a7d9d85e67cd..ec0ae918b339ee 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -207,24 +207,20 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->lower >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->lower; + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->lower; return 1; } @@ -237,23 +233,20 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->title >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->title; + + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->title; return 1; } @@ -266,23 +259,20 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->upper >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - 
res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->upper; + + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->upper; return 1; } @@ -295,16 +285,15 @@ int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) int n = (ctype->lower >> 20) & 7; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } + return PyUnicode_ToLower(ch, res, size); } From ae033ff5d93131320b78b4efb33daaed4297fadf Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:29:10 +0200 Subject: [PATCH 10/12] Only return NULL when chars < 0 in C test functions Co-authored-by: Victor Stinner --- Modules/_testcapi/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 057bc3b7a6f1c9..cb1e2df5739211 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -237,7 +237,7 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) Py_UCS4 buf[3]; int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); - if (chars <= 0) { + if (chars < 0) { return NULL; } From e7ef477c36fdd2892c68243d257ec8584cd4ad7a Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 15:00:00 +0200 Subject: [PATCH 11/12] Use Py_ssize_t and don't check overflow in loop --- Doc/c-api/unicode.rst | 8 
++--- Include/cpython/unicodeobject.h | 16 ++++----- Modules/_testcapi/unicode.c | 4 +-- Objects/unicodectype.c | 62 ++++++++++++++++----------------- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 11dd600c669243..65966fb0180220 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,7 +307,7 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased @@ -318,7 +318,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased @@ -329,7 +329,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased @@ -340,7 +340,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. 
c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index f9142af0057b78..ea9f9b5921c3c2 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,28 +733,28 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(int) PyUnicode_ToLower( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToLower( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToUpper( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToUpper( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToTitle( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToTitle( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToFolded( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToFolded( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index cb1e2df5739211..21f6c0f62f11f5 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,7 +221,7 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t)) { if (!PyUnicode_Check(str)) { PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); @@ -236,7 +236,7 @@ unicode_case_operation(PyObject *str, int 
(*function)(Py_UCS4, Py_UCS4 *, int)) Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); Py_UCS4 buf[3]; - int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + Py_ssize_t chars = function(c, buf, Py_ARRAY_LENGTH(buf)); if (chars < 0) { return NULL; } diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index ec0ae918b339ee..da70f60b12c450 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,25 +198,25 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -224,25 +224,25 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { 
PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -250,25 +250,25 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -276,21 +276,21 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } From fff25db403f4ffc5fa56ef3b14250ee229611368 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 19:59:13 +0200 Subject: [PATCH 12/12] Use Py_ssize_t for return value variable in unicodeobject.c --- Objects/unicodeobject.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 05e9dbf7d3fa51..38b214df74a4df 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10037,9 +10037,9 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped, int mapped_size) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { @@ -10052,8 +10052,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); @@ -10081,7 +10080,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } @@ -10109,7 +10108,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else @@ -10143,7 +10142,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + Py_ssize_t j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, 
mapped[j]); @@ -10163,7 +10162,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped));