diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 84fee05cb4ce20..65966fb0180220 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,51 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be lower cased + (e.g. a maximum of two characters for Unicode 16.0), and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to upper case, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be upper cased + (e.g. a maximum of three characters for Unicode 16.0), and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to title case, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be title cased + (e.g. a maximum of three characters for Unicode 16.0), and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Foldcase *ch*, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be foldcased + (e.g. 
a maximum of three characters for Unicode 16.0), and + return the number of characters stored. If the result does not fit in + *size* characters, a :exc:`ValueError` is raised and ``-1`` is returned. + + .. versionadded:: next + + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 86c502730f478c..ea9f9b5921c3c2 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,31 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + + // Helper array used by Py_UNICODE_ISSPACE(). 
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 3791b913c17546..c83a221bb6a3a1 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,10 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..931ce47ed2911e 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,61 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + from _testcapi import unicode_toupper + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + 
self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..65b5c45a33a895 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make :c:func:`PyUnicode_ToLower`, :c:func:`PyUnicode_ToUpper`, :c:func:`PyUnicode_ToTitle` and :c:func:`PyUnicode_ToFolded` public. 
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..21f6c0f62f11f5 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,60 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +static PyObject * +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t)) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); + + Py_UCS4 buf[3]; + Py_ssize_t chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + if (chars < 0) { + return NULL; + } + + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); +} + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToLower); +} + +/* Test PyUnicode_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToUpper); +} + + +/* Test PyUnicode_ToTitle() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToTitle); +} + +/* Test PyUnicode_ToFolded() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToFolded); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +631,10 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + 
{"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..da70f60b12c450 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,67 +198,103 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->lower; return 1; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->title; return 1; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return 
-1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->upper; return 1; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } - return _PyUnicode_ToLowerFull(ch, res); + + return PyUnicode_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5c2308a012142a..38b214df74a4df 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10037,34 +10037,35 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. 
*/ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUnicode_ToLower(c, mapped, mapped_size); } static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10079,17 +10080,18 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10106,11 +10108,12 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, 
length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10139,7 +10142,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); + Py_ssize_t j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10158,13 +10162,13 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToTitleFull(c, mapped); - + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j];