diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h
index 8cd141545bbb..da6113ed427b 100644
--- a/mypyc/lib-rt/CPy.h
+++ b/mypyc/lib-rt/CPy.h
@@ -759,6 +759,8 @@ PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors);
 Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start);
 Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
 CPyTagged CPyStr_Ord(PyObject *obj);
+PyObject *CPyStr_Lower(PyObject *self);
+PyObject *CPyStr_Upper(PyObject *self);
 
 
 // Bytes operations
diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c
index 337ef14fc955..9e18bfa21250 100644
--- a/mypyc/lib-rt/str_ops.c
+++ b/mypyc/lib-rt/str_ops.c
@@ -606,3 +606,56 @@ CPyTagged CPyStr_Ord(PyObject *obj) {
         PyExc_TypeError, "ord() expected a character, but a string of length %zd found", s);
     return CPY_INT_TAG;
 }
+
+// Call the built-in str method `name` ("lower" or "upper") on self and return
+// a new reference, or NULL with an exception set. Used for non-ASCII input,
+// where case mapping is not 1:1: the length can change ("ß" -> "SS"), the
+// mapping can depend on context (final sigma), and the result can need a
+// wider or narrower storage kind than the input ("ÿ" -> "Ÿ", "K" -> "k").
+static PyObject *CPyStr_CallCaseMethod(PyObject *self, const char *name) {
+    return PyObject_CallMethod((PyObject *)&PyUnicode_Type, name, "(O)", self);
+}
+
+PyObject *CPyStr_Lower(PyObject *self) {
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+
+    // Fast path: ASCII only. The mapping is 1:1 and the result stays ASCII.
+    if (PyUnicode_IS_ASCII(self)) {
+        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+        PyObject *res = PyUnicode_New(len, 127);
+        if (res == NULL)
+            return NULL;
+        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
+        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
+        for (Py_ssize_t i = 0; i < len; i++) {
+            res_data[i] = Py_TOLOWER((unsigned char) data[i]);
+        }
+        return res;
+    }
+
+    // General Unicode path: defer to str.lower() for full case mapping.
+    return CPyStr_CallCaseMethod(self, "lower");
+}
+
+PyObject *CPyStr_Upper(PyObject *self) {
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+
+    // Fast path: ASCII only. The mapping is 1:1 and the result stays ASCII.
+    if (PyUnicode_IS_ASCII(self)) {
+        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+        PyObject *res = PyUnicode_New(len, 127);
+        if (res == NULL)
+            return NULL;
+        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
+        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
+        for (Py_ssize_t i = 0; i < len; i++) {
+            res_data[i] = Py_TOUPPER((unsigned char) data[i]);
+        }
+        return res;
+    }
+
+    // General Unicode path: defer to str.upper() for full case mapping.
+    return CPyStr_CallCaseMethod(self, "upper");
+}
diff --git a/mypyc/primitives/str_ops.py b/mypyc/primitives/str_ops.py
index a8f4e4df74c2..6abd2a5b5e3c 100644
--- a/mypyc/primitives/str_ops.py
+++ b/mypyc/primitives/str_ops.py
@@ -480,3 +480,21 @@
     c_function_name="CPyStr_Ord",
     error_kind=ERR_MAGIC,
 )
+
+# str.lower()
+method_op(
+    name="lower",
+    arg_types=[str_rprimitive],
+    return_type=str_rprimitive,
+    c_function_name="CPyStr_Lower",
+    error_kind=ERR_MAGIC,
+)
+
+# str.upper()
+method_op(
+    name="upper",
+    arg_types=[str_rprimitive],
+    return_type=str_rprimitive,
+    c_function_name="CPyStr_Upper",
+    error_kind=ERR_MAGIC,
+)
diff --git a/mypyc/test-data/fixtures/ir.py b/mypyc/test-data/fixtures/ir.py
index c041c661741c..83dae8553d19 100644
--- a/mypyc/test-data/fixtures/ir.py
+++ b/mypyc/test-data/fixtures/ir.py
@@ -113,7 +113,6 @@ def lstrip(self, item: Optional[str] = None) -> str: pass
     def rstrip(self, item: Optional[str] = None) -> str: pass
     def join(self, x: Iterable[str]) -> str: pass
     def format(self, *args: Any, **kwargs: Any) -> str: ...
-    def upper(self) -> str: ...
     def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
     def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
     def replace(self, old: str, new: str, maxcount: int=...) -> str: ...
@@ -123,6 +122,8 @@ def rpartition(self, sep: str, /) -> Tuple[str, str, str]: ...
     def removeprefix(self, prefix: str, /) -> str: ...
     def removesuffix(self, suffix: str, /) -> str: ...
     def islower(self) -> bool: ...
+    def lower(self) -> str: ...
+    def upper(self) -> str: ...
     def count(self, substr: str, start: Optional[int] = None, end: Optional[int] = None) -> int: pass
 
 class float:
diff --git a/mypyc/test-data/irbuild-str.test b/mypyc/test-data/irbuild-str.test
index 245acf7402a1..46890e8a3b0a 100644
--- a/mypyc/test-data/irbuild-str.test
+++ b/mypyc/test-data/irbuild-str.test
@@ -630,6 +630,27 @@ L3:
 L4:
     return r6
 
+
+[case testLower]
+def do_lower(s: str) -> str:
+    return s.lower()
+[out]
+def do_lower(s):
+    s, r0 :: str
+L0:
+    r0 = CPyStr_Lower(s)
+    return r0
+
+[case testUpper]
+def do_upper(s: str) -> str:
+    return s.upper()
+[out]
+def do_upper(s):
+    s, r0 :: str
+L0:
+    r0 = CPyStr_Upper(s)
+    return r0
+
 [case testFStringFromConstants]
 from typing import Final
 string: Final = "abc"
diff --git a/mypyc/test-data/run-strings.test b/mypyc/test-data/run-strings.test
index b4f3ebd66910..02a9e86362f2 100644
--- a/mypyc/test-data/run-strings.test
+++ b/mypyc/test-data/run-strings.test
@@ -1029,6 +1029,36 @@ def test_count_multi_start_end_emoji() -> None:
     assert string.count("🚀🚀🚀", 0, 12) == 2, string.count("🚀🚀🚀", 0, 12)
     assert string.count("ñññ", 0, 12) == 1, string.count("ñññ", 0, 12)
 
+[case testLower]
+def test_str_lower() -> None:
+    assert "".lower() == ""
+    assert "ABC".lower() == "abc"
+    assert "abc".lower() == "abc"
+    assert "AbC123".lower() == "abc123"
+    assert "áÉÍ".lower() == "áéí"
+    assert "😴🚀".lower() == "😴🚀"
+    # Special
+    assert "SS".lower() == "ss"
+    assert "Σ".lower() == "σ"  # Greek capital sigma -> small sigma
+    assert "\u0391\u03a3".lower() == "\u03b1\u03c2"  # Word-final sigma -> final small sigma
+    assert "\u212a".lower() == "k"  # Kelvin sign -> ASCII 'k' (result is narrower than input)
+    assert "\u0130".lower() == "i\u0307"  # Latin capital letter I with dot above -> 'i' + combining dot
+    assert len("\u0130".lower()) == 2  # Confirms length change
+
+[case testUpper]
+def test_str_upper() -> None:
+    assert "".upper() == ""
+    assert "abc".upper() == "ABC"
+    assert "ABC".upper() == "ABC"
+    assert "AbC123".upper() == "ABC123"
+    assert "áéí".upper() == "ÁÉÍ"
+    assert "😴🚀".upper() == "😴🚀"
+    # Special
+    assert "\xff".upper() == "\u0178"  # y with diaeresis -> result no longer fits in Latin-1
+    assert "\u00df".upper() == "SS"  # German sharp S -> double S
+    assert "\ufb03".upper() == "FFI"  # Ligature 'ffi' -> separate letters
+    assert len("\ufb03".upper()) == 3  # Confirm length increases
+
 [case testIsInstance]
 from copysubclass import subc
 from typing import Any