From 4f295accd37fbf04b156b23e91beb559bf57e37c Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Sat, 5 Jul 2025 13:09:08 +0100 Subject: [PATCH 1/4] [mypyc] Add support for C string literals in the IR Previously only Python str and bytes literals were supported, but sometimes we want zero-terminated C string literals instead. They don't need to be allocated from heap and are usually stored in a read-only data section, so they are more efficient in some use cases. --- mypyc/codegen/emitfunc.py | 26 ++++++++++++++++++++++++++ mypyc/ir/ops.py | 15 +++++++++++++++ mypyc/ir/pprint.py | 3 +++ mypyc/ir/rtypes.py | 10 ++++++---- mypyc/test/test_emitfunc.py | 26 ++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 4 deletions(-) diff --git a/mypyc/codegen/emitfunc.py b/mypyc/codegen/emitfunc.py index 00c7fd56b899..e067f524b1a2 100644 --- a/mypyc/codegen/emitfunc.py +++ b/mypyc/codegen/emitfunc.py @@ -33,6 +33,7 @@ Cast, ComparisonOp, ControlOp, + CString, DecRef, Extend, Float, @@ -843,6 +844,8 @@ def reg(self, reg: Value) -> str: elif r == "nan": return "NAN" return r + elif isinstance(reg, CString): + return '"' + encode_c_string_literal(reg.value) + '"' else: return self.emitter.reg(reg) @@ -904,3 +907,26 @@ def emit_unsigned_int_cast(self, type: RType) -> str: return "(uint64_t)" else: return "" + + +_translation_table: Final[dict[int, str]] = {} + + +def encode_c_string_literal(b: bytes) -> str: + if not _translation_table: + # Initialize the translation table on the first call. + d = { + ord("\n"): "\\n", + ord("\r"): "\\r", + ord("\t"): "\\t", + ord('"'): '\\"', + ord("\\"): "\\\\", + } + for i in range(256): + if i not in d: + if i < 32 or i >= 127: + d[i] = "\\x%.2x" % i + else: + d[i] = chr(i) + _translation_table.update(str.maketrans(d)) + return b.decode("latin1").translate(_translation_table) diff --git a/mypyc/ir/ops.py b/mypyc/ir/ops.py index 1cb3df916ac9..47b66fd2a1e3 100644 --- a/mypyc/ir/ops.py +++ b/mypyc/ir/ops.py @@ -39,6 +39,7 @@ class to enable the new behavior. Sometimes adding a new abstract RVoid, bit_rprimitive, bool_rprimitive, + cstring_rprimitive, float_rprimitive, int_rprimitive, is_bit_rprimitive, @@ -230,6 +231,20 @@ def __init__(self, value: float, line: int = -1) -> None: self.line = line +@final +class CString(Value): + """C string literal (zero-terminated). + + You can also include zero values in the value, but then you'll need to track + the length of the string separately. + """ + + def __init__(self, value: bytes, line: int = -1) -> None: + self.value = value + self.type = cstring_rprimitive + self.line = line + + class Op(Value): """Abstract base class for all IR operations. diff --git a/mypyc/ir/pprint.py b/mypyc/ir/pprint.py index 6c96a21e473b..5bb11cc231cc 100644 --- a/mypyc/ir/pprint.py +++ b/mypyc/ir/pprint.py @@ -21,6 +21,7 @@ Cast, ComparisonOp, ControlOp, + CString, DecRef, Extend, Float, @@ -327,6 +328,8 @@ def format(self, fmt: str, *args: Any) -> str: result.append(str(arg.value)) elif isinstance(arg, Float): result.append(repr(arg.value)) + elif isinstance(arg, CString): + result.append(f"CString({arg.value!r})") else: result.append(self.names[arg]) elif typespec == "d": diff --git a/mypyc/ir/rtypes.py b/mypyc/ir/rtypes.py index 61aadce9b9d4..8dc7d5c9c949 100644 --- a/mypyc/ir/rtypes.py +++ b/mypyc/ir/rtypes.py @@ -254,13 +254,11 @@ def __init__( elif ctype == "CPyPtr": # TODO: Invent an overlapping error value? self.c_undefined = "0" - elif ctype == "PyObject *": - # Boxed types use the null pointer as the error value. + elif ctype.endswith("*"): + # Boxed and pointer types use the null pointer as the error value. self.c_undefined = "NULL" elif ctype == "char": self.c_undefined = "2" - elif ctype in ("PyObject **", "void *"): - self.c_undefined = "NULL" elif ctype == "double": self.c_undefined = "-113.0" elif ctype in ("uint8_t", "uint16_t", "uint32_t", "uint64_t"): @@ -445,6 +443,10 @@ def __hash__(self) -> int: "c_ptr", is_unboxed=False, is_refcounted=False, ctype="void *" ) +cstring_rprimitive: Final = RPrimitive( + "cstring", is_unboxed=True, is_refcounted=False, ctype="const char *" +) + # The type corresponding to mypyc.common.BITMAP_TYPE bitmap_rprimitive: Final = uint32_rprimitive diff --git a/mypyc/test/test_emitfunc.py b/mypyc/test/test_emitfunc.py index 275e8c383a4b..8d1b66bb5915 100644 --- a/mypyc/test/test_emitfunc.py +++ b/mypyc/test/test_emitfunc.py @@ -19,6 +19,7 @@ CallC, Cast, ComparisonOp, + CString, DecRef, Extend, GetAttr, @@ -49,6 +50,7 @@ RType, bool_rprimitive, c_int_rprimitive, + cstring_rprimitive, dict_rprimitive, int32_rprimitive, int64_rprimitive, @@ -824,6 +826,30 @@ def test_inc_ref_int_literal(self) -> None: b = LoadLiteral(x, object_rprimitive) self.assert_emit([b, IncRef(b)], "CPy_INCREF(cpy_r_r0);") + def test_c_string(self) -> None: + s = Register(cstring_rprimitive, "s") + self.assert_emit(Assign(s, CString(b"foo")), """cpy_r_s = "foo";""") + self.assert_emit(Assign(s, CString(b'fo "o')), r"""cpy_r_s = "fo \"o";""") + self.assert_emit(Assign(s, CString(b"\x00")), r"""cpy_r_s = "\x00";""") + self.assert_emit(Assign(s, CString(b"\\")), r"""cpy_r_s = "\\";""") + for i in range(256): + b = bytes([i]) + if b == b"\n": + target = "\\n" + elif b == b"\r": + target = "\\r" + elif b == b"\t": + target = "\\t" + elif b == b'"': + target = '\\"' + elif b == b"\\": + target = "\\\\" + elif i < 32 or i >= 127: + target = "\\x%.2x" % i + else: + target = b.decode("ascii") + self.assert_emit(Assign(s, CString(b)), f'cpy_r_s = "{target}";') + def assert_emit( self, op: Op | list[Op], From b0399670480147eed97ecf5fa206cbdc9940438b Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Sun, 6 Jul 2025 12:28:31 +0100 Subject: [PATCH 2/4] Please lint --- mypyc/test/test_emitfunc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/test/test_emitfunc.py b/mypyc/test/test_emitfunc.py index 8d1b66bb5915..2042151fabdb 100644 --- a/mypyc/test/test_emitfunc.py +++ b/mypyc/test/test_emitfunc.py @@ -829,7 +829,7 @@ def test_inc_ref_int_literal(self) -> None: def test_c_string(self) -> None: s = Register(cstring_rprimitive, "s") self.assert_emit(Assign(s, CString(b"foo")), """cpy_r_s = "foo";""") - self.assert_emit(Assign(s, CString(b'fo "o')), r"""cpy_r_s = "fo \"o";""") + self.assert_emit(Assign(s, CString(b'foo "o')), r"""cpy_r_s = "foo \"o";""") self.assert_emit(Assign(s, CString(b"\x00")), r"""cpy_r_s = "\x00";""") self.assert_emit(Assign(s, CString(b"\\")), r"""cpy_r_s = "\\";""") for i in range(256): From 33586a4272a37f52e6bab41c9da8ba7402372a88 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Sun, 6 Jul 2025 12:34:32 +0100 Subject: [PATCH 3/4] Add docstring --- mypyc/codegen/emitfunc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mypyc/codegen/emitfunc.py b/mypyc/codegen/emitfunc.py index e067f524b1a2..f935953b33a5 100644 --- a/mypyc/codegen/emitfunc.py +++ b/mypyc/codegen/emitfunc.py @@ -913,6 +913,10 @@ def emit_unsigned_int_cast(self, type: RType) -> str: def encode_c_string_literal(b: bytes) -> str: + """Convert bytestring to the C string literal syntax (with necessary escaping). + + For example, b'foo\n' gets coverted to 'foo\\n' (note that double quotes are not added). + """ if not _translation_table: # Initialize the translation table on the first call. d = { From 88774582b8c61bea48f81cad641d2291482bdf69 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Sun, 6 Jul 2025 12:49:13 +0100 Subject: [PATCH 4/4] Fix typo --- mypyc/codegen/emitfunc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/codegen/emitfunc.py b/mypyc/codegen/emitfunc.py index f935953b33a5..5acae4cc9be6 100644 --- a/mypyc/codegen/emitfunc.py +++ b/mypyc/codegen/emitfunc.py @@ -915,7 +915,7 @@ def emit_unsigned_int_cast(self, type: RType) -> str: def encode_c_string_literal(b: bytes) -> str: """Convert bytestring to the C string literal syntax (with necessary escaping). - For example, b'foo\n' gets coverted to 'foo\\n' (note that double quotes are not added). + For example, b'foo\n' gets converted to 'foo\\n' (note that double quotes are not added). """ if not _translation_table: # Initialize the translation table on the first call.