Skip to content

Commit 1b6ebb1

Browse files
authored
[mypyc] Enable SIMD for librt.base64 on x86-64 (#20244)
Also generally enable SSE4.2 instructions when targeting x86-64. These have been supported by hardware since ~2010, so it seems fine to require them now. This speeds up `b64encode` by up to 100% on Linux running on a recent AMD CPU. Some fairly recent hardware doesn't support AVX2, so it's not enabled. We'd probably need to rely on hardware capability checking for AVX2 support, and we'd probably also need to compile different files with different architecture flags, and I didn't want to go there (at least not yet).
1 parent 8f922b3 commit 1b6ebb1

File tree

4 files changed

+29
-1
lines changed

4 files changed

+29
-1
lines changed

mypyc/build.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from mypy.util import write_junit_xml
3737
from mypyc.annotate import generate_annotated_html
3838
from mypyc.codegen import emitmodule
39-
from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, shared_lib_name
39+
from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, X86_64, shared_lib_name
4040
from mypyc.errors import Errors
4141
from mypyc.ir.pprint import format_modules
4242
from mypyc.namegen import exported_name
@@ -77,6 +77,12 @@ class ModDesc(NamedTuple):
7777
"base64/arch/generic/enc_tail.c",
7878
"base64/arch/generic/dec_head.c",
7979
"base64/arch/generic/dec_tail.c",
80+
"base64/arch/ssse3/dec_reshuffle.c",
81+
"base64/arch/ssse3/dec_loop.c",
82+
"base64/arch/ssse3/enc_loop_asm.c",
83+
"base64/arch/ssse3/enc_translate.c",
84+
"base64/arch/ssse3/enc_reshuffle.c",
85+
"base64/arch/ssse3/enc_loop.c",
8086
"base64/arch/neon64/dec_loop.c",
8187
"base64/arch/neon64/enc_loop_asm.c",
8288
"base64/codecs.h",
@@ -655,6 +661,9 @@ def mypycify(
655661
# See https://github.com/mypyc/mypyc/issues/956
656662
"-Wno-cpp",
657663
]
664+
if X86_64:
665+
# Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2.
666+
cflags.append("-msse4.2")
658667
if log_trace:
659668
cflags.append("-DMYPYC_LOG_TRACE")
660669
if experimental_features:
@@ -683,6 +692,10 @@ def mypycify(
683692
# that we actually get the compilation speed and memory
684693
# use wins that multi-file mode is intended for.
685694
cflags += ["/GL-", "/wd9025"] # warning about overriding /GL
695+
if X86_64:
696+
# Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2.
697+
# Also Windows 11 requires SSE4.2 since 24H2.
698+
cflags.append("/arch:SSE4.2")
686699
if log_trace:
687700
cflags.append("/DMYPYC_LOG_TRACE")
688701
if experimental_features:

mypyc/common.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import platform
34
import sys
45
import sysconfig
56
from typing import Any, Final
@@ -44,6 +45,8 @@
4445

4546
IS_32_BIT_PLATFORM: Final = int(SIZEOF_SIZE_T) == 4
4647

48+
X86_64: Final = platform.machine() in ("x86_64", "AMD64", "amd64")
49+
4750
PLATFORM_SIZE = 4 if IS_32_BIT_PLATFORM else 8
4851

4952
# Maximum value for a short tagged integer.

mypyc/lib-rt/base64/config.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@
77
#define BASE64_WITH_SSE41 0
88
#define HAVE_SSE41 BASE64_WITH_SSE41
99

10+
#if defined(__x86_64__) || defined(_M_X64)
11+
#define BASE64_WITH_SSE42 1
12+
#else
1013
#define BASE64_WITH_SSE42 0
14+
#endif
15+
1116
#define HAVE_SSE42 BASE64_WITH_SSE42
1217

1318
#define BASE64_WITH_AVX 0

mypyc/lib-rt/setup.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from __future__ import annotations
77

88
import os
9+
import platform
910
import subprocess
1011
import sys
1112
from distutils import ccompiler, sysconfig
@@ -24,6 +25,8 @@
2425
"pythonsupport.c",
2526
]
2627

28+
X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64")
29+
2730

2831
class BuildExtGtest(build_ext):
2932
def get_library_names(self) -> list[str]:
@@ -79,8 +82,12 @@ def run(self) -> None:
7982
cflags: list[str] = []
8083
if compiler.compiler_type == "unix":
8184
cflags += ["-O3"]
85+
if X86_64:
86+
cflags.append("-msse4.2") # Enable SIMD (see also mypyc/build.py)
8287
elif compiler.compiler_type == "msvc":
8388
cflags += ["/O2"]
89+
if X86_64:
90+
cflags.append("/arch:SSE4.2") # Enable SIMD (see also mypyc/build.py)
8491

8592
setup(
8693
ext_modules=[

0 commit comments

Comments
 (0)