From ec4bf2652a08b82b679d18d7b1665f04beb21097 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 01:45:23 +0100 Subject: [PATCH 01/15] test --- .github/workflows/test.yml | 3 +++ test/unit/test_cpu_fp16_alu.py | 13 +++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 test/unit/test_cpu_fp16_alu.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 68611914b78e9..5a7c9d82fac84 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -910,6 +910,9 @@ jobs: - name: Run macOS-specific unit test if: matrix.backend == 'cpu' run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated + - name: fp16 test + if: matrix.backend == 'cpu' + run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu:test_cpu # ****** Windows Tests ****** diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py new file mode 100644 index 0000000000000..424dde7f04737 --- /dev/null +++ b/test/unit/test_cpu_fp16_alu.py @@ -0,0 +1,13 @@ +import unittest +from tinygrad import Tensor, dtypes, Context +from tinygrad.device import Device +from tinygrad.helpers import OSX + +class TestFloat16Alu(unittest.TestCase): + @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "") + def test_cpu(self): + with Context(DEBUG=7): + a = Tensor([1], dtype=dtypes.float16) + b = Tensor([2], dtype=dtypes.float16) + c = (a + b).realize() + \ No newline at end of file From 731b3ea63c35796c68beb8dd3b8aab0b0c193f6f Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 02:01:02 +0100 Subject: [PATCH 02/15] l --- test/unit/test_cpu_fp16_alu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py index 424dde7f04737..72a73294fa5fa 100644 --- a/test/unit/test_cpu_fp16_alu.py +++ b/test/unit/test_cpu_fp16_alu.py @@ -10,4 +10,6 @@ def test_cpu(self): a = Tensor([1], dtype=dtypes.float16) b = Tensor([2], dtype=dtypes.float16) c = (a + b).realize() - \ No newline at end of file + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From ab9498420f92cb5a961ae656edd95d266e345176 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 02:25:10 +0100 Subject: [PATCH 03/15] giga l --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5a7c9d82fac84..2dda439a81cc9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -912,7 +912,7 @@ jobs: run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated - name: fp16 test if: matrix.backend == 'cpu' - run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu:test_cpu + run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu::test_cpu # ****** Windows Tests ****** From 96648995a4afb787364c860ae37a9dabed3a3813 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 03:07:25 +0100 Subject: [PATCH 04/15] galactic l --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2dda439a81cc9..b67fa69606485 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -896,6 +896,7 @@ jobs: with: key: macos-${{ matrix.backend }}-minimal deps: testing_minimal + pydeps: "capstone" llvm: ${{ matrix.backend == 'llvm' && 'true' }} - name: Set env run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1'}}" >> $GITHUB_ENV From e0e85ff1ea2bd9c53f48f5d237de1d54c2ccb7bd Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 03:23:23 +0100 Subject: [PATCH 05/15] interdimensional l --- .github/workflows/test.yml | 2 +- test/unit/test_cpu_fp16_alu.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b67fa69606485..8959f7b56364e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -913,7 +913,7 @@ jobs: run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated - name: fp16 test if: matrix.backend == 'cpu' - run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu::test_cpu + run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu # ****** Windows Tests ****** diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py index 72a73294fa5fa..78642efec8ff4 100644 --- a/test/unit/test_cpu_fp16_alu.py +++ b/test/unit/test_cpu_fp16_alu.py @@ -1,15 +1,14 @@ import unittest -from tinygrad import Tensor, dtypes, Context +from tinygrad import Tensor, dtypes from tinygrad.device import Device from tinygrad.helpers import OSX class TestFloat16Alu(unittest.TestCase): @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "") def test_cpu(self): - with Context(DEBUG=7): - a = Tensor([1], dtype=dtypes.float16) - b = Tensor([2], dtype=dtypes.float16) - c = (a + b).realize() + a = Tensor([1], dtype=dtypes.float16) + b = Tensor([2], dtype=dtypes.float16) + c = (a + b).realize() if __name__ == "__main__": unittest.main() \ No newline at end of file From 2cf78004899433dfb3904c38e5474c9f6149555d Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 17:50:31 +0100 Subject: [PATCH 06/15] does it still cast with _Float16? --- tinygrad/renderer/cstyle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 15592b8ecea8c..58dc991fa3077 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -193,7 +193,7 @@ class ClangRenderer(CStyleLanguage): # language options buffer_suffix = " restrict" - type_map = {dtypes.bool:"_Bool", dtypes.half:"__fp16"} + type_map = {dtypes.bool:"_Bool", dtypes.half:"_Float16"} code_for_op = {**({k:v for k,v in CStyleLanguage.code_for_op.items() if k not in [Ops.EXP2, Ops.SIN, Ops.LOG2]}), Ops.SQRT: lambda x,dtype: f"__builtin_sqrt({x})" if dtype == dtypes.float64 else f"__builtin_sqrtf({x})"} # LLVM legalizes double => half cast on systems that don't support it natively (like x86 cpus without AVX512-FP16) into a compiler-rt libcall. From 33ad6d395e89992316e84089533da891e748b9b5 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 18:07:05 +0100 Subject: [PATCH 07/15] are if it's vectorized? --- test/unit/test_cpu_fp16_alu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py index 78642efec8ff4..57b73d3894829 100644 --- a/test/unit/test_cpu_fp16_alu.py +++ b/test/unit/test_cpu_fp16_alu.py @@ -6,8 +6,8 @@ class TestFloat16Alu(unittest.TestCase): @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "") def test_cpu(self): - a = Tensor([1], dtype=dtypes.float16) - b = Tensor([2], dtype=dtypes.float16) + a = Tensor([1,2,3,4], dtype=dtypes.float16) + b = Tensor([1,2,3,4], dtype=dtypes.float16) c = (a + b).realize() if __name__ == "__main__": From 000ae3e5631e404398580cfb377c495600fedec5 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 18:22:47 +0100 Subject: [PATCH 08/15] what about now? --- tinygrad/runtime/ops_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index e180a0a6b3b9e..6f52f0e20e469 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + args = ['-march=native'+('fp16' if target == 'arm64' else ''), f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From db01ad4c010b14bcc817720a7d8951d8235d3e64 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 19:47:09 +0100 Subject: [PATCH 09/15] what if I use llvm? --- .github/workflows/test.yml | 2 +- test/unit/test_cpu_fp16_alu.py | 2 +- tinygrad/runtime/ops_cpu.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8959f7b56364e..e796e7f270bc0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -912,7 +912,7 @@ jobs: if: matrix.backend == 'cpu' run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated - name: fp16 test - if: matrix.backend == 'cpu' + if: matrix.backend == 'llvm' run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu # ****** Windows Tests ****** diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py index 57b73d3894829..b6a23c4b2c41c 100644 --- a/test/unit/test_cpu_fp16_alu.py +++ b/test/unit/test_cpu_fp16_alu.py @@ -4,7 +4,7 @@ from tinygrad.helpers import OSX class TestFloat16Alu(unittest.TestCase): - @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "") + @unittest.skipUnless(Device.DEFAULT == "LLVM" and OSX, "") def test_cpu(self): a = Tensor([1,2,3,4], dtype=dtypes.float16) b = Tensor([1,2,3,4], dtype=dtypes.float16) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 6f52f0e20e469..e180a0a6b3b9e 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - args = ['-march=native'+('fp16' if target == 'arm64' else ''), f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From 8a82ee999f711cc2bf1b6075cb47593bfa31db93 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 21:06:52 +0100 Subject: [PATCH 10/15] ill see for myself --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e796e7f270bc0..e3a6b0ca59105 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -914,6 +914,8 @@ jobs: - name: fp16 test if: matrix.backend == 'llvm' run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu + - name: print cpu info + run: sysctl -a | grep machdep.cpu # ****** Windows Tests ****** From fa797fd75629dc032aa4565e19779d5e6aa3ad16 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 21:58:11 +0100 Subject: [PATCH 11/15] how much does this break? --- .github/workflows/test.yml | 4 +--- test/unit/test_cpu_fp16_alu.py | 2 +- tinygrad/runtime/ops_cpu.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e3a6b0ca59105..8959f7b56364e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -912,10 +912,8 @@ jobs: if: matrix.backend == 'cpu' run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated - name: fp16 test - if: matrix.backend == 'llvm' + if: matrix.backend == 'cpu' run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu - - name: print cpu info - run: sysctl -a | grep machdep.cpu # ****** Windows Tests ****** diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py index b6a23c4b2c41c..57b73d3894829 100644 --- a/test/unit/test_cpu_fp16_alu.py +++ b/test/unit/test_cpu_fp16_alu.py @@ -4,7 +4,7 @@ from tinygrad.helpers import OSX class TestFloat16Alu(unittest.TestCase): - @unittest.skipUnless(Device.DEFAULT == "LLVM" and OSX, "") + @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "") def test_cpu(self): a = Tensor([1,2,3,4], dtype=dtypes.float16) b = Tensor([1,2,3,4], dtype=dtypes.float16) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index e180a0a6b3b9e..eb57eadcbffdf 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + args = ['-march=native', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From 88584545fe81799f3a5a21f6a13d83c6f8b96e5a Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 22:20:09 +0100 Subject: [PATCH 12/15] what about now? --- tinygrad/runtime/ops_cpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index eb57eadcbffdf..d866629e50f42 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -1,5 +1,5 @@ import functools, platform, subprocess, sys -from tinygrad.helpers import capstone_flatdump, getenv +from tinygrad.helpers import capstone_flatdump, getenv, OSX from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram from tinygrad.runtime.support.elf import jit_loader from tinygrad.renderer.cstyle import ClangRenderer @@ -11,7 +11,8 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - args = ['-march=native', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + target += '-apple' if OSX else '-none' + args = ['-march=native', f'--target={target}-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From 41d30809982668d4ddce538d10e274d8b1497159 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 23:07:41 +0100 Subject: [PATCH 13/15] what about now? --- tinygrad/runtime/ops_cpu.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index d866629e50f42..97b115bbc2bfa 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -1,8 +1,9 @@ -import functools, platform, subprocess, sys +import functools, platform, subprocess, sys, ctypes from tinygrad.helpers import capstone_flatdump, getenv, OSX from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram from tinygrad.runtime.support.elf import jit_loader from tinygrad.renderer.cstyle import ClangRenderer +from tinygrad.runtime.autogen.llvm import LLVMGetHostCPUName class ClangJITCompiler(Compiler): def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey) @@ -11,8 +12,8 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - target += '-apple' if OSX else '-none' - args = ['-march=native', f'--target={target}-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + cpu = f"-mcpu={ctypes.string_at(LLVMGetHostCPUName()).decode()}" + args = [cpu, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From 12eafc9d944039363019700633a6efd63d7e1b5b Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 23:30:59 +0100 Subject: [PATCH 14/15] fix --- autogen_stubs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 5c4e684e3d52e..6e444cdd0d766 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -441,6 +441,7 @@ elif [ "$1" == "qcom" ]; then generate_qcom elif [ "$1" == "io_uring" ]; then generate_io_uring elif [ "$1" == "libc" ]; then generate_libc elif [ "$1" == "llvm" ]; then generate_llvm +elif [ "$1" == "cpu" ]; then generate_llvm elif [ "$1" == "kgsl" ]; then generate_kgsl elif [ "$1" == "adreno" ]; then generate_adreno elif [ "$1" == "pci" ]; then generate_pci From d3d3e87e51c584b5c772fddea7d8f5e44071aa0a Mon Sep 17 00:00:00 2001 From: ttomsa Date: Mon, 16 Jun 2025 23:45:26 +0100 Subject: [PATCH 15/15] that was dumb, this should do --- autogen_stubs.sh | 1 - tinygrad/runtime/ops_cpu.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 6e444cdd0d766..5c4e684e3d52e 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -441,7 +441,6 @@ elif [ "$1" == "qcom" ]; then generate_qcom elif [ "$1" == "io_uring" ]; then generate_io_uring elif [ "$1" == "libc" ]; then generate_libc elif [ "$1" == "llvm" ]; then generate_llvm -elif [ "$1" == "cpu" ]; then generate_llvm elif [ "$1" == "kgsl" ]; then generate_kgsl elif [ "$1" == "adreno" ]; then generate_adreno elif [ "$1" == "pci" ]; then generate_pci diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 97b115bbc2bfa..3d677a12fe914 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -3,7 +3,6 @@ from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram from tinygrad.runtime.support.elf import jit_loader from tinygrad.renderer.cstyle import ClangRenderer -from tinygrad.runtime.autogen.llvm import LLVMGetHostCPUName class ClangJITCompiler(Compiler): def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey) @@ -12,8 +11,7 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - cpu = f"-mcpu={ctypes.string_at(LLVMGetHostCPUName()).decode()}" - args = [cpu, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj)