From ec4bf2652a08b82b679d18d7b1665f04beb21097 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 01:45:23 +0100
Subject: [PATCH 01/15] test

---
 .github/workflows/test.yml     |  3 +++
 test/unit/test_cpu_fp16_alu.py | 13 +++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 test/unit/test_cpu_fp16_alu.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 68611914b78e9..5a7c9d82fac84 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -910,6 +910,9 @@ jobs:
       - name: Run macOS-specific unit test
         if: matrix.backend == 'cpu'
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
+      - name: fp16 test
+        if: matrix.backend == 'cpu'
+        run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu:test_cpu
 
 # ****** Windows Tests ******
 
diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
new file mode 100644
index 0000000000000..424dde7f04737
--- /dev/null
+++ b/test/unit/test_cpu_fp16_alu.py
@@ -0,0 +1,13 @@
+import unittest
+from tinygrad import Tensor, dtypes, Context
+from tinygrad.device import Device
+from tinygrad.helpers import OSX
+
+class TestFloat16Alu(unittest.TestCase):
+  @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "")
+  def test_cpu(self):
+    with Context(DEBUG=7):
+      a = Tensor([1], dtype=dtypes.float16)
+      b = Tensor([2], dtype=dtypes.float16)
+      c = (a + b).realize()
+    
\ No newline at end of file

From 731b3ea63c35796c68beb8dd3b8aab0b0c193f6f Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 02:01:02 +0100
Subject: [PATCH 02/15] l

---
 test/unit/test_cpu_fp16_alu.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
index 424dde7f04737..72a73294fa5fa 100644
--- a/test/unit/test_cpu_fp16_alu.py
+++ b/test/unit/test_cpu_fp16_alu.py
@@ -10,4 +10,6 @@ def test_cpu(self):
       a = Tensor([1], dtype=dtypes.float16)
       b = Tensor([2], dtype=dtypes.float16)
       c = (a + b).realize()
-    
\ No newline at end of file
+
+if __name__ == "__main__":
+  unittest.main()
\ No newline at end of file

From ab9498420f92cb5a961ae656edd95d266e345176 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 02:25:10 +0100
Subject: [PATCH 03/15] giga l

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5a7c9d82fac84..2dda439a81cc9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -912,7 +912,7 @@ jobs:
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
       - name: fp16 test
         if: matrix.backend == 'cpu'
-        run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu:test_cpu
+        run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu::test_cpu
 
 # ****** Windows Tests ******
 

From 96648995a4afb787364c860ae37a9dabed3a3813 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 03:07:25 +0100
Subject: [PATCH 04/15] galactic l

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2dda439a81cc9..b67fa69606485 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -896,6 +896,7 @@ jobs:
         with:
           key: macos-${{ matrix.backend }}-minimal
           deps: testing_minimal
+          pydeps: "capstone"
           llvm: ${{ matrix.backend == 'llvm' && 'true' }}
       - name: Set env
         run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1'}}" >> $GITHUB_ENV

From e0e85ff1ea2bd9c53f48f5d237de1d54c2ccb7bd Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 03:23:23 +0100
Subject: [PATCH 05/15] interdimensional l

---
 .github/workflows/test.yml     | 2 +-
 test/unit/test_cpu_fp16_alu.py | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b67fa69606485..8959f7b56364e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -913,7 +913,7 @@ jobs:
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
       - name: fp16 test
         if: matrix.backend == 'cpu'
-        run: python3 -m pytest test/unit/test_cpu_fp16_alu.py::TestFloat16Alu::test_cpu
+        run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu
 
 # ****** Windows Tests ******
 
diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
index 72a73294fa5fa..78642efec8ff4 100644
--- a/test/unit/test_cpu_fp16_alu.py
+++ b/test/unit/test_cpu_fp16_alu.py
@@ -1,15 +1,14 @@
 import unittest
-from tinygrad import Tensor, dtypes, Context
+from tinygrad import Tensor, dtypes
 from tinygrad.device import Device
 from tinygrad.helpers import OSX
 
 class TestFloat16Alu(unittest.TestCase):
   @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "")
   def test_cpu(self):
-    with Context(DEBUG=7):
-      a = Tensor([1], dtype=dtypes.float16)
-      b = Tensor([2], dtype=dtypes.float16)
-      c = (a + b).realize()
+    a = Tensor([1], dtype=dtypes.float16)
+    b = Tensor([2], dtype=dtypes.float16)
+    c = (a + b).realize()
 
 if __name__ == "__main__":
   unittest.main()
\ No newline at end of file

From 2cf78004899433dfb3904c38e5474c9f6149555d Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 17:50:31 +0100
Subject: [PATCH 06/15] does it still cast with _Float16?

---
 tinygrad/renderer/cstyle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 15592b8ecea8c..58dc991fa3077 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -193,7 +193,7 @@ class ClangRenderer(CStyleLanguage):
 
   # language options
   buffer_suffix = " restrict"
-  type_map = {dtypes.bool:"_Bool", dtypes.half:"__fp16"}
+  type_map = {dtypes.bool:"_Bool", dtypes.half:"_Float16"}
   code_for_op = {**({k:v for k,v in CStyleLanguage.code_for_op.items() if k not in [Ops.EXP2, Ops.SIN, Ops.LOG2]}),
                  Ops.SQRT: lambda x,dtype: f"__builtin_sqrt({x})" if dtype == dtypes.float64 else f"__builtin_sqrtf({x})"}
   # LLVM legalizes double => half cast on systems that don't support it natively (like x86 cpus without AVX512-FP16) into a compiler-rt libcall.

From 33ad6d395e89992316e84089533da891e748b9b5 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 18:07:05 +0100
Subject: [PATCH 07/15] what if it's vectorized?

---
 test/unit/test_cpu_fp16_alu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
index 78642efec8ff4..57b73d3894829 100644
--- a/test/unit/test_cpu_fp16_alu.py
+++ b/test/unit/test_cpu_fp16_alu.py
@@ -6,8 +6,8 @@
 class TestFloat16Alu(unittest.TestCase):
   @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "")
   def test_cpu(self):
-    a = Tensor([1], dtype=dtypes.float16)
-    b = Tensor([2], dtype=dtypes.float16)
+    a = Tensor([1,2,3,4], dtype=dtypes.float16)
+    b = Tensor([1,2,3,4], dtype=dtypes.float16)
     c = (a + b).realize()
 
 if __name__ == "__main__":

From 000ae3e5631e404398580cfb377c495600fedec5 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 18:22:47 +0100
Subject: [PATCH 08/15] what about now?

---
 tinygrad/runtime/ops_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index e180a0a6b3b9e..6f52f0e20e469 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    args = ['-march=native'+('fp16' if target == 'arm64' else ''), f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From db01ad4c010b14bcc817720a7d8951d8235d3e64 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 19:47:09 +0100
Subject: [PATCH 09/15] what if I use llvm?

---
 .github/workflows/test.yml     | 2 +-
 test/unit/test_cpu_fp16_alu.py | 2 +-
 tinygrad/runtime/ops_cpu.py    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8959f7b56364e..e796e7f270bc0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -912,7 +912,7 @@ jobs:
         if: matrix.backend == 'cpu'
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
       - name: fp16 test
-        if: matrix.backend == 'cpu'
+        if: matrix.backend == 'llvm'
         run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu
 
 # ****** Windows Tests ******
diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
index 57b73d3894829..b6a23c4b2c41c 100644
--- a/test/unit/test_cpu_fp16_alu.py
+++ b/test/unit/test_cpu_fp16_alu.py
@@ -4,7 +4,7 @@
 from tinygrad.helpers import OSX
 
 class TestFloat16Alu(unittest.TestCase):
-  @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "")
+  @unittest.skipUnless(Device.DEFAULT == "LLVM" and OSX, "")
   def test_cpu(self):
     a = Tensor([1,2,3,4], dtype=dtypes.float16)
     b = Tensor([1,2,3,4], dtype=dtypes.float16)
diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index 6f52f0e20e469..e180a0a6b3b9e 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-march=native'+('fp16' if target == 'arm64' else ''), f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From 8a82ee999f711cc2bf1b6075cb47593bfa31db93 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 21:06:52 +0100
Subject: [PATCH 10/15] I'll see for myself

---
 .github/workflows/test.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e796e7f270bc0..e3a6b0ca59105 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -914,6 +914,8 @@ jobs:
       - name: fp16 test
         if: matrix.backend == 'llvm'
         run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu
+      - name: print cpu info
+        run: sysctl -a | grep machdep.cpu
 
 # ****** Windows Tests ******
 

From fa797fd75629dc032aa4565e19779d5e6aa3ad16 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 21:58:11 +0100
Subject: [PATCH 11/15] how much does this break?

---
 .github/workflows/test.yml     | 4 +---
 test/unit/test_cpu_fp16_alu.py | 2 +-
 tinygrad/runtime/ops_cpu.py    | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e3a6b0ca59105..8959f7b56364e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -912,10 +912,8 @@ jobs:
         if: matrix.backend == 'cpu'
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
       - name: fp16 test
-        if: matrix.backend == 'llvm'
+        if: matrix.backend == 'cpu'
         run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu
-      - name: print cpu info
-        run: sysctl -a | grep machdep.cpu
 
 # ****** Windows Tests ******
 
diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
index b6a23c4b2c41c..57b73d3894829 100644
--- a/test/unit/test_cpu_fp16_alu.py
+++ b/test/unit/test_cpu_fp16_alu.py
@@ -4,7 +4,7 @@
 from tinygrad.helpers import OSX
 
 class TestFloat16Alu(unittest.TestCase):
-  @unittest.skipUnless(Device.DEFAULT == "LLVM" and OSX, "")
+  @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "")
   def test_cpu(self):
     a = Tensor([1,2,3,4], dtype=dtypes.float16)
     b = Tensor([1,2,3,4], dtype=dtypes.float16)
diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index e180a0a6b3b9e..eb57eadcbffdf 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    args = ['-march=native', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From 88584545fe81799f3a5a21f6a13d83c6f8b96e5a Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 22:20:09 +0100
Subject: [PATCH 12/15] what about now?

---
 tinygrad/runtime/ops_cpu.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index eb57eadcbffdf..d866629e50f42 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -1,5 +1,5 @@
 import functools, platform, subprocess, sys
-from tinygrad.helpers import capstone_flatdump, getenv
+from tinygrad.helpers import capstone_flatdump, getenv, OSX
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
 from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.renderer.cstyle import ClangRenderer
@@ -11,7 +11,8 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-march=native', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    target += '-apple' if OSX else '-none'
+    args = ['-march=native', f'--target={target}-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From 41d30809982668d4ddce538d10e274d8b1497159 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 23:07:41 +0100
Subject: [PATCH 13/15] what about now?

---
 tinygrad/runtime/ops_cpu.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index d866629e50f42..97b115bbc2bfa 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -1,8 +1,9 @@
-import functools, platform, subprocess, sys
+import functools, platform, subprocess, sys, ctypes
 from tinygrad.helpers import capstone_flatdump, getenv, OSX
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
 from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.renderer.cstyle import ClangRenderer
+from tinygrad.runtime.autogen.llvm import LLVMGetHostCPUName
 
 class ClangJITCompiler(Compiler):
   def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
@@ -11,8 +12,8 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    target += '-apple' if OSX else '-none'
-    args = ['-march=native', f'--target={target}-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    cpu = f"-mcpu={ctypes.string_at(LLVMGetHostCPUName()).decode()}"
+    args = [cpu, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From 12eafc9d944039363019700633a6efd63d7e1b5b Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 23:30:59 +0100
Subject: [PATCH 14/15] fix

---
 autogen_stubs.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/autogen_stubs.sh b/autogen_stubs.sh
index 5c4e684e3d52e..6e444cdd0d766 100755
--- a/autogen_stubs.sh
+++ b/autogen_stubs.sh
@@ -441,6 +441,7 @@ elif [ "$1" == "qcom" ]; then generate_qcom
 elif [ "$1" == "io_uring" ]; then generate_io_uring
 elif [ "$1" == "libc" ]; then generate_libc
 elif [ "$1" == "llvm" ]; then generate_llvm
+elif [ "$1" == "cpu" ]; then generate_llvm
 elif [ "$1" == "kgsl" ]; then generate_kgsl
 elif [ "$1" == "adreno" ]; then generate_adreno
 elif [ "$1" == "pci" ]; then generate_pci

From d3d3e87e51c584b5c772fddea7d8f5e44071aa0a Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Mon, 16 Jun 2025 23:45:26 +0100
Subject: [PATCH 15/15] that was dumb, this should do

---
 autogen_stubs.sh            | 1 -
 tinygrad/runtime/ops_cpu.py | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/autogen_stubs.sh b/autogen_stubs.sh
index 6e444cdd0d766..5c4e684e3d52e 100755
--- a/autogen_stubs.sh
+++ b/autogen_stubs.sh
@@ -441,7 +441,6 @@ elif [ "$1" == "qcom" ]; then generate_qcom
 elif [ "$1" == "io_uring" ]; then generate_io_uring
 elif [ "$1" == "libc" ]; then generate_libc
 elif [ "$1" == "llvm" ]; then generate_llvm
-elif [ "$1" == "cpu" ]; then generate_llvm
 elif [ "$1" == "kgsl" ]; then generate_kgsl
 elif [ "$1" == "adreno" ]; then generate_adreno
 elif [ "$1" == "pci" ]; then generate_pci
diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index 97b115bbc2bfa..3d677a12fe914 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -3,7 +3,6 @@
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
 from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.renderer.cstyle import ClangRenderer
-from tinygrad.runtime.autogen.llvm import LLVMGetHostCPUName
 
 class ClangJITCompiler(Compiler):
   def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
@@ -12,8 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    cpu = f"-mcpu={ctypes.string_at(LLVMGetHostCPUName()).decode()}"
-    args = [cpu, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)