ttomsa · ttomsa · Jun 16, 2025 · Jun 16, 2025 · Jun 16, 2025 · Jun 16, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -896,6 +896,7 @@ jobs:
         with:
           key: macos-${{ matrix.backend }}-minimal
           deps: testing_minimal
+          pydeps: "capstone"
           llvm: ${{ matrix.backend == 'llvm' && 'true' }}
       - name: Set env
         run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1'}}" >> $GITHUB_ENV
@@ -910,6 +911,9 @@ jobs:
       - name: Run macOS-specific unit test
         if: matrix.backend == 'cpu'
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
+      - name: fp16 test
+        if: matrix.backend == 'cpu'
+        run: DEBUG=7 python3 test/unit/test_cpu_fp16_alu.py TestFloat16Alu.test_cpu
 
 # ****** Windows Tests ******
 

diff --git a/test/unit/test_cpu_fp16_alu.py b/test/unit/test_cpu_fp16_alu.py
@@ -0,0 +1,14 @@
+import unittest
+from tinygrad import Tensor, dtypes
+from tinygrad.device import Device
+from tinygrad.helpers import OSX
+
+class TestFloat16Alu(unittest.TestCase):
+  """Smoke test: a float16 elementwise add must compile, run and give exact sums on the CPU backend (OSX only)."""
+  @unittest.skipUnless(Device.DEFAULT == "CPU" and OSX, "requires the CPU backend on OSX")
+  def test_cpu(self):
+    a, b = Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float16)
+    self.assertEqual((a + b).tolist(), [2.0, 4.0, 6.0, 8.0])  # tolist() realizes the kernel; these sums are exact in fp16
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
@@ -193,7 +193,7 @@ class ClangRenderer(CStyleLanguage):
 
   # language options
   buffer_suffix = " restrict"
-  type_map = {dtypes.bool:"_Bool", dtypes.half:"__fp16"}
+  type_map = {dtypes.bool:"_Bool", dtypes.half:"_Float16"}
   code_for_op = {**({k:v for k,v in CStyleLanguage.code_for_op.items() if k not in [Ops.EXP2, Ops.SIN, Ops.LOG2]}),
                  Ops.SQRT: lambda x,dtype: f"__builtin_sqrt({x})" if dtype == dtypes.float64 else f"__builtin_sqrtf({x})"}
   # LLVM legalizes double => half cast on systems that don't support it natively (like x86 cpus without AVX512-FP16) into a compiler-rt libcall.

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
@@ -1,5 +1,5 @@
-import functools, platform, subprocess, sys
-from tinygrad.helpers import capstone_flatdump, getenv
+import functools, platform, subprocess, sys, ctypes
+from tinygrad.helpers import capstone_flatdump, getenv, OSX
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
 from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.renderer.cstyle import ClangRenderer
@@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)