From e45e36c33fb43ff2736c3161606529a67fac7037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= <tarikyilmaz@yandex.com.tr>
Date: Fri, 24 Apr 2026 20:52:48 +0400
Subject: [PATCH 1/4] Add Apple Silicon support

Co-authored-by: Copilot <copilot@github.com>
---
 MotionCorrection/CMakeLists.txt         | 37 +++++++++++++++++++++++-
 MotionCorrection/src/cpp/Math/SIMD.h    | 25 +++++++++++++++-
 MotionCorrection/src/cpp/Math/Scalar.h  |  1 +
 kimodo/demo/app.py                      | 14 ++++++---
 kimodo/demo/memory_manager.py           | 28 ++++++++++++------
 kimodo/model/llm2vec/llm2vec_wrapper.py | 38 +++++++++++++++----------
 6 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/MotionCorrection/CMakeLists.txt b/MotionCorrection/CMakeLists.txt
index 35f8fa1..ec0e376 100644
--- a/MotionCorrection/CMakeLists.txt
+++ b/MotionCorrection/CMakeLists.txt
@@ -33,6 +33,34 @@ if(NOT Eigen3_FOUND)
     FetchContent_MakeAvailable(Eigen)
 endif()
 
+# If building on an ARM (aarch64/arm64) macOS, fetch SSE2Neon automatically
+# since SSE intrinsics may not be available and SSE2Neon provides mappings.
+find_package(SSE2Neon CONFIG QUIET)
+if(NOT SSE2Neon_FOUND AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    message(STATUS "SSE2Neon not found; fetching from GitHub...")
+    include(FetchContent)
+    FetchContent_Declare(
+        SSE2Neon
+        GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
+        GIT_TAG v1.9.1
+    )
+    FetchContent_MakeAvailable(SSE2Neon)
+endif()
+
+# Try to resolve the FetchContent checkout path if the variable isn't defined
+if(NOT DEFINED SSE2Neon_SOURCE_DIR)
+    set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
+    if(EXISTS "${_possible}")
+        set(SSE2Neon_SOURCE_DIR "${_possible}")
+        message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}")
+    endif()
+endif()
+
+if(DEFINED SSE2Neon_SOURCE_DIR)
+    message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}")
+    include_directories(${SSE2Neon_SOURCE_DIR})
+endif()
+
 # Source files
 set(MATH_SOURCES
     src/cpp/Math/Matrix.cpp
@@ -73,7 +101,14 @@ if(MSVC)
 else()
     # GCC/Clang flags (also applies to MinGW on Windows)
     # Enable SSE4.1 and AVX instructions for SIMD operations
-    target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64|x86|i[3-6]86)$")  # x86 family; Windows/MinGW reports AMD64
+        target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
+    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+        add_definitions(-DFORCE_INLINE=inline)  # directory-scoped, not limited to motion_correction_cpp_base
+        if(DEFINED SSE2Neon_SOURCE_DIR)
+            target_include_directories(motion_correction_cpp_base PUBLIC ${SSE2Neon_SOURCE_DIR})
+        endif()
+    endif()
 endif()
 
 # Python bindings
diff --git a/MotionCorrection/src/cpp/Math/SIMD.h b/MotionCorrection/src/cpp/Math/SIMD.h
index dfc1ced..fc1aaa6 100644
--- a/MotionCorrection/src/cpp/Math/SIMD.h
+++ b/MotionCorrection/src/cpp/Math/SIMD.h
@@ -6,7 +6,30 @@
 #pragma once
 
 #include <stdint.h>
-#include <immintrin.h>
+
+// On 64-bit ARM (Apple Silicon as well as other aarch64 targets) use sse2neon
+// to map SSE intrinsics to NEON; <immintrin.h> is an x86-only header.
+#if defined(__aarch64__) || defined(__arm64__)
+    #include "sse2neon.h"
+
+// _mm_permutevar_ps is an AVX intrinsic that x86 builds get from <immintrin.h>.
+// Scalar fallback: result lane i is a[idx[i] & 3]; only the two low bits of
+// each 32-bit selector are honoured.
+#ifndef _mm_permutevar_ps
+static inline __m128 _mm_permutevar_ps(__m128 a, __m128i idx)
+{
+    float lanes[4];
+    int32_t selectors[4];
+    _mm_storeu_ps(lanes, a);
+    _mm_storeu_si128((__m128i*)selectors, idx);
+    // _mm_set_ps takes its arguments from the highest lane down to lane 0.
+    return _mm_set_ps(lanes[selectors[3] & 3], lanes[selectors[2] & 3],
+                      lanes[selectors[1] & 3], lanes[selectors[0] & 3]);
+}
+#endif
+#else
+    #include <immintrin.h>
+#endif
 
 namespace SIMD
 {
diff --git a/MotionCorrection/src/cpp/Math/Scalar.h b/MotionCorrection/src/cpp/Math/Scalar.h
index 425b759..426620f 100644
--- a/MotionCorrection/src/cpp/Math/Scalar.h
+++ b/MotionCorrection/src/cpp/Math/Scalar.h
@@ -12,6 +12,7 @@
 
 #include <math.h>
 #include <stdint.h>
+#include <stdlib.h>
 
 //
 // Scalar related methods
diff --git a/kimodo/demo/app.py b/kimodo/demo/app.py
index 3bb5766..38ae6fe 100644
--- a/kimodo/demo/app.py
+++ b/kimodo/demo/app.py
@@ -219,8 +219,11 @@ def prewarm_embedding_cache(self, model_name: str, model: object, custom_prompts
                     s.last_prompt_embeddings = None
                     s.last_prompt_lengths = None
             
-            torch.cuda.ipc_collect()
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
 
     def build_constraint_tracks(
         self, client: viser.ClientHandle, skeleton: SkeletonBase
@@ -647,8 +650,11 @@ def generate(
             session.last_prompt_lengths = None
             
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
         finally:
             self._generation_lock.release()
 
diff --git a/kimodo/demo/memory_manager.py b/kimodo/demo/memory_manager.py
index 5002f2d..aef2d8c 100644
--- a/kimodo/demo/memory_manager.py
+++ b/kimodo/demo/memory_manager.py
@@ -113,8 +113,11 @@ def purge_encoder_completely(self):
                 self.encoder.unload()
             
             release_system_memory()
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
             self.log_memory_usage("RAM Reclamation Complete")
 
     def get_free_vram(self) -> int:
@@ -136,8 +139,11 @@ def ensure_vram_capacity(self, required_bytes: int, device: str = "cuda:0", excl
         if not self.offload_enabled or "cpu" in device:
             return
 
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
         release_system_memory()
 
         current_free = self.get_free_vram()
@@ -200,8 +206,11 @@ def touch_and_move(self, name: str, device: str):
             if hasattr(model, "device"):
                 model.device = device
 
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
             release_system_memory()
 
     def offload_model(self, name: str):
@@ -221,8 +230,11 @@ def offload_model(self, name: str):
                     model.device = "cpu"
                 
                 release_system_memory()
-                torch.cuda.empty_cache()
-                torch.cuda.ipc_collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    torch.cuda.ipc_collect()
+                if torch.backends.mps.is_available():
+                    torch.mps.empty_cache()
                 self.log_memory_usage(f"Offloaded '{name}'")
 
     def report_residency(self):
diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py
index 277f7dd..0c398ed 100644
--- a/kimodo/model/llm2vec/llm2vec_wrapper.py
+++ b/kimodo/model/llm2vec/llm2vec_wrapper.py
@@ -2,8 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 """LLM2Vec encoder wrapper for Kimodo text conditioning."""
 
-import os
 import gc
+import platform
+import os
 import numpy as np
 import torch
 from torch import nn
@@ -23,7 +24,9 @@ def __init__(
         self.torch_dtype = getattr(torch, dtype)
         self.llm_dim = llm_dim
         # Update this path to where your model is actually located!
-        self.custom_dir = "/home/aero/kimodo/KIMODO-Meta3_llm2vec_NF4"
+        root_path = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir, os.pardir))
+        self.custom_dir = os.path.abspath(os.path.join(root_path, "models", "KIMODO-Meta3_llm2vec_NF4"))
+        print(f"[LLM2VecEncoder] Initializing model from {self.custom_dir}...")
         print(f"[LLM2VecEncoder] Initialized (Waiting for first use to load weights)...")
         self.model = None
 
@@ -41,11 +44,11 @@ def unload(self):
                         ctypes.CDLL("libc.so.6").malloc_trim(0)
                     except Exception:
                         pass
-                elif platform.system() == "Windows":
-                    from kimodo.demo.memory_manager import release_system_memory
-                    release_system_memory()
-                torch.cuda.empty_cache()
-                torch.cuda.ipc_collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    torch.cuda.ipc_collect()
+                if torch.backends.mps.is_available():
+                    torch.mps.empty_cache()
 
     def reload(self):
         """Move from System RAM to VRAM."""
@@ -63,22 +66,27 @@ def reload(self):
 
         curr_device = self.get_device()
         if curr_device.type != "cuda":
-            print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
-            self.model.model.to("cuda:0")
+            if torch.backends.mps.is_available():
+                print(f"[LLM2VecEncoder] Moving weights to GPU (mps)...")
+                self.model.model.to("mps")
+            else:
+                print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
+                self.model.model.to("cuda:0")
             
             gc.collect()
-            import platform
+            
             if platform.system() == "Linux":
                 try:
                     import ctypes
                     ctypes.CDLL("libc.so.6").malloc_trim(0)
                 except Exception:
                     pass
-            elif platform.system() == "Windows":
-                from kimodo.demo.memory_manager import release_system_memory
-                release_system_memory()
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
             
             manager.log_memory_usage("Encoder Transfer Complete (RAM Reclaimed)")
         else:

From 2404be784e196862bef17c1bc159442b4c7cf34d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= <tarikyilmaz@yandex.com.tr>
Date: Fri, 24 Apr 2026 20:54:52 +0400
Subject: [PATCH 2/4] Restore Windows memory release in unload()

---
 kimodo/model/llm2vec/llm2vec_wrapper.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py
index 0c398ed..9a3e333 100644
--- a/kimodo/model/llm2vec/llm2vec_wrapper.py
+++ b/kimodo/model/llm2vec/llm2vec_wrapper.py
@@ -37,13 +37,16 @@ def unload(self):
                 print(f"[LLM2VecEncoder] Offloading 5.4GB model to System RAM...")
                 self.model.model.to("cpu")
                 gc.collect()
-                import platform
                 if platform.system() == "Linux":
                     try:
                         import ctypes
                         ctypes.CDLL("libc.so.6").malloc_trim(0)
                     except Exception:
                         pass
+                elif platform.system() == "Windows":
+                    from kimodo.demo.memory_manager import release_system_memory
+                    release_system_memory()
+
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                     torch.cuda.ipc_collect()

From 8c454488ec21f31840b78c4c2d82ff7c808cfb2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= <tarikyilmaz@yandex.com.tr>
Date: Fri, 24 Apr 2026 21:02:51 +0400
Subject: [PATCH 3/4] Restore Windows memory release in reload()

---
 kimodo/model/llm2vec/llm2vec_wrapper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py
index 9a3e333..9baa4f5 100644
--- a/kimodo/model/llm2vec/llm2vec_wrapper.py
+++ b/kimodo/model/llm2vec/llm2vec_wrapper.py
@@ -84,6 +84,9 @@ def reload(self):
                     ctypes.CDLL("libc.so.6").malloc_trim(0)
                 except Exception:
                     pass
+            elif platform.system() == "Windows":
+                from kimodo.demo.memory_manager import release_system_memory
+                release_system_memory()
 
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()

From 6ac2610840749f0afe2ae997b842748d1480b7d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= <tarikyilmaz@yandex.com.tr>
Date: Fri, 24 Apr 2026 21:11:58 +0400
Subject: [PATCH 4/4] Enable SSE2Neon only on ARM64

Co-authored-by: Copilot <copilot@github.com>
---
 MotionCorrection/CMakeLists.txt | 48 +++++++++++++++++----------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/MotionCorrection/CMakeLists.txt b/MotionCorrection/CMakeLists.txt
index ec0e376..06c2980 100644
--- a/MotionCorrection/CMakeLists.txt
+++ b/MotionCorrection/CMakeLists.txt
@@ -33,32 +33,34 @@ if(NOT Eigen3_FOUND)
     FetchContent_MakeAvailable(Eigen)
 endif()
 
-# If building on an ARM (aarch64/arm64) macOS, fetch SSE2Neon automatically
-# since SSE intrinsics may not be available and SSE2Neon provides mappings.
-find_package(SSE2Neon CONFIG QUIET)
-if(NOT SSE2Neon_FOUND AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
-    message(STATUS "SSE2Neon not found; fetching from GitHub...")
-    include(FetchContent)
-    FetchContent_Declare(
-        SSE2Neon
-        GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
-        GIT_TAG v1.9.1
-    )
-    FetchContent_MakeAvailable(SSE2Neon)
-endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    # When targeting 64-bit ARM (arm64/aarch64, e.g. Apple Silicon), fetch SSE2Neon
+    # automatically: x86 SSE intrinsics are unavailable there and SSE2Neon maps them to NEON.
+    find_package(SSE2Neon CONFIG QUIET)
+    if(NOT SSE2Neon_FOUND)
+        message(STATUS "SSE2Neon not found; fetching from GitHub...")
+        include(FetchContent)
+        FetchContent_Declare(
+            SSE2Neon
+            GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
+            GIT_TAG v1.9.1
+        )
+        FetchContent_MakeAvailable(SSE2Neon)
+    endif()
 
-# Try to resolve the FetchContent checkout path if the variable isn't defined
-if(NOT DEFINED SSE2Neon_SOURCE_DIR)
-    set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
-    if(EXISTS "${_possible}")
-        set(SSE2Neon_SOURCE_DIR "${_possible}")
-        message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}")
+    # FetchContent exports <lowercaseName>_SOURCE_DIR; the _deps path is only a fallback
+    if(NOT DEFINED SSE2Neon_SOURCE_DIR)
+        if(DEFINED sse2neon_SOURCE_DIR)
+            set(SSE2Neon_SOURCE_DIR "${sse2neon_SOURCE_DIR}")
+        elseif(EXISTS "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
+            set(SSE2Neon_SOURCE_DIR "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
+        endif()
     endif()
-endif()
 
-if(DEFINED SSE2Neon_SOURCE_DIR)
-    message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}")
-    include_directories(${SSE2Neon_SOURCE_DIR})
+    if(DEFINED SSE2Neon_SOURCE_DIR)
+        message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}")
+        include_directories(${SSE2Neon_SOURCE_DIR})
+    endif()
 endif()
 
 # Source files