Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion MotionCorrection/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,36 @@ if(NOT Eigen3_FOUND)
FetchContent_MakeAvailable(Eigen)
endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
  # On ARM (aarch64/arm64) hosts the x86 SSE intrinsics used by this project
  # are unavailable; sse2neon is a header-only library that maps SSE
  # intrinsics onto NEON equivalents.
  find_package(SSE2Neon CONFIG QUIET)
  if(NOT SSE2Neon_FOUND)
    message(STATUS "SSE2Neon not found; fetching from GitHub...")
    include(FetchContent)
    FetchContent_Declare(
      SSE2Neon
      GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
      GIT_TAG v1.9.1
    )
    FetchContent_MakeAvailable(SSE2Neon)
    # BUGFIX: FetchContent defines <lowercase-name>_SOURCE_DIR — here
    # `sse2neon_SOURCE_DIR` — never the mixed-case spelling checked below, so
    # the original always fell through to the hard-coded path probe.
    # Normalize the variable immediately after the fetch instead.
    if(DEFINED sse2neon_SOURCE_DIR AND NOT DEFINED SSE2Neon_SOURCE_DIR)
      set(SSE2Neon_SOURCE_DIR "${sse2neon_SOURCE_DIR}")
    endif()
  endif()

  # Last-resort fallback: probe FetchContent's default checkout location
  # (covers older CMake versions or a config-mode package that did not
  # export a source-dir variable).
  if(NOT DEFINED SSE2Neon_SOURCE_DIR)
    set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
    if(EXISTS "${_possible}")
      set(SSE2Neon_SOURCE_DIR "${_possible}")
      message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}")
    endif()
    unset(_possible)
  endif()

  if(DEFINED SSE2Neon_SOURCE_DIR)
    message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}")
    # NOTE(review): directory-scoped include_directories is kept so targets
    # defined before/without the per-target hookup still compile; prefer
    # target_include_directories on each consuming target long-term.
    include_directories("${SSE2Neon_SOURCE_DIR}")
  endif()
endif()

# Source files
set(MATH_SOURCES
src/cpp/Math/Matrix.cpp
Expand Down Expand Up @@ -76,7 +106,14 @@ if(MSVC)
else()
# GCC/Clang flags (also applies to MinGW on Windows)
# Enable SSE4.1 and AVX instructions for SIMD operations
target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
add_definitions(-DFORCE_INLINE=inline)
if(DEFINED SSE2Neon_SOURCE_DIR)
target_include_directories(motion_correction_cpp_base PUBLIC ${SSE2Neon_SOURCE_DIR})
endif()
endif()
endif()

# Python bindings
Expand Down
25 changes: 24 additions & 1 deletion MotionCorrection/src/cpp/Math/SIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,30 @@
#pragma once

#include <stdint.h>
#include <immintrin.h>

// On Apple ARM (Apple Silicon) use sse2neon to map SSE intrinsics to NEON.
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#include "sse2neon.h"
#else
#include <immintrin.h>
#endif

// Polyfill for the AVX intrinsic _mm_permutevar_ps, which sse2neon does not
// supply on Apple ARM. Per the Intel specification, each output lane is
// selected from `a` using the low two bits of the corresponding 32-bit lane
// of `idx` (hence the `& 3` mask below).
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#ifndef _mm_permutevar_ps
static inline __m128 _mm_permutevar_ps(__m128 a, __m128i idx)
{
    float src[4];
    int32_t sel[4];
    float dst[4];
    _mm_storeu_ps(src, a);
    _mm_storeu_si128((__m128i*)sel, idx);
    // Gather one source lane per output lane; dst[0] becomes lane 0 of the
    // result, matching the original _mm_set_ps(out3, out2, out1, out0).
    for (int lane = 0; lane < 4; ++lane)
        dst[lane] = src[sel[lane] & 3];
    return _mm_loadu_ps(dst);
}
#endif
#endif

namespace SIMD
{
Expand Down
1 change: 1 addition & 0 deletions MotionCorrection/src/cpp/Math/Scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <math.h>
#include <stdint.h>
#include <stdlib.h>

//
// Scalar related methods
Expand Down
14 changes: 10 additions & 4 deletions kimodo/demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,11 @@ def prewarm_embedding_cache(self, model_name: str, model: object, custom_prompts
s.last_prompt_embeddings = None
s.last_prompt_lengths = None

torch.cuda.ipc_collect()
torch.cuda.empty_cache()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()

def build_constraint_tracks(
self, client: viser.ClientHandle, skeleton: SkeletonBase
Expand Down Expand Up @@ -652,8 +655,11 @@ def generate(
session.last_prompt_lengths = None

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
finally:
self._generation_lock.release()

Expand Down
28 changes: 20 additions & 8 deletions kimodo/demo/memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,11 @@ def purge_encoder_completely(self):
self.encoder.unload()

release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
self.log_memory_usage("RAM Reclamation Complete")

def get_free_vram(self) -> int:
Expand All @@ -136,8 +139,11 @@ def ensure_vram_capacity(self, required_bytes: int, device: str = "cuda:0", excl
if not self.offload_enabled or "cpu" in device:
return

torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
release_system_memory()

current_free = self.get_free_vram()
Expand Down Expand Up @@ -200,8 +206,11 @@ def touch_and_move(self, name: str, device: str):
if hasattr(model, "device"):
model.device = device

torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
release_system_memory()

def offload_model(self, name: str):
Expand All @@ -221,8 +230,11 @@ def offload_model(self, name: str):
model.device = "cpu"

release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
self.log_memory_usage(f"Offloaded '{name}'")

def report_residency(self):
Expand Down
45 changes: 31 additions & 14 deletions kimodo/model/llm2vec/llm2vec_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
"""LLM2Vec encoder wrapper for Kimodo text conditioning."""

import os
import gc
import platform
import os
import numpy as np
import torch
from torch import nn
Expand All @@ -22,8 +23,15 @@ def __init__(
super().__init__()
self.torch_dtype = getattr(torch, dtype)
self.llm_dim = llm_dim
# Update this path to where your model is actually located!
self.custom_dir = "D:\KIMODO-Meta3_llm2vec_NF4"

custom_path = r"path_to_your_Llama_text-encoders"
if os.path.exists(custom_path):
self.custom_dir = custom_path
else:
root_path = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir, os.pardir))
self.custom_dir = os.path.abspath(os.path.join(root_path, "models", "KIMODO-Meta3_llm2vec_NF4"))

print(f"[LLM2VecEncoder] Initializing model from {self.custom_dir}...")
print(f"[LLM2VecEncoder] Initialized (Waiting for first use to load weights)...")
self.model = None

Expand All @@ -34,7 +42,6 @@ def unload(self):
print(f"[LLM2VecEncoder] Offloading 5.4GB model to System RAM...")
self.model.model.to("cpu")
gc.collect()
import platform
if platform.system() == "Linux":
try:
import ctypes
Expand All @@ -44,8 +51,12 @@ def unload(self):
elif platform.system() == "Windows":
from kimodo.demo.memory_manager import release_system_memory
release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()

def reload(self):
"""Move from System RAM to VRAM."""
Expand All @@ -63,11 +74,15 @@ def reload(self):

curr_device = self.get_device()
if curr_device.type != "cuda":
print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
self.model.model.to("cuda:0")
if torch.backends.mps.is_available():
print(f"[LLM2VecEncoder] Moving weights to GPU (mps)...")
self.model.model.to("mps")
else:
print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
self.model.model.to("cuda:0")

gc.collect()
import platform

if platform.system() == "Linux":
try:
import ctypes
Expand All @@ -77,8 +92,12 @@ def reload(self):
elif platform.system() == "Windows":
from kimodo.demo.memory_manager import release_system_memory
release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()

manager.log_memory_usage("Encoder Transfer Complete (RAM Reclaimed)")
else:
Expand All @@ -94,8 +113,6 @@ def get_device(self):

def delete(self):
"""Reclaim RAM without deleting from disk unless absolutely necessary."""
# We no longer delete the model by default to avoid slow reloads.
# Just unload to CPU instead.
self.unload()

def __call__(self, text: list[str] | str):
Expand Down Expand Up @@ -124,4 +141,4 @@ def __call__(self, text: list[str] | str):
lengths = lengths[0]

encoded_text = torch.tensor(encoded_text).to(self.get_device())
return encoded_text, lengths
return encoded_text, lengths