From e45e36c33fb43ff2736c3161606529a67fac7037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= Date: Fri, 24 Apr 2026 20:52:48 +0400 Subject: [PATCH 1/4] Add Apple Silicon support Co-authored-by: Copilot --- MotionCorrection/CMakeLists.txt | 37 +++++++++++++++++++++++- MotionCorrection/src/cpp/Math/SIMD.h | 25 +++++++++++++++- MotionCorrection/src/cpp/Math/Scalar.h | 1 + kimodo/demo/app.py | 14 ++++++--- kimodo/demo/memory_manager.py | 28 ++++++++++++------ kimodo/model/llm2vec/llm2vec_wrapper.py | 38 +++++++++++++++---------- 6 files changed, 114 insertions(+), 29 deletions(-) diff --git a/MotionCorrection/CMakeLists.txt b/MotionCorrection/CMakeLists.txt index 35f8fa1..ec0e376 100644 --- a/MotionCorrection/CMakeLists.txt +++ b/MotionCorrection/CMakeLists.txt @@ -33,6 +33,34 @@ if(NOT Eigen3_FOUND) FetchContent_MakeAvailable(Eigen) endif() +# If building on an ARM (aarch64/arm64) macOS, fetch SSE2Neon automatically +# since SSE intrinsics may not be available and SSE2Neon provides mappings. +find_package(SSE2Neon CONFIG QUIET) +if(NOT SSE2Neon_FOUND AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + message(STATUS "SSE2Neon not found; fetching from GitHub...") + include(FetchContent) + FetchContent_Declare( + SSE2Neon + GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git + GIT_TAG v1.9.1 + ) + FetchContent_MakeAvailable(SSE2Neon) +endif() + +# Try to resolve the FetchContent checkout path if the variable isn't defined +if(NOT DEFINED SSE2Neon_SOURCE_DIR) + set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src") + if(EXISTS "${_possible}") + set(SSE2Neon_SOURCE_DIR "${_possible}") + message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}") + endif() +endif() + +if(DEFINED SSE2Neon_SOURCE_DIR) + message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}") + include_directories(${SSE2Neon_SOURCE_DIR}) +endif() + # Source files set(MATH_SOURCES src/cpp/Math/Matrix.cpp @@ -73,7 +101,14 @@ if(MSVC) else() # GCC/Clang flags (also applies to MinGW on Windows) # Enable SSE4.1 and AVX instructions for SIMD operations - target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx) + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + add_definitions(-DFORCE_INLINE=inline) + if(DEFINED SSE2Neon_SOURCE_DIR) + target_include_directories(motion_correction_cpp_base PUBLIC ${SSE2Neon_SOURCE_DIR}) + endif() + endif() endif() # Python bindings diff --git a/MotionCorrection/src/cpp/Math/SIMD.h b/MotionCorrection/src/cpp/Math/SIMD.h index dfc1ced..fc1aaa6 100644 --- a/MotionCorrection/src/cpp/Math/SIMD.h +++ b/MotionCorrection/src/cpp/Math/SIMD.h @@ -6,7 +6,30 @@ #pragma once #include -#include + +// On Apple ARM (Apple Silicon) use sse2neon to map SSE intrinsics to NEON. +#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) + #include "sse2neon.h" +#else + #include +#endif + +#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) +#ifndef _mm_permutevar_ps +static inline __m128 _mm_permutevar_ps(__m128 a, __m128i idx) +{ + float vals[4]; + int32_t indices[4]; + _mm_storeu_ps(vals, a); + _mm_storeu_si128((__m128i*)indices, idx); + float out0 = vals[indices[0] & 3]; + float out1 = vals[indices[1] & 3]; + float out2 = vals[indices[2] & 3]; + float out3 = vals[indices[3] & 3]; + return _mm_set_ps(out3, out2, out1, out0); +} +#endif +#endif namespace SIMD { diff --git a/MotionCorrection/src/cpp/Math/Scalar.h b/MotionCorrection/src/cpp/Math/Scalar.h index 425b759..426620f 100644 --- a/MotionCorrection/src/cpp/Math/Scalar.h +++ b/MotionCorrection/src/cpp/Math/Scalar.h @@ -12,6 +12,7 @@ #include #include +#include // // Scalar related methods diff --git a/kimodo/demo/app.py b/kimodo/demo/app.py index 3bb5766..38ae6fe 100644 --- a/kimodo/demo/app.py +++ b/kimodo/demo/app.py @@ -219,8 +219,11 @@ def prewarm_embedding_cache(self, model_name: str, model: object, custom_prompts s.last_prompt_embeddings = None s.last_prompt_lengths = None - torch.cuda.ipc_collect() - torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() def build_constraint_tracks( self, client: viser.ClientHandle, skeleton: SkeletonBase @@ -647,8 +650,11 @@ def generate( session.last_prompt_lengths = None gc.collect() - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() finally: self._generation_lock.release() diff --git a/kimodo/demo/memory_manager.py b/kimodo/demo/memory_manager.py index 5002f2d..aef2d8c 100644 --- a/kimodo/demo/memory_manager.py +++ b/kimodo/demo/memory_manager.py @@ -113,8 +113,11 @@ def purge_encoder_completely(self): self.encoder.unload() release_system_memory() - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() self.log_memory_usage("RAM Reclamation Complete") def get_free_vram(self) -> int: @@ -136,8 +139,11 @@ def ensure_vram_capacity(self, required_bytes: int, device: str = "cuda:0", excl if not self.offload_enabled or "cpu" in device: return - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() release_system_memory() current_free = self.get_free_vram() @@ -200,8 +206,11 @@ def touch_and_move(self, name: str, device: str): if hasattr(model, "device"): model.device = device - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() release_system_memory() def offload_model(self, name: str): @@ -221,8 +230,11 @@ def offload_model(self, name: str): model.device = "cpu" release_system_memory() - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() self.log_memory_usage(f"Offloaded '{name}'") def report_residency(self): diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py index 277f7dd..0c398ed 100644 --- a/kimodo/model/llm2vec/llm2vec_wrapper.py +++ b/kimodo/model/llm2vec/llm2vec_wrapper.py @@ -2,8 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 """LLM2Vec encoder wrapper for Kimodo text conditioning.""" -import os import gc +import platform +import os import numpy as np import torch from torch import nn @@ -23,7 +24,9 @@ def __init__( self.torch_dtype = getattr(torch, dtype) self.llm_dim = llm_dim # Update this path to where your model is actually located! - self.custom_dir = "/home/aero/kimodo/KIMODO-Meta3_llm2vec_NF4" + root_path = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir, os.pardir)) + self.custom_dir = os.path.abspath(os.path.join(root_path, "models", "KIMODO-Meta3_llm2vec_NF4")) + print(f"[LLM2VecEncoder] Initializing model from {self.custom_dir}...") print(f"[LLM2VecEncoder] Initialized (Waiting for first use to load weights)...") self.model = None @@ -41,11 +44,11 @@ def unload(self): ctypes.CDLL("libc.so.6").malloc_trim(0) except Exception: pass - elif platform.system() == "Windows": - from kimodo.demo.memory_manager import release_system_memory - release_system_memory() - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() def reload(self): """Move from System RAM to VRAM.""" @@ -63,22 +66,27 @@ def reload(self): curr_device = self.get_device() if curr_device.type != "cuda": - print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...") - self.model.model.to("cuda:0") + if torch.backends.mps.is_available(): + print(f"[LLM2VecEncoder] Moving weights to GPU (mps)...") + self.model.model.to("mps") + else: + print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...") + self.model.model.to("cuda:0") gc.collect() - import platform + if platform.system() == "Linux": try: import ctypes ctypes.CDLL("libc.so.6").malloc_trim(0) except Exception: pass - elif platform.system() == "Windows": - from kimodo.demo.memory_manager import release_system_memory - release_system_memory() - torch.cuda.empty_cache() - torch.cuda.ipc_collect() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() manager.log_memory_usage("Encoder Transfer Complete (RAM Reclaimed)") else: From 2404be784e196862bef17c1bc159442b4c7cf34d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= Date: Fri, 24 Apr 2026 20:54:52 +0400 Subject: [PATCH 2/4] Revert Windows memory management --- kimodo/model/llm2vec/llm2vec_wrapper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py index 0c398ed..9a3e333 100644 --- a/kimodo/model/llm2vec/llm2vec_wrapper.py +++ b/kimodo/model/llm2vec/llm2vec_wrapper.py @@ -37,13 +37,16 @@ def unload(self): print(f"[LLM2VecEncoder] Offloading 5.4GB model to System RAM...") self.model.model.to("cpu") gc.collect() - import platform if platform.system() == "Linux": try: import ctypes ctypes.CDLL("libc.so.6").malloc_trim(0) except Exception: pass + elif platform.system() == "Windows": + from kimodo.demo.memory_manager import release_system_memory + release_system_memory() + if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.ipc_collect() From 8c454488ec21f31840b78c4c2d82ff7c808cfb2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= Date: Fri, 24 Apr 2026 21:02:51 +0400 Subject: [PATCH 3/4] Revert Windows memory management --- kimodo/model/llm2vec/llm2vec_wrapper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py index 9a3e333..9baa4f5 100644 --- a/kimodo/model/llm2vec/llm2vec_wrapper.py +++ b/kimodo/model/llm2vec/llm2vec_wrapper.py @@ -84,6 +84,9 @@ def reload(self): ctypes.CDLL("libc.so.6").malloc_trim(0) except Exception: pass + elif platform.system() == "Windows": + from kimodo.demo.memory_manager import release_system_memory + release_system_memory() if torch.cuda.is_available(): torch.cuda.empty_cache() From 6ac2610840749f0afe2ae997b842748d1480b7d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tar=C4=B1k=20Y=C4=B1lmaz?= Date: Fri, 24 Apr 2026 21:11:58 +0400 Subject: [PATCH 4/4] Enable SSE2Neon only on ARM64 Co-authored-by: Copilot --- MotionCorrection/CMakeLists.txt | 48 +++++++++++++++++---------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/MotionCorrection/CMakeLists.txt b/MotionCorrection/CMakeLists.txt index ec0e376..06c2980 100644 --- a/MotionCorrection/CMakeLists.txt +++ b/MotionCorrection/CMakeLists.txt @@ -33,32 +33,34 @@ if(NOT Eigen3_FOUND) FetchContent_MakeAvailable(Eigen) endif() -# If building on an ARM (aarch64/arm64) macOS, fetch SSE2Neon automatically -# since SSE intrinsics may not be available and SSE2Neon provides mappings. -find_package(SSE2Neon CONFIG QUIET) -if(NOT SSE2Neon_FOUND AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") - message(STATUS "SSE2Neon not found; fetching from GitHub...") - include(FetchContent) - FetchContent_Declare( - SSE2Neon - GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git - GIT_TAG v1.9.1 - ) - FetchContent_MakeAvailable(SSE2Neon) -endif() +if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + # If building on an ARM (aarch64/arm64) macOS, fetch SSE2Neon automatically + # since SSE intrinsics may not be available and SSE2Neon provides mappings. + find_package(SSE2Neon CONFIG QUIET) + if(NOT SSE2Neon_FOUND) + message(STATUS "SSE2Neon not found; fetching from GitHub...") + include(FetchContent) + FetchContent_Declare( + SSE2Neon + GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git + GIT_TAG v1.9.1 + ) + FetchContent_MakeAvailable(SSE2Neon) + endif() -# Try to resolve the FetchContent checkout path if the variable isn't defined -if(NOT DEFINED SSE2Neon_SOURCE_DIR) - set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src") - if(EXISTS "${_possible}") - set(SSE2Neon_SOURCE_DIR "${_possible}") - message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}") + # Try to resolve the FetchContent checkout path if the variable isn't defined + if(NOT DEFINED SSE2Neon_SOURCE_DIR) + set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src") + if(EXISTS "${_possible}") + set(SSE2Neon_SOURCE_DIR "${_possible}") + message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}") + endif() endif() -endif() -if(DEFINED SSE2Neon_SOURCE_DIR) - message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}") - include_directories(${SSE2Neon_SOURCE_DIR}) + if(DEFINED SSE2Neon_SOURCE_DIR) + message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}") + include_directories(${SSE2Neon_SOURCE_DIR}) + endif() endif() # Source files