diff --git a/MotionCorrection/CMakeLists.txt b/MotionCorrection/CMakeLists.txt
index 0770898..15f4f7e 100644
--- a/MotionCorrection/CMakeLists.txt
+++ b/MotionCorrection/CMakeLists.txt
@@ -36,6 +36,36 @@ if(NOT Eigen3_FOUND)
     FetchContent_MakeAvailable(Eigen)
 endif()
 
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    # If building on ARM (aarch64/arm64) macOS, fetch SSE2Neon automatically,
+    # since SSE intrinsics are not available and SSE2Neon provides mappings.
+    find_package(SSE2Neon CONFIG QUIET)
+    if(NOT SSE2Neon_FOUND)
+        message(STATUS "SSE2Neon not found; fetching from GitHub...")
+        include(FetchContent)
+        FetchContent_Declare(
+            SSE2Neon
+            GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
+            GIT_TAG v1.9.1
+        )
+        FetchContent_MakeAvailable(SSE2Neon)
+    endif()
+
+    # Try to resolve the FetchContent checkout path if the variable isn't defined.
+    if(NOT DEFINED SSE2Neon_SOURCE_DIR)
+        set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
+        if(EXISTS "${_possible}")
+            set(SSE2Neon_SOURCE_DIR "${_possible}")
+            message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}")
+        endif()
+    endif()
+
+    if(DEFINED SSE2Neon_SOURCE_DIR)
+        message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}")
+        include_directories(${SSE2Neon_SOURCE_DIR})
+    endif()
+endif()
+
 # Source files
 set(MATH_SOURCES
     src/cpp/Math/Matrix.cpp
@@ -76,7 +106,14 @@ if(MSVC)
 else()
     # GCC/Clang flags (also applies to MinGW on Windows)
     # Enable SSE4.1 and AVX instructions for SIMD operations
-    target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
+    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+        target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
+    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+        add_definitions(-DFORCE_INLINE=inline)
+        if(DEFINED SSE2Neon_SOURCE_DIR)
+            target_include_directories(motion_correction_cpp_base PUBLIC ${SSE2Neon_SOURCE_DIR})
+        endif()
+    endif()
 endif()
 
 # Python bindings
diff --git a/MotionCorrection/src/cpp/Math/SIMD.h b/MotionCorrection/src/cpp/Math/SIMD.h
index dfc1ced..fc1aaa6 100644
--- a/MotionCorrection/src/cpp/Math/SIMD.h
+++ b/MotionCorrection/src/cpp/Math/SIMD.h
@@ -6,7 +6,30 @@
 #pragma once
 
 #include
-#include <immintrin.h>
+
+// On Apple ARM (Apple Silicon), use sse2neon to map SSE intrinsics to NEON.
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+    #include "sse2neon.h"
+#else
+    #include <immintrin.h>
+#endif
+
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#ifndef _mm_permutevar_ps
+static inline __m128 _mm_permutevar_ps(__m128 a, __m128i idx)
+{
+    float vals[4];
+    int32_t indices[4];
+    _mm_storeu_ps(vals, a);
+    _mm_storeu_si128((__m128i*)indices, idx);
+    float out0 = vals[indices[0] & 3];
+    float out1 = vals[indices[1] & 3];
+    float out2 = vals[indices[2] & 3];
+    float out3 = vals[indices[3] & 3];
+    return _mm_set_ps(out3, out2, out1, out0);
+}
+#endif
+#endif
 
 namespace SIMD
 {
diff --git a/MotionCorrection/src/cpp/Math/Scalar.h b/MotionCorrection/src/cpp/Math/Scalar.h
index 425b759..426620f 100644
--- a/MotionCorrection/src/cpp/Math/Scalar.h
+++ b/MotionCorrection/src/cpp/Math/Scalar.h
@@ -12,6 +12,7 @@
 
 #include
 #include
+#include
 
 //
 // Scalar related methods
diff --git a/kimodo/demo/app.py b/kimodo/demo/app.py
index 5baa663..fb39d30 100644
--- a/kimodo/demo/app.py
+++ b/kimodo/demo/app.py
@@ -224,8 +224,11 @@ def prewarm_embedding_cache(self, model_name: str, model: object, custom_prompts
         s.last_prompt_embeddings = None
         s.last_prompt_lengths = None
 
-        torch.cuda.ipc_collect()
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
 
     def build_constraint_tracks(
         self, client: viser.ClientHandle, skeleton: SkeletonBase
@@ -652,8 +655,11 @@ def generate(
             session.last_prompt_lengths = None
 
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
         finally:
             self._generation_lock.release()
 
diff --git a/kimodo/demo/memory_manager.py b/kimodo/demo/memory_manager.py
index 5002f2d..aef2d8c 100644
--- a/kimodo/demo/memory_manager.py
+++ b/kimodo/demo/memory_manager.py
@@ -113,8 +113,11 @@ def purge_encoder_completely(self):
         self.encoder.unload()
 
         release_system_memory()
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
         self.log_memory_usage("RAM Reclamation Complete")
 
     def get_free_vram(self) -> int:
@@ -136,8 +139,11 @@ def ensure_vram_capacity(self, required_bytes: int, device: str = "cuda:0", excl
         if not self.offload_enabled or "cpu" in device:
             return
 
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
         release_system_memory()
 
         current_free = self.get_free_vram()
@@ -200,8 +206,11 @@ def touch_and_move(self, name: str, device: str):
         if hasattr(model, "device"):
             model.device = device
 
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
         release_system_memory()
 
     def offload_model(self, name: str):
@@ -221,8 +230,11 @@ def offload_model(self, name: str):
             model.device = "cpu"
 
         release_system_memory()
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
         self.log_memory_usage(f"Offloaded '{name}'")
 
     def report_residency(self):
diff --git a/kimodo/model/llm2vec/llm2vec_wrapper.py b/kimodo/model/llm2vec/llm2vec_wrapper.py
index fba1b32..07c460f 100644
--- a/kimodo/model/llm2vec/llm2vec_wrapper.py
+++ b/kimodo/model/llm2vec/llm2vec_wrapper.py
@@ -2,8 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 """LLM2Vec encoder wrapper for Kimodo text conditioning."""
 
-import os
 import gc
+import platform
+import os
 import numpy as np
 import torch
 from torch import nn
@@ -22,8 +23,15 @@ def __init__(
         super().__init__()
         self.torch_dtype = getattr(torch, dtype)
         self.llm_dim = llm_dim
-        # Update this path to where your model is actually located!
-        self.custom_dir = "D:\KIMODO-Meta3_llm2vec_NF4"
+
+        custom_path = r"path_to_your_Llama_text-encoders"
+        if os.path.exists(custom_path):
+            self.custom_dir = custom_path
+        else:
+            root_path = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir, os.pardir))
+            self.custom_dir = os.path.abspath(os.path.join(root_path, "models", "KIMODO-Meta3_llm2vec_NF4"))
+
+        print(f"[LLM2VecEncoder] Initializing model from {self.custom_dir}...")
 
         print(f"[LLM2VecEncoder] Initialized (Waiting for first use to load weights)...")
         self.model = None
@@ -34,7 +42,6 @@ def unload(self):
         print(f"[LLM2VecEncoder] Offloading 5.4GB model to System RAM...")
         self.model.model.to("cpu")
         gc.collect()
-        import platform
 
         if platform.system() == "Linux":
             try:
                 import ctypes
@@ -44,8 +51,12 @@ def unload(self):
         elif platform.system() == "Windows":
             from kimodo.demo.memory_manager import release_system_memory
             release_system_memory()
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
 
     def reload(self):
         """Move from System RAM to VRAM."""
@@ -63,11 +74,15 @@ def reload(self):
         curr_device = self.get_device()
 
         if curr_device.type != "cuda":
-            print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
-            self.model.model.to("cuda:0")
+            if torch.backends.mps.is_available():
+                print(f"[LLM2VecEncoder] Moving weights to GPU (mps)...")
+                self.model.model.to("mps")
+            else:
+                print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
+                self.model.model.to("cuda:0")
 
             gc.collect()
-            import platform
+
             if platform.system() == "Linux":
                 try:
                     import ctypes
@@ -77,8 +92,12 @@ def reload(self):
             elif platform.system() == "Windows":
                 from kimodo.demo.memory_manager import release_system_memory
                 release_system_memory()
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+            if torch.backends.mps.is_available():
+                torch.mps.empty_cache()
 
             manager.log_memory_usage("Encoder Transfer Complete (RAM Reclaimed)")
         else:
@@ -94,8 +113,6 @@ def get_device(self):
 
     def delete(self):
         """Reclaim RAM without deleting from disk unless absolutely necessary."""
-        # We no longer delete the model by default to avoid slow reloads.
-        # Just unload to CPU instead.
         self.unload()
 
     def __call__(self, text: list[str] | str):
@@ -124,4 +141,4 @@ def __call__(self, text: list[str] | str):
             lengths = lengths[0]
 
         encoded_text = torch.tensor(encoded_text).to(self.get_device())
-        return encoded_text, lengths
+        return encoded_text, lengths
\ No newline at end of file