Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion MotionCorrection/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,36 @@ if(NOT Eigen3_FOUND)
FetchContent_MakeAvailable(Eigen)
endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
  # On ARM (aarch64/arm64) hosts the x86 SSE intrinsics used by this project
  # are unavailable; sse2neon is a header-only library that maps SSE
  # intrinsics onto NEON equivalents.
  find_package(SSE2Neon CONFIG QUIET)
  if(NOT SSE2Neon_FOUND)
    message(STATUS "SSE2Neon not found; fetching from GitHub...")
    include(FetchContent)
    FetchContent_Declare(
      SSE2Neon
      GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
      GIT_TAG v1.9.1
    )
    FetchContent_MakeAvailable(SSE2Neon)
    # BUGFIX: FetchContent defines <lowercase-name>_SOURCE_DIR — here
    # `sse2neon_SOURCE_DIR` — never the mixed-case spelling checked below, so
    # the original always fell through to the hard-coded path probe.
    # Normalize the variable immediately after the fetch instead.
    if(DEFINED sse2neon_SOURCE_DIR AND NOT DEFINED SSE2Neon_SOURCE_DIR)
      set(SSE2Neon_SOURCE_DIR "${sse2neon_SOURCE_DIR}")
    endif()
  endif()

  # Last-resort fallback: probe FetchContent's default checkout location
  # (covers older CMake versions or a config-mode package that did not
  # export a source-dir variable).
  if(NOT DEFINED SSE2Neon_SOURCE_DIR)
    set(_possible "${CMAKE_BINARY_DIR}/_deps/sse2neon-src")
    if(EXISTS "${_possible}")
      set(SSE2Neon_SOURCE_DIR "${_possible}")
      message(STATUS "Resolved SSE2Neon_SOURCE_DIR -> ${SSE2Neon_SOURCE_DIR}")
    endif()
    unset(_possible)
  endif()

  if(DEFINED SSE2Neon_SOURCE_DIR)
    message(STATUS "Adding SSE2Neon include dir: ${SSE2Neon_SOURCE_DIR}")
    # NOTE(review): directory-scoped include_directories is kept so targets
    # defined before/without the per-target hookup still compile; prefer
    # target_include_directories on each consuming target long-term.
    include_directories("${SSE2Neon_SOURCE_DIR}")
  endif()
endif()

# Source files
set(MATH_SOURCES
src/cpp/Math/Matrix.cpp
Expand Down Expand Up @@ -76,7 +106,14 @@ if(MSVC)
else()
# GCC/Clang flags (also applies to MinGW on Windows)
# Enable SSE4.1 and AVX instructions for SIMD operations
target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
target_compile_options(motion_correction_cpp_base PRIVATE -Wall -Wextra -msse4.1 -mavx)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
add_definitions(-DFORCE_INLINE=inline)
if(DEFINED SSE2Neon_SOURCE_DIR)
target_include_directories(motion_correction_cpp_base PUBLIC ${SSE2Neon_SOURCE_DIR})
endif()
endif()
endif()

# Python bindings
Expand Down
25 changes: 24 additions & 1 deletion MotionCorrection/src/cpp/Math/SIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,30 @@
#pragma once

#include <stdint.h>
#include <immintrin.h>

// On Apple ARM (Apple Silicon) use sse2neon to map SSE intrinsics to NEON.
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#include "sse2neon.h"
#else
#include <immintrin.h>
#endif

// Polyfill for the AVX intrinsic _mm_permutevar_ps, which sse2neon does not
// supply on Apple ARM. Per the Intel specification, each output lane is
// selected from `a` using the low two bits of the corresponding 32-bit lane
// of `idx` (hence the `& 3` mask below).
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#ifndef _mm_permutevar_ps
static inline __m128 _mm_permutevar_ps(__m128 a, __m128i idx)
{
    float src[4];
    int32_t sel[4];
    float dst[4];
    _mm_storeu_ps(src, a);
    _mm_storeu_si128((__m128i*)sel, idx);
    // Gather one source lane per output lane; dst[0] becomes lane 0 of the
    // result, matching the original _mm_set_ps(out3, out2, out1, out0).
    for (int lane = 0; lane < 4; ++lane)
        dst[lane] = src[sel[lane] & 3];
    return _mm_loadu_ps(dst);
}
#endif
#endif

namespace SIMD
{
Expand Down
1 change: 1 addition & 0 deletions MotionCorrection/src/cpp/Math/Scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <math.h>
#include <stdint.h>
#include <stdlib.h>

//
// Scalar related methods
Expand Down
14 changes: 10 additions & 4 deletions kimodo/demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,11 @@ def prewarm_embedding_cache(self, model_name: str, model: object, custom_prompts
s.last_prompt_embeddings = None
s.last_prompt_lengths = None

torch.cuda.ipc_collect()
torch.cuda.empty_cache()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()

def build_constraint_tracks(
self, client: viser.ClientHandle, skeleton: SkeletonBase
Expand Down Expand Up @@ -652,8 +655,11 @@ def generate(
session.last_prompt_lengths = None

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
finally:
self._generation_lock.release()

Expand Down
28 changes: 20 additions & 8 deletions kimodo/demo/memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,11 @@ def purge_encoder_completely(self):
self.encoder.unload()

release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
self.log_memory_usage("RAM Reclamation Complete")

def get_free_vram(self) -> int:
Expand All @@ -136,8 +139,11 @@ def ensure_vram_capacity(self, required_bytes: int, device: str = "cuda:0", excl
if not self.offload_enabled or "cpu" in device:
return

torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
release_system_memory()

current_free = self.get_free_vram()
Expand Down Expand Up @@ -200,8 +206,11 @@ def touch_and_move(self, name: str, device: str):
if hasattr(model, "device"):
model.device = device

torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
release_system_memory()

def offload_model(self, name: str):
Expand All @@ -221,8 +230,11 @@ def offload_model(self, name: str):
model.device = "cpu"

release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
self.log_memory_usage(f"Offloaded '{name}'")

def report_residency(self):
Expand Down
45 changes: 31 additions & 14 deletions kimodo/model/llm2vec/llm2vec_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
"""LLM2Vec encoder wrapper for Kimodo text conditioning."""

import os
import gc
import platform
import os
import numpy as np
import torch
from torch import nn
Expand All @@ -22,8 +23,15 @@ def __init__(
super().__init__()
self.torch_dtype = getattr(torch, dtype)
self.llm_dim = llm_dim
# Update this path to where your model is actually located!
self.custom_dir = "D:\KIMODO-Meta3_llm2vec_NF4"

custom_path = r"path_to_your_Llama_text-encoders"
if os.path.exists(custom_path):
self.custom_dir = custom_path
else:
root_path = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir, os.pardir))
self.custom_dir = os.path.abspath(os.path.join(root_path, "models", "KIMODO-Meta3_llm2vec_NF4"))

print(f"[LLM2VecEncoder] Initializing model from {self.custom_dir}...")
print(f"[LLM2VecEncoder] Initialized (Waiting for first use to load weights)...")
self.model = None

Expand All @@ -34,7 +42,6 @@ def unload(self):
print(f"[LLM2VecEncoder] Offloading 5.4GB model to System RAM...")
self.model.model.to("cpu")
gc.collect()
import platform
if platform.system() == "Linux":
try:
import ctypes
Expand All @@ -44,8 +51,12 @@ def unload(self):
elif platform.system() == "Windows":
from kimodo.demo.memory_manager import release_system_memory
release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()

def reload(self):
"""Move from System RAM to VRAM."""
Expand All @@ -63,11 +74,15 @@ def reload(self):

curr_device = self.get_device()
if curr_device.type != "cuda":
print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
self.model.model.to("cuda:0")
if torch.backends.mps.is_available():
print(f"[LLM2VecEncoder] Moving weights to GPU (mps)...")
self.model.model.to("mps")
else:
print(f"[LLM2VecEncoder] Moving weights to GPU (cuda:0)...")
self.model.model.to("cuda:0")

gc.collect()
import platform

if platform.system() == "Linux":
try:
import ctypes
Expand All @@ -77,8 +92,12 @@ def reload(self):
elif platform.system() == "Windows":
from kimodo.demo.memory_manager import release_system_memory
release_system_memory()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()

manager.log_memory_usage("Encoder Transfer Complete (RAM Reclaimed)")
else:
Expand All @@ -94,8 +113,6 @@ def get_device(self):

def delete(self):
"""Reclaim RAM without deleting from disk unless absolutely necessary."""
# We no longer delete the model by default to avoid slow reloads.
# Just unload to CPU instead.
self.unload()

def __call__(self, text: list[str] | str):
Expand Down Expand Up @@ -124,4 +141,4 @@ def __call__(self, text: list[str] | str):
lengths = lengths[0]

encoded_text = torch.tensor(encoded_text).to(self.get_device())
return encoded_text, lengths
return encoded_text, lengths