Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions 3rdparty/cnpy/cnpy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
// cnpy - C++ library for loading and saving NumPy npy and npz files.
// This is a trimmed-down subset of the upstream project
// https://github.com/rogersce/cnpy
// that is sufficient for MLC-LLM's LoRA loader. Only the pieces required
// for reading .npz archives (zip of .npy files) are kept. The implementation
// is header-only for ease of integration on all platforms.
//
// License: MIT
#pragma once

#include <cstdint>
#include <cstring>
#include <map>
#include <string>
#include <vector>
#include <memory>
#include <stdexcept>
#include <fstream>
#include <sstream>

// We depend on <zlib>. It is available on Linux and macOS by default; on
// Windows we rely on the system's zlib development package (or vcpkg).
#include <zlib.h>

namespace cnpy {

// In-memory representation of a single NumPy array.
struct NpyArray {
  std::vector<size_t> shape;   // dimension sizes, outermost first
  bool fortran_order{false};   // true when the payload is column-major
  size_t word_size{0};         // bytes per element
  // Shared so NpyArray copies are cheap and all views stay alive together.
  std::shared_ptr<std::vector<char>> data_holder;

  template <typename T>
  T* data() {
    return reinterpret_cast<T*>(data_holder->data());
  }
  template <typename T>
  const T* data() const {
    return reinterpret_cast<const T*>(data_holder->data());
  }
};

namespace detail {

// Read a little-endian 2-byte unsigned int.
// NOTE(review): like the rest of this mini-reader, this assumes a
// little-endian host.
inline uint16_t read_le_uint16(std::istream& is) {
  uint16_t val = 0;
  is.read(reinterpret_cast<char*>(&val), sizeof(val));
  return val;
}

// Read a little-endian 4-byte unsigned int.
inline uint32_t read_le_uint32(std::istream& is) {
  uint32_t val = 0;
  is.read(reinterpret_cast<char*>(&val), sizeof(val));
  return val;
}

// Parse the .npy preamble (magic "\x93NUMPY", version, header dict) and fill
// arr.shape / arr.fortran_order. The dtype descriptor string (e.g. "<f4") is
// returned through descr_dtype.
inline void parse_npy_header(std::istream& is, NpyArray& arr, std::string& descr_dtype) {
  char magic[6];
  is.read(magic, 6);
  if (std::memcmp(magic, "\x93NUMPY", 6) != 0) {
    throw std::runtime_error("Invalid .npy file – bad magic");
  }
  uint8_t major = 0, minor = 0;
  is.read(reinterpret_cast<char*>(&major), 1);
  is.read(reinterpret_cast<char*>(&minor), 1);
  // Format 1.0 stores HEADER_LEN as 2 little-endian bytes; format 2.0 uses
  // 4 bytes. (The previous code read 4 bytes for both versions, which
  // consumed two bytes of the header dict on every version-1.0 file — the
  // version numpy writes by default — and also truncated v2 lengths to 16
  // bits.)
  size_t header_len = 0;
  if (major == 1) {
    header_len = read_le_uint16(is);
  } else if (major == 2) {
    header_len = read_le_uint32(is);
  } else {
    throw std::runtime_error("Unsupported .npy version");
  }
  std::string header(header_len, '\0');
  is.read(header.data(), static_cast<std::streamsize>(header_len));
  if (static_cast<size_t>(is.gcount()) != header_len) {
    throw std::runtime_error("Truncated .npy header");
  }

  // The header is a tiny Python dict literal, so simple string scanning is ok.
  auto loc_descr = header.find("'descr':");
  auto loc_shape = header.find("'shape':");
  auto loc_fortran = header.find("'fortran_order':");
  if (loc_descr == std::string::npos || loc_shape == std::string::npos) {
    throw std::runtime_error("Malformed .npy header");
  }
  // dtype string is delimited by single quotes.
  auto start = header.find("'", loc_descr + 7) + 1;
  auto end = header.find("'", start);
  descr_dtype = header.substr(start, end - start);

  // Parse the shape tuple, e.g. "(3, 4, 5)", "(7,)" or "()" for a scalar.
  start = header.find("(", loc_shape);
  end = header.find(")", start);
  std::string shape_str = header.substr(start + 1, end - start - 1);
  size_t pos = 0;
  while (true) {
    size_t comma = shape_str.find(',', pos);
    std::string dim = shape_str.substr(pos, comma - pos);
    // Trim blanks so entries like " 3" or a trailing ", " parse cleanly.
    const auto first_digit = dim.find_first_not_of(" \t");
    if (first_digit != std::string::npos) {
      arr.shape.push_back(static_cast<size_t>(std::stoul(dim.substr(first_digit))));
    }
    if (comma == std::string::npos) break;
    pos = comma + 1;
  }

  // fortran_order: stays false unless the header explicitly says "True".
  if (loc_fortran != std::string::npos) {
    size_t loc_true = header.find("True", loc_fortran);
    arr.fortran_order = (loc_true != std::string::npos && loc_true < header.find(',', loc_fortran));
  }
}

// Map a NumPy dtype descriptor to its element size in bytes.
// Only little-endian / byte-order-agnostic floats are supported, which is all
// the LoRA loader needs.
inline size_t dtype_to_word_size(const std::string& descr) {
  if (descr == "<f4" || descr == "|f4") return 4;
  if (descr == "<f2" || descr == "|f2") return 2;
  if (descr == "<f8" || descr == "|f8") return 8;
  throw std::runtime_error("Unsupported dtype in .npy: " + descr);
}

}  // namespace detail

// Load a single .npy from an std::istream positioned at the array.
// Throws std::runtime_error on malformed or truncated input.
inline NpyArray load_npy_stream(std::istream& is) {
  NpyArray arr;
  std::string dtype;
  detail::parse_npy_header(is, arr, dtype);
  arr.word_size = detail::dtype_to_word_size(dtype);
  size_t num_elems = 1;
  for (size_t d : arr.shape) num_elems *= d;
  const size_t bytes = num_elems * arr.word_size;
  arr.data_holder = std::make_shared<std::vector<char>>(bytes);
  is.read(arr.data_holder->data(), static_cast<std::streamsize>(bytes));
  if (static_cast<size_t>(is.gcount()) != bytes) {
    throw std::runtime_error("Truncated .npy payload");
  }
  return arr;
}

// Load *all* arrays from an .npz archive.
//
// Deliberately tiny ZIP reader: walks the local file headers ("PK\x03\x04")
// sequentially and supports only *stored* (method 0) members, which is what
// np.savez writes. DEFLATE members raise an error. The scan terminates
// cleanly when the central directory ("PK\x01\x02") or end-of-central-
// directory ("PK\x05\x06") record is reached — every valid zip ends with
// these, so treating them as errors (as the previous version did) made the
// loader throw after the last member. Member names have a trailing ".npy"
// stripped so keys match the names passed to np.savez, mirroring np.load.
inline std::map<std::string, NpyArray> npz_load(const std::string& fname) {
  std::map<std::string, NpyArray> arrays;
  std::ifstream fs(fname, std::ios::binary);
  if (!fs) throw std::runtime_error("Cannot open npz file: " + fname);

  const uint32_t kLocalSig = 0x04034b50;    // "PK\x03\x04" local file header
  const uint32_t kCentralSig = 0x02014b50;  // "PK\x01\x02" central directory
  const uint32_t kEndSig = 0x06054b50;      // "PK\x05\x06" end of central dir
  while (true) {
    uint32_t sig = detail::read_le_uint32(fs);
    if (!fs) break;                                   // plain EOF
    if (sig == kCentralSig || sig == kEndSig) break;  // past the last member
    if (sig != kLocalSig) {
      throw std::runtime_error("Unsupported compression in npz (need stored) or bad signature");
    }
    uint16_t version = detail::read_le_uint16(fs);
    uint16_t flags = detail::read_le_uint16(fs);
    uint16_t method = detail::read_le_uint16(fs);
    uint16_t modtime = detail::read_le_uint16(fs);
    uint16_t moddate = detail::read_le_uint16(fs);
    uint32_t crc = detail::read_le_uint32(fs);
    uint32_t comp_size = detail::read_le_uint32(fs);
    uint32_t uncomp_size = detail::read_le_uint32(fs);
    uint16_t name_len = detail::read_le_uint16(fs);
    uint16_t extra_len = detail::read_le_uint16(fs);
    (void)version;
    (void)modtime;
    (void)moddate;
    (void)crc;        // CRC is not verified by this mini-loader
    (void)comp_size;  // equals uncomp_size for stored entries
    if (!fs) throw std::runtime_error("Truncated npz file: " + fname);

    std::string member_name(name_len, '\0');
    fs.read(member_name.data(), name_len);
    fs.ignore(extra_len);  // skip the extra field

    if (flags & 0x0008) {
      // Bit 3 means the sizes live in a trailing data descriptor we do not
      // parse; bail out explicitly instead of mis-reading the payload.
      throw std::runtime_error("npz entry uses a data descriptor; mini-loader cannot parse it");
    }
    if (method != 0) {
      throw std::runtime_error("npz entry is compressed; mini-loader only supports stored");
    }

    // np.savez names members "<key>.npy"; expose just "<key>" like np.load.
    if (member_name.size() > 4 &&
        member_name.compare(member_name.size() - 4, 4, ".npy") == 0) {
      member_name.erase(member_name.size() - 4);
    }

    // Read the embedded .npy payload and parse it from memory.
    std::vector<char> buf(uncomp_size);
    fs.read(buf.data(), uncomp_size);
    std::stringstream ss(std::string(buf.data(), buf.size()));
    arrays[member_name] = load_npy_stream(ss);
  }
  return arrays;
}

// Load a single named array from an .npz archive.
// Throws std::runtime_error when the variable is absent.
inline NpyArray npz_load(const std::string& fname, const std::string& varname) {
  auto all = npz_load(fname);
  auto it = all.find(varname);
  if (it == all.end()) {
    throw std::runtime_error("Variable not found in npz: " + varname);
  }
  return it->second;
}

}  // namespace cnpy
21 changes: 21 additions & 0 deletions cpp/serve/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
add_library(mlc_llm_serve_objects OBJECT
  # ... existing code ...
  # NOTE: "//" is not a CMake comment — the previous placeholder would have
  # been passed to add_library as two bogus source arguments.
  lora.cc
  lora_manager.cc
)

# LoRA loader dependencies: the vendored cnpy mini-header lives under
# 3rdparty/cnpy and is included as "3rdparty/cnpy/cnpy.h".
target_include_directories(mlc_llm_serve_objects
  PRIVATE
  ${CMAKE_SOURCE_DIR}/3rdparty
)

# zlib is required for the mini cnpy header (<zlib.h>). We only include the
# headers and do not link against the library because the minimal ZIP reader
# avoids any zlib symbols. Still, add the library if available so future
# extensions (e.g. DEFLATE support) can rely on it.
find_package(ZLIB)
if(ZLIB_FOUND)
  target_include_directories(mlc_llm_serve_objects PRIVATE ${ZLIB_INCLUDE_DIRS})
  target_link_libraries(mlc_llm_serve_objects PRIVATE ${ZLIB_LIBRARIES})
endif()
33 changes: 33 additions & 0 deletions cpp/serve/lora.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <string>
#include "serve/lora_manager.h"

namespace mlc::serve {

static void UploadLora(const std::string& adapter_npz) {
// Alpha to be plumbed in later via manifest – use 1.0 for now.
mlc::serve::LoraManager::Global()->UploadAdapter(adapter_npz, /*alpha=*/1.0f);
}

} // namespace mlc::serve

// Expose a getter so Python (and other frontends) can retrieve the materialised
// delta tensor for a given full parameter name. The returned NDArray may be
// undefined if the key is missing (callers must check before use).
TVM_REGISTER_GLOBAL("mlc.get_lora_delta").set_body_typed([](const std::string& param_name) {
return mlc::serve::LoraManager::Global()->Lookup(param_name);
});

// Called once by Python side to tell C++ what device the runtime operates on.
// dev_type/dev_id are forwarded verbatim to LoraManager::SetDevice;
// presumably dev_type is a DLDeviceType code — confirm against the caller.
TVM_REGISTER_GLOBAL("mlc.set_active_device").set_body_typed([](int dev_type, int dev_id) {
mlc::serve::LoraManager::Global()->SetDevice(dev_type, dev_id);
});

// Register with TVM's FFI so that python can call this symbol via
// `tvm.get_global_func("mlc.serve.UploadLora")`.
// Takes the path of the adapter .npz archive to load.
TVM_REGISTER_GLOBAL("mlc.serve.UploadLora")
.set_body_typed([](const std::string& adapter_path) {
mlc::serve::UploadLora(adapter_path);
});
142 changes: 142 additions & 0 deletions cpp/serve/lora_manager.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include "serve/lora_manager.h"

#include <cstdint>
#include <cstring>
#include <fstream>
#include <mutex>
#include <regex>
#include <unordered_map>

#include "3rdparty/cnpy/cnpy.h"

namespace mlc::serve {

namespace {
// Mutex to guard singleton construction (call-once).
std::once_flag g_once;
// Process-wide manager instance; allocated once and intentionally never
// freed (lives for the duration of the process).
LoraManager* g_inst{nullptr};
}

// Return the process-wide LoraManager, constructing it exactly once in a
// thread-safe manner on first use.
LoraManager* LoraManager::Global() {
  std::call_once(g_once, [] { g_inst = new LoraManager(); });
  return g_inst;
}

void LoraManager::UploadAdapter(const std::string& adapter_npz_path, float alpha) {
// Load manifest JSON (same dir, same base + .json) to grab layer names if present.
std::string manifest_path = adapter_npz_path + ".json";
std::unordered_map<std::string, float> scaling_map; // full_param_name -> scaling
if (std::ifstream mf(manifest_path); mf.good()) {
std::string text((std::istreambuf_iterator<char>(mf)), std::istreambuf_iterator<char>());
// Very small regex-based parser assuming {"key": 1.0, "k2": 0.5}
std::regex kv_re("\"([^\"]+)\"\s*:\s*([0-9.+-eE]+)");
auto begin = std::sregex_iterator(text.begin(), text.end(), kv_re);
auto end = std::sregex_iterator();
for (auto it = begin; it != end; ++it) {
std::string k = (*it)[1].str();
float v = std::stof((*it)[2].str());
scaling_map[k] = v;
}
}

// Load every array in the .npz file via cnpy.
std::map<std::string, cnpy::NpyArray> arrays = cnpy::npz_load(adapter_npz_path);
tvm::Device cpu_dev{kDLCPU, 0};
for (const auto& kv : arrays) {
const std::string& name = kv.first; // e.g., "decoder.layers.0.mlp.w1.delta"
const cnpy::NpyArray& arr = kv.second;

bool promote_to_fp32 = (arr.word_size == 2);
DLDataType dtype;
dtype.code = kDLFloat;
dtype.lanes = 1;
dtype.bits = promote_to_fp32 ? 32 : (arr.word_size == 4 ? 32 : 64);

// Shape tuple
tvm::runtime::ShapeTuple shape(arr.shape.begin(), arr.shape.end());
size_t numel = 1;
for (auto d : arr.shape) numel *= d;

tvm::Device target_dev = runtime_device_;
tvm::runtime::NDArray nd;
bool alloc_failed = false;
try {
nd = tvm::runtime::NDArray::Empty(shape, dtype, target_dev);
} catch (const std::exception&) {
alloc_failed = true;
}
if (alloc_failed) {
target_dev = cpu_dev;
nd = tvm::runtime::NDArray::Empty(shape, dtype, cpu_dev);
}

if (promote_to_fp32) {
// Convert each half precision value to float32.
const uint16_t* src = reinterpret_cast<const uint16_t*>(arr.data_holder->data());
float* dst = static_cast<float*>(nd->data);
for (size_t i = 0; i < numel; ++i) {
uint16_t h = src[i];
// IEEE 754 half to float conversion (reference implementation)
uint32_t sign = (h & 0x8000) << 16;
uint32_t exp = (h & 0x7C00) >> 10;
uint32_t mant = (h & 0x03FF);
uint32_t f;
if (exp == 0) {
if (mant == 0) {
f = sign; // zero
} else {
// subnormal
exp = 1;
while ((mant & 0x0400) == 0) {
mant <<= 1;
exp -= 1;
}
mant &= 0x03FF;
exp += 127 - 15;
mant <<= 13;
f = sign | (exp << 23) | mant;
}
} else if (exp == 0x1F) {
// Inf or NaN
f = sign | 0x7F800000 | (mant << 13);
} else {
// Normalised
exp = exp + (127 - 15);
f = sign | (exp << 23) | (mant << 13);
}
dst[i] = *reinterpret_cast<float*>(&f);
}
} else {
nd.CopyFromBytes(arr.data_holder->data(), arr.data_holder->size());
}

// Apply alpha scaling if provided
auto it_scale = scaling_map.find(name);
if (it_scale != scaling_map.end()) {
float scale = it_scale->second * alpha;
if (dtype.bits == 32) {
float* p = static_cast<float*>(nd->data);
for (size_t i = 0; i < numel; ++i) p[i] *= scale;
}
}

// If we allocated on CPU but runtime device is GPU, copy now.
if (target_dev.device_type != runtime_device_.device_type || target_dev.device_id != runtime_device_.device_id) {
nd = nd.CopyTo(runtime_device_);
}

delta_map_[name] = nd;

// Keep the backing buffer alive for the lifetime of the manager. This is
// only necessary if we ever move to zero-copy NDArray creation, but is
// safe to do now.
owned_buffers_.push_back(arr.data_holder);
}
}

// Fetch the materialised delta tensor registered under `param_name`.
// Returns a default-constructed (undefined) NDArray when the name is absent.
tvm::runtime::NDArray LoraManager::Lookup(const std::string& param_name) const {
  auto it = delta_map_.find(param_name);
  return it == delta_map_.end() ? tvm::runtime::NDArray() : it->second;
}

} // namespace mlc::serve
Loading