Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions 3rdparty/cnpy/cnpy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
// cnpy - C++ library for loading and saving NumPy npy and npz files.
// This is a trimmed-down subset of the upstream project
// https://github.com/rogersce/cnpy
// that is sufficient for MLC-LLM's LoRA loader. Only the pieces required
// for reading .npz archives (zip of .npy files) are kept. The implementation
// is header-only for ease of integration on all platforms.
//
// License: MIT
#pragma once

#include <cstdint>
#include <cstring>
#include <map>
#include <string>
#include <vector>
#include <memory>
#include <stdexcept>
#include <fstream>
#include <sstream>

// We depend on <zlib>. It is available on Linux and macOS by default; on
// Windows we rely on the system's zlib development package (or vcpkg).
#include <zlib.h>

namespace cnpy {

// In-memory representation of a single NumPy array.
struct NpyArray {
  std::vector<size_t> shape;   // dimension sizes, outermost first
  bool fortran_order{false};   // true when the payload is column-major
  size_t word_size{0};         // bytes per element
  // Shared so NpyArray copies are cheap and all views stay alive together.
  std::shared_ptr<std::vector<char>> data_holder;

  template <typename T>
  T* data() {
    return reinterpret_cast<T*>(data_holder->data());
  }
  template <typename T>
  const T* data() const {
    return reinterpret_cast<const T*>(data_holder->data());
  }
};

namespace detail {

// Read a little-endian 2-byte unsigned int.
// NOTE(review): like the rest of this mini-reader, this assumes a
// little-endian host.
inline uint16_t read_le_uint16(std::istream& is) {
  uint16_t val = 0;
  is.read(reinterpret_cast<char*>(&val), sizeof(val));
  return val;
}

// Read a little-endian 4-byte unsigned int.
inline uint32_t read_le_uint32(std::istream& is) {
  uint32_t val = 0;
  is.read(reinterpret_cast<char*>(&val), sizeof(val));
  return val;
}

// Parse the .npy preamble (magic "\x93NUMPY", version, header dict) and fill
// arr.shape / arr.fortran_order. The dtype descriptor string (e.g. "<f4") is
// returned through descr_dtype.
inline void parse_npy_header(std::istream& is, NpyArray& arr, std::string& descr_dtype) {
  char magic[6];
  is.read(magic, 6);
  if (std::memcmp(magic, "\x93NUMPY", 6) != 0) {
    throw std::runtime_error("Invalid .npy file – bad magic");
  }
  uint8_t major = 0, minor = 0;
  is.read(reinterpret_cast<char*>(&major), 1);
  is.read(reinterpret_cast<char*>(&minor), 1);
  // Format 1.0 stores HEADER_LEN as 2 little-endian bytes; format 2.0 uses
  // 4 bytes. (The previous code read 4 bytes for both versions, which
  // consumed two bytes of the header dict on every version-1.0 file — the
  // version numpy writes by default — and also truncated v2 lengths to 16
  // bits.)
  size_t header_len = 0;
  if (major == 1) {
    header_len = read_le_uint16(is);
  } else if (major == 2) {
    header_len = read_le_uint32(is);
  } else {
    throw std::runtime_error("Unsupported .npy version");
  }
  std::string header(header_len, '\0');
  is.read(header.data(), static_cast<std::streamsize>(header_len));
  if (static_cast<size_t>(is.gcount()) != header_len) {
    throw std::runtime_error("Truncated .npy header");
  }

  // The header is a tiny Python dict literal, so simple string scanning is ok.
  auto loc_descr = header.find("'descr':");
  auto loc_shape = header.find("'shape':");
  auto loc_fortran = header.find("'fortran_order':");
  if (loc_descr == std::string::npos || loc_shape == std::string::npos) {
    throw std::runtime_error("Malformed .npy header");
  }
  // dtype string is delimited by single quotes.
  auto start = header.find("'", loc_descr + 7) + 1;
  auto end = header.find("'", start);
  descr_dtype = header.substr(start, end - start);

  // Parse the shape tuple, e.g. "(3, 4, 5)", "(7,)" or "()" for a scalar.
  start = header.find("(", loc_shape);
  end = header.find(")", start);
  std::string shape_str = header.substr(start + 1, end - start - 1);
  size_t pos = 0;
  while (true) {
    size_t comma = shape_str.find(',', pos);
    std::string dim = shape_str.substr(pos, comma - pos);
    // Trim blanks so entries like " 3" or a trailing ", " parse cleanly.
    const auto first_digit = dim.find_first_not_of(" \t");
    if (first_digit != std::string::npos) {
      arr.shape.push_back(static_cast<size_t>(std::stoul(dim.substr(first_digit))));
    }
    if (comma == std::string::npos) break;
    pos = comma + 1;
  }

  // fortran_order: stays false unless the header explicitly says "True".
  if (loc_fortran != std::string::npos) {
    size_t loc_true = header.find("True", loc_fortran);
    arr.fortran_order = (loc_true != std::string::npos && loc_true < header.find(',', loc_fortran));
  }
}

// Map a NumPy dtype descriptor to its element size in bytes.
// Only little-endian / byte-order-agnostic floats are supported, which is all
// the LoRA loader needs.
inline size_t dtype_to_word_size(const std::string& descr) {
  if (descr == "<f4" || descr == "|f4") return 4;
  if (descr == "<f2" || descr == "|f2") return 2;
  if (descr == "<f8" || descr == "|f8") return 8;
  throw std::runtime_error("Unsupported dtype in .npy: " + descr);
}

}  // namespace detail

// Load a single .npy from an std::istream positioned at the array.
// Throws std::runtime_error on malformed or truncated input.
inline NpyArray load_npy_stream(std::istream& is) {
  NpyArray arr;
  std::string dtype;
  detail::parse_npy_header(is, arr, dtype);
  arr.word_size = detail::dtype_to_word_size(dtype);
  size_t num_elems = 1;
  for (size_t d : arr.shape) num_elems *= d;
  const size_t bytes = num_elems * arr.word_size;
  arr.data_holder = std::make_shared<std::vector<char>>(bytes);
  is.read(arr.data_holder->data(), static_cast<std::streamsize>(bytes));
  if (static_cast<size_t>(is.gcount()) != bytes) {
    throw std::runtime_error("Truncated .npy payload");
  }
  return arr;
}

// Load *all* arrays from an .npz archive.
//
// Deliberately tiny ZIP reader: walks the local file headers ("PK\x03\x04")
// sequentially and supports only *stored* (method 0) members, which is what
// np.savez writes. DEFLATE members raise an error. The scan terminates
// cleanly when the central directory ("PK\x01\x02") or end-of-central-
// directory ("PK\x05\x06") record is reached — every valid zip ends with
// these, so treating them as errors (as the previous version did) made the
// loader throw after the last member. Member names have a trailing ".npy"
// stripped so keys match the names passed to np.savez, mirroring np.load.
inline std::map<std::string, NpyArray> npz_load(const std::string& fname) {
  std::map<std::string, NpyArray> arrays;
  std::ifstream fs(fname, std::ios::binary);
  if (!fs) throw std::runtime_error("Cannot open npz file: " + fname);

  const uint32_t kLocalSig = 0x04034b50;    // "PK\x03\x04" local file header
  const uint32_t kCentralSig = 0x02014b50;  // "PK\x01\x02" central directory
  const uint32_t kEndSig = 0x06054b50;      // "PK\x05\x06" end of central dir
  while (true) {
    uint32_t sig = detail::read_le_uint32(fs);
    if (!fs) break;                                   // plain EOF
    if (sig == kCentralSig || sig == kEndSig) break;  // past the last member
    if (sig != kLocalSig) {
      throw std::runtime_error("Unsupported compression in npz (need stored) or bad signature");
    }
    uint16_t version = detail::read_le_uint16(fs);
    uint16_t flags = detail::read_le_uint16(fs);
    uint16_t method = detail::read_le_uint16(fs);
    uint16_t modtime = detail::read_le_uint16(fs);
    uint16_t moddate = detail::read_le_uint16(fs);
    uint32_t crc = detail::read_le_uint32(fs);
    uint32_t comp_size = detail::read_le_uint32(fs);
    uint32_t uncomp_size = detail::read_le_uint32(fs);
    uint16_t name_len = detail::read_le_uint16(fs);
    uint16_t extra_len = detail::read_le_uint16(fs);
    (void)version;
    (void)modtime;
    (void)moddate;
    (void)crc;        // CRC is not verified by this mini-loader
    (void)comp_size;  // equals uncomp_size for stored entries
    if (!fs) throw std::runtime_error("Truncated npz file: " + fname);

    std::string member_name(name_len, '\0');
    fs.read(member_name.data(), name_len);
    fs.ignore(extra_len);  // skip the extra field

    if (flags & 0x0008) {
      // Bit 3 means the sizes live in a trailing data descriptor we do not
      // parse; bail out explicitly instead of mis-reading the payload.
      throw std::runtime_error("npz entry uses a data descriptor; mini-loader cannot parse it");
    }
    if (method != 0) {
      throw std::runtime_error("npz entry is compressed; mini-loader only supports stored");
    }

    // np.savez names members "<key>.npy"; expose just "<key>" like np.load.
    if (member_name.size() > 4 &&
        member_name.compare(member_name.size() - 4, 4, ".npy") == 0) {
      member_name.erase(member_name.size() - 4);
    }

    // Read the embedded .npy payload and parse it from memory.
    std::vector<char> buf(uncomp_size);
    fs.read(buf.data(), uncomp_size);
    std::stringstream ss(std::string(buf.data(), buf.size()));
    arrays[member_name] = load_npy_stream(ss);
  }
  return arrays;
}

// Load a single named array from an .npz archive.
// Throws std::runtime_error when the variable is absent.
inline NpyArray npz_load(const std::string& fname, const std::string& varname) {
  auto all = npz_load(fname);
  auto it = all.find(varname);
  if (it == all.end()) {
    throw std::runtime_error("Variable not found in npz: " + varname);
  }
  return it->second;
}

}  // namespace cnpy
21 changes: 21 additions & 0 deletions cpp/serve/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
add_library(mlc_llm_serve_objects OBJECT
  # ... existing code ...
  # NOTE: "//" is not a CMake comment — the previous placeholder would have
  # been passed to add_library as two bogus source arguments.
  lora.cc
  lora_manager.cc
)

# LoRA loader dependencies: the vendored cnpy mini-header lives under
# 3rdparty/cnpy and is included as "3rdparty/cnpy/cnpy.h".
target_include_directories(mlc_llm_serve_objects
  PRIVATE
  ${CMAKE_SOURCE_DIR}/3rdparty
)

# zlib is required for the mini cnpy header (<zlib.h>). We only include the
# headers and do not link against the library because the minimal ZIP reader
# avoids any zlib symbols. Still, add the library if available so future
# extensions (e.g. DEFLATE support) can rely on it.
find_package(ZLIB)
if(ZLIB_FOUND)
  target_include_directories(mlc_llm_serve_objects PRIVATE ${ZLIB_INCLUDE_DIRS})
  target_link_libraries(mlc_llm_serve_objects PRIVATE ${ZLIB_LIBRARIES})
endif()
33 changes: 33 additions & 0 deletions cpp/serve/lora.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <string>
#include "serve/lora_manager.h"

namespace mlc::serve {

static void UploadLora(const std::string& adapter_npz) {
// Alpha to be plumbed in later via manifest – use 1.0 for now.
mlc::serve::LoraManager::Global()->UploadAdapter(adapter_npz, /*alpha=*/1.0f);
}

} // namespace mlc::serve

// Expose a getter so Python (and other frontends) can retrieve the materialised
// delta tensor for a given full parameter name. The returned NDArray may be
// undefined if the key is missing (callers must check before use).
TVM_REGISTER_GLOBAL("mlc.get_lora_delta").set_body_typed([](const std::string& param_name) {
return mlc::serve::LoraManager::Global()->Lookup(param_name);
});

// Called once by Python side to tell C++ what device the runtime operates on.
// dev_type/dev_id are forwarded verbatim to LoraManager::SetDevice;
// presumably dev_type is a DLDeviceType code — confirm against the caller.
TVM_REGISTER_GLOBAL("mlc.set_active_device").set_body_typed([](int dev_type, int dev_id) {
mlc::serve::LoraManager::Global()->SetDevice(dev_type, dev_id);
});

// Register with TVM's FFI so that python can call this symbol via
// `tvm.get_global_func("mlc.serve.UploadLora")`.
// Takes the path of the adapter .npz archive to load.
TVM_REGISTER_GLOBAL("mlc.serve.UploadLora")
.set_body_typed([](const std::string& adapter_path) {
mlc::serve::UploadLora(adapter_path);
});
142 changes: 142 additions & 0 deletions cpp/serve/lora_manager.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include "serve/lora_manager.h"

#include <cstdint>
#include <cstring>
#include <fstream>
#include <mutex>
#include <regex>
#include <unordered_map>

#include "3rdparty/cnpy/cnpy.h"

namespace mlc::serve {

namespace {
// Mutex to guard singleton construction (call-once).
std::once_flag g_once;
// Process-wide manager instance; allocated once and intentionally never
// freed (lives for the duration of the process).
LoraManager* g_inst{nullptr};
}

// Return the process-wide LoraManager, constructing it exactly once in a
// thread-safe manner on first use.
LoraManager* LoraManager::Global() {
  std::call_once(g_once, [] { g_inst = new LoraManager(); });
  return g_inst;
}

void LoraManager::UploadAdapter(const std::string& adapter_npz_path, float alpha) {
// Load manifest JSON (same dir, same base + .json) to grab layer names if present.
std::string manifest_path = adapter_npz_path + ".json";
std::unordered_map<std::string, float> scaling_map; // full_param_name -> scaling
if (std::ifstream mf(manifest_path); mf.good()) {
std::string text((std::istreambuf_iterator<char>(mf)), std::istreambuf_iterator<char>());
// Very small regex-based parser assuming {"key": 1.0, "k2": 0.5}
std::regex kv_re("\"([^\"]+)\"\s*:\s*([0-9.+-eE]+)");
auto begin = std::sregex_iterator(text.begin(), text.end(), kv_re);
auto end = std::sregex_iterator();
for (auto it = begin; it != end; ++it) {
std::string k = (*it)[1].str();
float v = std::stof((*it)[2].str());
scaling_map[k] = v;
}
}

// Load every array in the .npz file via cnpy.
std::map<std::string, cnpy::NpyArray> arrays = cnpy::npz_load(adapter_npz_path);
tvm::Device cpu_dev{kDLCPU, 0};
for (const auto& kv : arrays) {
const std::string& name = kv.first; // e.g., "decoder.layers.0.mlp.w1.delta"
const cnpy::NpyArray& arr = kv.second;

bool promote_to_fp32 = (arr.word_size == 2);
DLDataType dtype;
dtype.code = kDLFloat;
dtype.lanes = 1;
dtype.bits = promote_to_fp32 ? 32 : (arr.word_size == 4 ? 32 : 64);

// Shape tuple
tvm::runtime::ShapeTuple shape(arr.shape.begin(), arr.shape.end());
size_t numel = 1;
for (auto d : arr.shape) numel *= d;

tvm::Device target_dev = runtime_device_;
tvm::runtime::NDArray nd;
bool alloc_failed = false;
try {
nd = tvm::runtime::NDArray::Empty(shape, dtype, target_dev);
} catch (const std::exception&) {
alloc_failed = true;
}
if (alloc_failed) {
target_dev = cpu_dev;
nd = tvm::runtime::NDArray::Empty(shape, dtype, cpu_dev);
}

if (promote_to_fp32) {
// Convert each half precision value to float32.
const uint16_t* src = reinterpret_cast<const uint16_t*>(arr.data_holder->data());
float* dst = static_cast<float*>(nd->data);
for (size_t i = 0; i < numel; ++i) {
uint16_t h = src[i];
// IEEE 754 half to float conversion (reference implementation)
uint32_t sign = (h & 0x8000) << 16;
uint32_t exp = (h & 0x7C00) >> 10;
uint32_t mant = (h & 0x03FF);
uint32_t f;
if (exp == 0) {
if (mant == 0) {
f = sign; // zero
} else {
// subnormal
exp = 1;
while ((mant & 0x0400) == 0) {
mant <<= 1;
exp -= 1;
}
mant &= 0x03FF;
exp += 127 - 15;
mant <<= 13;
f = sign | (exp << 23) | mant;
}
} else if (exp == 0x1F) {
// Inf or NaN
f = sign | 0x7F800000 | (mant << 13);
} else {
// Normalised
exp = exp + (127 - 15);
f = sign | (exp << 23) | (mant << 13);
}
dst[i] = *reinterpret_cast<float*>(&f);
}
} else {
nd.CopyFromBytes(arr.data_holder->data(), arr.data_holder->size());
}

// Apply alpha scaling if provided
auto it_scale = scaling_map.find(name);
if (it_scale != scaling_map.end()) {
float scale = it_scale->second * alpha;
if (dtype.bits == 32) {
float* p = static_cast<float*>(nd->data);
for (size_t i = 0; i < numel; ++i) p[i] *= scale;
}
}

// If we allocated on CPU but runtime device is GPU, copy now.
if (target_dev.device_type != runtime_device_.device_type || target_dev.device_id != runtime_device_.device_id) {
nd = nd.CopyTo(runtime_device_);
}

delta_map_[name] = nd;

// Keep the backing buffer alive for the lifetime of the manager. This is
// only necessary if we ever move to zero-copy NDArray creation, but is
// safe to do now.
owned_buffers_.push_back(arr.data_holder);
}
}

// Fetch the materialised delta tensor registered under `param_name`.
// Returns a default-constructed (undefined) NDArray when the name is absent.
tvm::runtime::NDArray LoraManager::Lookup(const std::string& param_name) const {
  auto it = delta_map_.find(param_name);
  return it == delta_map_.end() ? tvm::runtime::NDArray() : it->second;
}

} // namespace mlc::serve
Loading