diff --git a/.gitignore b/.gitignore index 41bbbee..0ff7a2d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,13 @@ .idea/ .vscode/ *.code-workspace + +# Local cargo config (machine-specific: LIBCLANG_PATH etc.) +.cargo/config.toml + +# Claude Code local settings +.claude/ + +# Local temporary build artifacts +yes/ +target-codex/ diff --git a/Cargo.lock b/Cargo.lock index 13798b7..0b8ea51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -56,7 +56,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -67,7 +67,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -334,6 +334,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -353,7 +359,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -600,6 +606,7 @@ dependencies = [ "tracing", "tracing-subscriber", "uuid", + "windows-sys 0.59.0", ] [[package]] @@ -609,6 +616,8 @@ dependencies = [ "bindgen", "cc", "cmake", + "dunce", + "serde_json", ] [[package]] @@ -789,7 +798,7 @@ checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -817,7 +826,7 @@ version = 
"0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -990,7 +999,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -1136,7 +1145,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -1186,7 +1195,7 @@ dependencies = [ "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -1232,7 +1241,7 @@ dependencies = [ "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -1687,6 +1696,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.61.2" diff --git a/Cargo.toml b/Cargo.toml index a269105..2fb43bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,5 +52,13 @@ memmap2 = "0.9" linked-hash-map = "0.5" uuid = { version = "1", features = ["v4"] } +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.59", features = [ + "Win32_Foundation", + "Win32_Storage_FileSystem", + "Win32_System_IO", + "Win32_System_Memory", +] } + [dev-dependencies] tempfile = "3" diff --git a/README.md b/README.md index 3a53576..b0abf7b 100644 --- a/README.md +++ b/README.md @@ -5,84 +5,371 @@ | _ | |_| | |_) | |_| | | | (_| | |_| |_|\__, | .__/ \__,_|_| \__,_| |___/|_| - Run models too big for your Mac's memory + メモリに収まらないモデルを動かす / Run models too big for your memory ``` -Hypura is a storage-tier-aware LLM inference 
scheduler for Apple Silicon. -It places model tensors across GPU, RAM, and NVMe tiers based on access -patterns, bandwidth costs, and hardware capabilities — enabling models -that exceed physical memory to run without crashing the system. +--- -Run a 31 GB Mixtral 8x7B on a 32 GB Mac Mini at 2.2 tok/s. A 40 GB Llama 70B at 0.3 tok/s. Vanilla llama.cpp crashes on both. +## 概要 + +Hypura はストレージ階層を意識した LLM 推論スケジューラです。 +モデルのテンソルを GPU・RAM・NVMe にアクセスパターン・帯域幅・ハードウェア特性に基づいて自動配置し、**物理メモリを超える大規模モデルをクラッシュなしに動作させます**。 + +- 31 GB の Mixtral 8x7B を 32 GB マシンで **2.2 tok/s** で実行 +- 40 GB の Llama 3.3 70B を 32 GB マシンで **0.3 tok/s** で実行 +- vanilla llama.cpp は両方 OOM でクラッシュ + +**対応プラットフォーム:** + +| プラットフォーム | GPU | NVMe I/O | +|---|---|---| +| macOS (Apple Silicon) | Metal (`F_NOCACHE` + `pread`) | ✅ | +| Windows ネイティブ | CUDA RTX 2060+ (`FILE_FLAG_NO_BUFFERING` + `ReadFile`) | ✅ | +| WSL2 (Windows) | CUDA RTX 2060+ (`posix_fadvise` + `pread`) | ✅ | +| Linux | CUDA RTX 2060+ (`posix_fadvise` + `pread`) | ✅ | + +--- + +## なぜ必要か + +コンシューマ向けハードウェア(MacBook Pro、RTX 3060 搭載 PC など)は高速な統合メモリや NVMe ストレージを搭載していますが、容量に限界があります。32 GB のマシンで 40 GB のモデルをナイーブに読み込もうとすると、OS がスワップを繰り返し OOM キラーが介入します。 + +Hypura はモデルアーキテクチャを理解することでこの問題を解決します: + +- **Norms・Embeddings** — 小さいが毎トークンアクセスされる → GPU に固定 +- **MoE エキスパートルーティング** — スパース性を利用。8 エキスパート中 2 つしか発火しない。ルーターインターセプションで選択されたエキスパートを識別し、GGUF ファイルから必要なストライドのみ NVMe ストリーミング(I/O 75% 削減)。ニューロンキャッシュが時間局所性を活かし 99.5% ヒット率を達成。共活性化追跡で投機的プリフェッチを実現 +- **Dense FFN ウェイト** — gate/up/down ウェイト(モデルサイズの約 60%)をプールバッファ経由で NVMe からストリーミング。アテンション・Norms は GPU 常駐 + +--- + +## 動作原理 + +Hypura は GGUF ファイルを読み込み、ハードウェアをプロファイリング(GPU 作業セット、RAM、NVMe 帯域幅)し、すべてのテンソルを最適な階層に割り当てる配置最適化を解きます。 + +**推論モードの自動選択:** + +| モード | 条件 | 説明 | +|---|---|---| +| **Full-resident** | モデルが GPU+RAM に収まる | NVMe I/O なし。フル GPU 速度 | +| **Expert-streaming** | MoE モデル(Mixtral 等)| 非エキスパートテンソル(~1 GB)のみ GPU。エキスパートは NVMe から on-demand ストリーミング | +| **Dense-FFN-streaming** | 大規模 Dense モデル(Llama 70B 等)| アテンション+Norms を GPU に(~8 GB)。FFN 
テンソルは NVMe からストリーミング | + +プールバッファサイズ・プリフェッチ深度・メモリバジェットはハードウェアプロファイルから自動計算されます。 + +--- + +## NVMe ストリーミング — Windows 対応 + +Windows でも macOS と同等の NVMe キャッシュバイパス読み出しが動作します。 + +| 機能 | macOS | Linux/WSL2 | Windows | +|---|---|---|---| +| キャッシュバイパスオープン | `F_NOCACHE` | `O_DIRECT` | `FILE_FLAG_NO_BUFFERING` | +| ランダムオフセット読み出し | `pread(2)` | `pread(2)` | `ReadFile` + `OVERLAPPED` | +| 匿名ページ割り当て | `mmap(MAP_ANON)` | `mmap(MAP_ANON)` | `VirtualAlloc` | +| ページ解放ヒント | `madvise(MADV_FREE)` | `madvise(MADV_DONTNEED)` | `VirtualFree(MEM_DECOMMIT)` | + +これらの違いは `src/io/compat.rs` の統一 API の背後に隠蔽されており、上位レイヤー(IoPool、NvmePrefetcher、iobench)はプラットフォームを意識しません。 + +--- + +## パフォーマンス + +**M1 Max、32 GB 統合メモリ、NVMe シーケンシャル読み取り ~5.1 GB/s での計測値** + +| モデル | サイズ | GPU | NVMe | モード | Hypura | llama.cpp | 備考 | +|---|---|---|---|---|---|---|---| +| Qwen 2.5 14B Q4_K_M | 8.4 GB | 8.4 GB | — | full-resident | **21 tok/s** | ~21 tok/s | GPU 収容、オーバーヘッドなし | +| Mixtral 8x7B Q5_K_M | 30.9 GB | 1.1 GB | 29.8 GB | expert-streaming | **2.2 tok/s** | **OOM** | 全層 Metal、キャッシュヒット率 99.5% | +| Llama 3.3 70B Q4_K_M | 39.6 GB | 7.8 GB | 31.8 GB | dense-FFN-streaming | **0.3 tok/s** | **OOM** | 全層 Metal、24 スロット動的プール、7 層プリフェッチ | + +--- + +## インストール + +Rust 1.75+ と CMake が必要です(vendored llama.cpp のビルドに使用)。 + +### macOS (Apple Silicon) + +```sh +git clone --recurse-submodules https://github.com/zapabob/hypura.git +cd hypura +cargo build --release +``` + +### WSL2 (Windows) + +CUDA ツールキット(12.x 推奨)がインストールされていることを確認してください。 + +```sh +# WSL2 ターミナル内で実行 +git clone --recurse-submodules https://github.com/zapabob/hypura.git +cd hypura +cargo build --release +``` + +RTX 3060 以上では CUDA アーキテクチャ `sm_86` がデフォルトターゲットです(20xx: sm_75、40xx: sm_89、H100: sm_90)。 + +### Windows ネイティブ + +```powershell +# PowerShell または Git Bash +git clone --recurse-submodules https://github.com/zapabob/hypura.git +cd hypura + +# CUDA_PATH 環境変数を設定(例: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4) +$env:CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v12.4" + +cargo build --release +``` + +バイナリは `target/release/hypura.exe` に生成されます。 + +--- + +## クイックスタート + +```sh +# ハードウェアプロファイリング(初回のみ、結果はキャッシュされる) +hypura profile + +# GGUF モデルで推論 +hypura run ./model.gguf --prompt "Hello, world" + +# インタラクティブチャット +hypura run ./model.gguf --interactive + +# ベンチマーク: Hypura vs ナイーブベースライン +hypura bench ./model.gguf + +# モデル配置プランの確認(ロードなし) +hypura inspect ./model.gguf + +# NVMe I/O マイクロベンチマーク +hypura iobench ./model.gguf --read-gb 1.0 + +# エキスパートレイアウト最適化(Mixtral 等の MoE モデル) +hypura optimize ./model.gguf +``` + +未テストモデルでは最初に `--max-tokens 10` から始めることを推奨します。 + +--- + +## Ollama 互換サーバー + +Hypura は Ollama 互換 HTTP API を公開しており、Ollama に対応したツール([OpenClaw](https://github.com/openclaw/openclaw) など)のドロップイン代替として機能します。 + +```sh +hypura serve ./model.gguf +# Hypura serving Mixtral 8x7B Instruct v0.1 +# Endpoint: http://127.0.0.1:8080 +# Ollama-compatible API: /api/generate, /api/chat, /api/tags +``` + +### エンドポイント + +| エンドポイント | 説明 | +|---|---| +| `GET /` | ヘルスチェック | +| `GET /api/tags` | 読み込み済みモデル一覧 | +| `GET /api/version` | サーバーバージョン | +| `POST /api/show` | モデルメタデータ | +| `POST /api/generate` | テキスト補完(ストリーミング NDJSON または単一レスポンス)| +| `POST /api/chat` | チャット補完(ストリーミング NDJSON または単一レスポンス)| + +### OpenClaw との連携 + +```sh +openclaw config set models.providers.ollama.baseUrl "http://127.0.0.1:8080" +``` + +--- + +## アーキテクチャ + +Hypura は2つのクレートからなる Cargo ワークスペースです。 + +- **`hypura`** — メインバイナリ+ライブラリ。CLI は `src/main.rs`、ロジックは `src/lib.rs` モジュール群 +- **`hypura-sys`** — llama.cpp の FFI バインディング(`vendor/llama.cpp/` に vendored、CMake でビルド) + +### 主要モジュール + +| モジュール | 目的 | +|---|---| +| `io/compat.rs` | プラットフォーム抽象化(macOS/Linux/Windows の I/O プリミティブ統一)| +| `scheduler/placement.rs` | LP + greedy テンソル配置最適化(GPU/RAM/NVMe 階層)| +| `compute/inference.rs` | 推論エンジン: `generate_blocking`、`generate_with_nvme_scheduling` | +| `compute/nvme_backend.rs` | カスタム GGML バッファ型、エキスパート/FFN ストリーミング、ニューロンキャッシュ | +| `server/routes.rs` | Ollama 互換 API の Axum HTTP ハンドラ | +| 
`profiler/` | ハードウェア検出(CPU/GPU/メモリ帯域幅/NVMe スループット)| +| `cli/bench.rs` | A/B ベンチマークハーネス | +| `model/tensor_role.rs` | テンソル分類(配置スコアリング用)| +| `cache/coactivation.rs` | エキスパート共活性化追跡(投機的プリフェッチ用)| +| `cache/neuron_cache.rs` | 読み込み済みエキスパートスライスの LRU キャッシュ | + +--- + +## FAQ + +### SSD が壊れませんか? + +いいえ。**Hypura は推論中に SSD への書き込みを一切行いません。** + +SSD の劣化は書き込みサイクル(NAND フラッシュの P/E サイクル)によって生じます。読み取りはフラッシュセルを劣化させません。Hypura の NVMe I/O パスは読み取り専用で、GGUF ファイルからテンソルウェイトを RAM/GPU メモリプールにストリーミングするだけです。SSD はコールドストレージとして使用されます。 + +唯一の書き込みは無視できる程度です: ベンチマーク結果 JSON(~KB)、共活性化統計(~KB)、`hypura optimize` コマンド(任意実行時の1回限り)。通常の推論では SSD 書き込みはゼロです。 + +### Windows でも NVMe ストリーミングは動きますか? + +はい。`FILE_FLAG_NO_BUFFERING` + `ReadFile(OVERLAPPED)` が macOS の `F_NOCACHE` + `pread` と同等の機能を提供します。詳細は `_docs/windows-wsl2-port.md` を参照してください。 + +--- + +## 安全上の注意 + +- `bench --baseline` はモデルが RAM − 4 GB ヘッドルームを超える場合にブロックされます。`--force` で上書き可能ですが自己責任で +- 未テストモデルでは必ず `--max-tokens 10` から始めてください +- テストモデルは `./test-models/` に置いてください(リポジトリには含めない) + +--- + +## ライセンス + +MIT + +--- + +## Ethics + +このリポジトリのコードは私が自分で書いたものではありません。このプロジェクトは LLM を使って私の指示に基づいてタスクを実行するという探求です。NVMe を活用した推論はメモリの一形態として(低速ではあるが)十分に有効であるにもかかわらず、未活用であるという直感から始まりました。 + +--- +--- + +## Overview + +Hypura is a storage-tier-aware LLM inference scheduler. +It places model tensors across GPU, RAM, and NVMe tiers based on access patterns, bandwidth costs, and hardware capabilities — enabling **models larger than physical memory to run without crashing**. 
+ +- Run a 31 GB Mixtral 8x7B on a 32 GB machine at **2.2 tok/s** +- Run a 40 GB Llama 3.3 70B on a 32 GB machine at **0.3 tok/s** +- Vanilla llama.cpp OOMs on both + +**Supported platforms:** + +| Platform | GPU backend | NVMe direct I/O | +|---|---|---| +| macOS (Apple Silicon) | Metal (`F_NOCACHE` + `pread`) | ✅ | +| Windows native | CUDA RTX 2060+ (`FILE_FLAG_NO_BUFFERING` + `ReadFile`) | ✅ | +| WSL2 (Windows) | CUDA RTX 2060+ (`posix_fadvise` + `pread`) | ✅ | +| Linux | CUDA RTX 2060+ (`posix_fadvise` + `pread`) | ✅ | + +--- ## Why does this matter? -Consumer hardware (MacBook Pro, Mac Studio) ships with fast unified memory -and NVMe storage, but limited capacity. A 32 GB M1 Max cannot naively load -a 40 GB model — the OS will swap-thrash until the OOM killer intervenes. +Consumer hardware ships with fast GPU/unified memory and NVMe storage, but limited capacity. A 32 GB machine can't naively load a 40 GB model — the OS will swap-thrash until the OOM killer intervenes. -Hypura solves this by understanding the model architecture: +Hypura solves this by understanding model architecture: - **Norms and embeddings** are tiny but accessed every token — pinned to GPU -- **MoE expert routing** exploits sparsity — only 2 of 8 experts fire per token. - Router interception identifies selected experts in the eval callback, then loads - only the needed expert strides from NVMe (75% I/O reduction). A neuron cache tracks - loaded expert slices across tokens, achieving 99.5% hit rate from temporal locality. - Co-activation tracking predicts which experts will fire next for speculative prefetch. -- **Dense FFN weights** (gate, up, down — ~60% of model size) stream from NVMe through - a dynamically-sized pool buffer while attention + norms stay GPU-resident. Prefetch - lookahead depth scales automatically with available memory. - -The result: models that would crash your machine under naive mmap become runnable. 
-Models that fit in memory run at full Metal GPU speed with zero overhead. +- **MoE expert routing** exploits sparsity — only 2 of 8 experts fire per token. Router interception identifies selected experts in the eval callback, then loads only the needed expert strides from NVMe (75% I/O reduction). A neuron cache tracks loaded expert slices, achieving 99.5% hit rate from temporal locality. Co-activation tracking predicts next experts for speculative prefetch. +- **Dense FFN weights** (gate, up, down — ~60% of model size) stream from NVMe through a dynamically-sized pool buffer while attention + norms stay GPU-resident. Prefetch lookahead scales with available memory. + +--- ## How it works -Hypura reads the GGUF file, profiles your hardware (GPU working set, RAM, NVMe bandwidth), -and solves a placement optimization that assigns every tensor to a tier: +Hypura reads the GGUF file, profiles your hardware (GPU working set, RAM, NVMe bandwidth), and solves a placement optimization assigning every tensor to a tier: -- **GPU (Metal)** — Attention layers, norms, embeddings. Fastest access, limited by `recommendedMaxWorkingSetSize`. -- **RAM** — Overflow layers that don't fit in the GPU working set. Accessed via mmap. -- **NVMe** — Remaining layers loaded on-demand via direct I/O (`F_NOCACHE` + `pread`), prefetched ahead of the forward pass. +- **GPU** — Attention, norms, embeddings. Fastest access, limited by GPU working set. +- **RAM** — Overflow layers that don't fit in the GPU working set. +- **NVMe** — Remaining layers loaded on-demand via direct I/O, prefetched ahead of the forward pass. -Hypura selects the best inference mode automatically based on model size, architecture, and available memory: +**Inference mode selection (automatic):** -- **Full-resident** — Model fits in GPU+RAM. No NVMe I/O. Full Metal speed. -- **Expert-streaming** — For MoE models (Mixtral). Only non-expert tensors (~1 GB) stay on GPU. 
Expert tensors stream from NVMe through a pool buffer on demand, with a neuron cache (99.5% hit rate) that eliminates most I/O after warmup. -- **Dense FFN-streaming** — For dense models too large for GPU (Llama 70B). Attention + norms stay on GPU (~8 GB). FFN tensors (~32 GB) stream from NVMe through a dynamically-sized pool buffer, with scaled prefetch lookahead. +| Mode | Condition | Description | +|---|---|---| +| **Full-resident** | Model fits in GPU+RAM | No NVMe I/O. Full GPU speed. | +| **Expert-streaming** | MoE models (Mixtral, etc.) | Only non-expert tensors (~1 GB) on GPU. Expert tensors stream from NVMe on demand. | +| **Dense-FFN-streaming** | Large dense models (Llama 70B, etc.) | Attention+norms on GPU (~8 GB). FFN tensors stream from NVMe. | -Pool buffer size, prefetch depth, and memory budgets are computed automatically from your hardware profile — no manual tuning needed. +Pool buffer sizes, prefetch depth, and memory budgets are computed automatically from your hardware profile. + +--- + +## NVMe Streaming — Windows Support + +Windows supports the same cache-bypass NVMe reads as macOS, implemented via `FILE_FLAG_NO_BUFFERING` + `ReadFile(OVERLAPPED)`. + +| Feature | macOS | Linux/WSL2 | Windows | +|---|---|---|---| +| Cache-bypass open | `F_NOCACHE` | `O_DIRECT` | `FILE_FLAG_NO_BUFFERING` | +| Positional read | `pread(2)` | `pread(2)` | `ReadFile` + `OVERLAPPED` | +| Anonymous pages | `mmap(MAP_ANON)` | `mmap(MAP_ANON)` | `VirtualAlloc` | +| Page release hint | `madvise(MADV_FREE)` | `madvise(MADV_DONTNEED)` | `VirtualFree(MEM_DECOMMIT)` | + +All platform differences are hidden behind the unified API in `src/io/compat.rs`. Upper layers (IoPool, NvmePrefetcher, iobench) are platform-agnostic. + +--- ## Performance -All benchmarks on **M1 Max, 32 GB unified memory, ~5.1 GB/s NVMe sequential read**. 
+**All benchmarks on M1 Max, 32 GB unified memory, ~5.1 GB/s NVMe sequential read.** | Model | Size | GPU | NVMe | Mode | Hypura | llama.cpp | Notes | |---|---|---|---|---|---|---|---| | Qwen 2.5 14B Q4_K_M | 8.4 GB | 8.4 GB | — | full-resident | **21 tok/s** | ~21 tok/s | Fits in GPU; no overhead | | Mixtral 8x7B Q5_K_M | 30.9 GB | 1.1 GB | 29.8 GB | expert-streaming | **2.2 tok/s** | **OOM** | All layers on Metal; 99.5% cache hit rate | -| Llama 3.3 70B Q4_K_M | 39.6 GB | 7.8 GB | 31.8 GB | dense-FFN-streaming | **0.3 tok/s** | **OOM** | All layers on Metal; dynamic 24-slot pool, 7-layer prefetch | +| Llama 3.3 70B Q4_K_M | 39.6 GB | 7.8 GB | 31.8 GB | dense-FFN-streaming | **0.3 tok/s** | **OOM** | All layers on Metal; 24-slot dynamic pool, 7-layer prefetch | -**Key takeaway:** For models that fit in memory, Hypura adds zero overhead. For models that don't fit, Hypura is the difference between "runs" and "crashes." Expert-streaming on Mixtral achieves usable interactive speeds by keeping only non-expert tensors on GPU and exploiting MoE sparsity (only 2/8 experts fire per token). Dense FFN-streaming extends this to non-MoE models like Llama 70B. Pool sizes and prefetch depth scale automatically with available memory. +--- ## Install -Hypura builds from source with Cargo. You'll need Rust 1.75+ and CMake (for the vendored llama.cpp). +Requires Rust 1.75+ and CMake (for vendored llama.cpp). + +### macOS (Apple Silicon) + +```sh +git clone --recurse-submodules https://github.com/zapabob/hypura.git +cd hypura +cargo build --release +``` + +### WSL2 (Windows) + +Ensure CUDA Toolkit (12.x recommended) is installed. ```sh -git clone --recurse-submodules https://github.com/hypura/hypura.git +git clone --recurse-submodules https://github.com/zapabob/hypura.git cd hypura cargo build --release ``` -The binary is at `target/release/hypura`. +Default CUDA target is `sm_86` (RTX 3060 / Ampere). 
Supported: `sm_75` (RTX 20xx), `sm_86` (RTX 30xx), `sm_89` (RTX 40xx), `sm_90` (H100). Override with `HYPURA_CUDA_ARCHITECTURES=75;86;89;90`. + +### Windows Native -> Homebrew tap coming soon. +```powershell +git clone --recurse-submodules https://github.com/zapabob/hypura.git +cd hypura + +# Set CUDA Toolkit path +$env:CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + +cargo build --release +``` + +Binary is at `target\release\hypura.exe`. + +--- ## Quick start ```sh -# Profile your hardware (runs once, cached) +# Profile your hardware (runs once, results cached) hypura profile # Run inference on a GGUF model @@ -96,13 +383,21 @@ hypura bench ./model.gguf # Inspect model placement plan without loading hypura inspect ./model.gguf + +# NVMe I/O microbenchmark +hypura iobench ./model.gguf --read-gb 1.0 + +# Expert layout optimization (for MoE models like Mixtral) +hypura optimize ./model.gguf ``` -Start with `--max-tokens 10` on untested models before scaling up. +Start with `--max-tokens 10` on untested models. + +--- ## Ollama-compatible server -Hypura exposes an Ollama-compatible HTTP API, making it a drop-in replacement for any tool that talks to Ollama — including [OpenClaw](https://github.com/openclaw/openclaw). +Hypura exposes an Ollama-compatible HTTP API — a drop-in replacement for any tool that talks to Ollama, including [OpenClaw](https://github.com/openclaw/openclaw). ```sh hypura serve ./model.gguf @@ -124,39 +419,11 @@ hypura serve ./model.gguf ### Usage with OpenClaw -Point OpenClaw at Hypura by setting the Ollama base URL in `~/.openclaw/openclaw.json`: - -```json -{ - "models": { - "providers": { - "ollama": { - "baseUrl": "http://127.0.0.1:8080", - "api": "ollama" - } - } - } -} -``` - -Or via the CLI: - ```sh openclaw config set models.providers.ollama.baseUrl "http://127.0.0.1:8080" ``` -Hypura speaks native Ollama protocol (`/api/chat` with NDJSON streaming), so no compatibility shims are needed. 
- -### Server options - -``` -hypura serve [OPTIONS] - -Options: - --host Host to bind to [default: 127.0.0.1] - --port Port to bind to [default: 8080] - --context Maximum context length [default: 4096] -``` +--- ## Architecture @@ -169,13 +436,18 @@ Hypura is a Cargo workspace with two crates: | Module | Purpose | |---|---| +| `io/compat.rs` | Platform abstraction (unifies macOS/Linux/Windows I/O primitives) | | `scheduler/placement.rs` | LP + greedy tensor placement across GPU/RAM/NVMe tiers | -| `compute/inference.rs` | Inference engine: `generate_blocking`, `generate_with_nvme_scheduling`, server-oriented `load_model` / `generate_from_loaded` | -| `compute/nvme_backend.rs` | Custom GGML buffer type, pool-based expert/FFN streaming, neuron cache, eval callback | +| `compute/inference.rs` | Inference engine: `generate_blocking`, `generate_with_nvme_scheduling` | +| `compute/nvme_backend.rs` | Custom GGML buffer type, pool-based expert/FFN streaming, neuron cache | | `server/routes.rs` | Axum HTTP handlers for Ollama-compatible API | | `profiler/` | Hardware detection (CPU, GPU, memory bandwidth, NVMe throughput) | | `cli/bench.rs` | A/B benchmark harness | -| `model/tensor_role.rs` | Tensor classification for placement scoring (norms, attention, MoE experts) | +| `model/tensor_role.rs` | Tensor classification for placement scoring | +| `cache/coactivation.rs` | Expert co-activation tracking for speculative prefetch | +| `cache/neuron_cache.rs` | LRU cache for loaded expert slices | + +--- ## FAQ @@ -183,20 +455,30 @@ Hypura is a Cargo workspace with two crates: No. **Hypura only reads from your SSD during inference — it never writes to it.** -SSD wear is caused by write cycles (program/erase cycles on NAND flash cells). Reads do not degrade flash cells. Hypura's entire NVMe I/O path uses read-only `pread()` calls with `F_NOCACHE` to stream tensor weights from the GGUF file into RAM/GPU memory pools, where all computation happens. 
The SSD is used as cold storage, not as working memory. +SSD wear is caused by write cycles. Reads do not degrade flash cells. Hypura's entire NVMe I/O path uses read-only calls (`pread` / `ReadFile`) with cache bypass to stream tensor weights from the GGUF file into RAM/GPU memory pools. The SSD is used as cold storage, not as working memory. + +The only writes Hypura performs are negligible: benchmark result JSON files (~KB), co-activation statistics (~KB), and the one-time `hypura optimize` command. Normal inference generates zero SSD writes. -The only writes Hypura performs are negligible: benchmark result JSON files (~KB), co-activation statistics (~KB to `~/.hypura/`), and the one-time `hypura optimize` command if you choose to run it. Normal inference generates zero SSD writes. +### Does NVMe streaming work on Windows? + +Yes. `FILE_FLAG_NO_BUFFERING` + `ReadFile(OVERLAPPED)` provides the same functionality as `F_NOCACHE` + `pread` on macOS. See `_docs/windows-wsl2-port.md` for details. + +--- ## Safety notes -- `bench --baseline` is blocked when the model exceeds RAM minus 4 GB headroom. Use `--force` to override at your own risk. +- `bench --baseline` is blocked when the model exceeds RAM minus 4 GB headroom. Use `--force` to override. - Always start with `--max-tokens 10` on untested models. - Test models belong in `./test-models/` (not checked in). +--- + ## License MIT +--- + ## Ethics -I feel morally obligated to say I did *not* write the code in this repository myself. This project is an exploration of using LLMs to carry out tasks based on my direction. The majority of prompts I used to get here were derived using the socratic method, genuine curiosity, and a hunch that NVMe supporting inference is underutilized despite being a (slow but) perfectly valid form of memory. \ No newline at end of file +I feel morally obligated to say I did *not* write the code in this repository myself. 
This project is an exploration of using LLMs to carry out tasks based on my direction. The majority of prompts I used to get here were derived using the socratic method, genuine curiosity, and a hunch that NVMe-supporting inference is underutilized despite being a (slow but) perfectly valid form of memory. diff --git a/_docs/2026-03-23_dunce-unc-path-fix_claude-sonnet-4-6.md b/_docs/2026-03-23_dunce-unc-path-fix_claude-sonnet-4-6.md new file mode 100644 index 0000000..40661ac --- /dev/null +++ b/_docs/2026-03-23_dunce-unc-path-fix_claude-sonnet-4-6.md @@ -0,0 +1,112 @@ +# 2026-03-23 dunce クレートによる Windows UNC パス問題修正 + +**実装AI:** Claude Sonnet 4.6 +**日付:** 2026-03-23 +**カテゴリ:** バグ修正・Windows ビルド + +--- + +## 症状 + +Windows ネイティブ環境で `cargo check` を実行すると `hypura-sys` のビルドが失敗: + +``` +error C1083: ソース ファイルを開けません。 +'\\?\C:\Users\...\vendor\llama.cpp\ggml\src\gguf.cpp': No such file or directory +``` + +ファイルは実際に存在しており、パスの `\\?\` プレフィックスが問題の原因。 + +--- + +## 根本原因 + +`hypura-sys/build.rs` の以下のコード: + +```rust +let llama_dir = PathBuf::from(&manifest_dir).join("../vendor/llama.cpp"); +let llama_dir = llama_dir.canonicalize().expect("..."); +``` + +**`std::fs::canonicalize()` は Windows で `\\?\C:\...` 形式の拡張長パス (Extended-Length Path) を返す。** + +これが CMake の生成する `*.vcxproj` ファイル内のソースパスとして埋め込まれると、MSBuild がそのパスを解釈できずにソースファイルが見つからないと判断する。 + +``` +通常パス: C:\Users\...\vendor\llama.cpp\ggml\src\gguf.cpp ← MSBuild OK +UNC パス: \\?\C:\Users\...\vendor\llama.cpp\ggml\src\gguf.cpp ← MSBuild NG +``` + +--- + +## 修正内容 + +### `hypura-sys/Cargo.toml` + +```toml +[build-dependencies] +cmake = "0.1" +cc = "1" +bindgen = "0.71" +dunce = "1" # 追加 +``` + +### `hypura-sys/build.rs` + +```rust +// Before: +let llama_dir = llama_dir.canonicalize().expect("..."); + +// After: +// dunce::canonicalize strips the \\?\ UNC prefix that std::fs::canonicalize +// adds on Windows, which would otherwise cause MSBuild to reject source paths. 
+let llama_dir = dunce::canonicalize(&llama_dir).expect("..."); +``` + +--- + +## `dunce` クレートについて + +[`dunce`](https://crates.io/crates/dunce) は Windows の `\\?\` UNC プレフィックスを通常のパスに変換するユーティリティ。 + +```rust +// dunce::canonicalize の動作 +// Windows: C:\foo\bar (\\?\ を除去) +// Unix: /foo/bar (std::fs::canonicalize と同じ) +``` + +macOS/Linux では `std::fs::canonicalize` と同一の動作をするため、クロスプラットフォーム安全。 + +--- + +## 影響範囲 + +`llama_dir` を起点に生成される全パス(`include`、`ggml/src`、`vendor/cpp-httplib` 等)が正しい形式になる。bindgen の `-I` フラグや `cc::Build` の `.include()` パスも同様に修正される。 + +--- + +## 関連する環境問題 + +### Avast アンチウイルスのブロック + +この修正とは別に、Avast の Real-Time Protection が新しくコンパイルされた `.exe`(cargo build scripts)の実行をブロックする問題が発生した。 + +**症状:** +``` +error: failed to run custom build command for `proc-macro2 v1.0.106` +アクセスが拒否されました。 (os error 5) +``` + +**解決策:** Avast の除外設定にプロジェクトの `target\` ディレクトリを追加: +``` +C:\Users\\Desktop\hypura-main\hypura-main\target\ +``` + +--- + +## 検証 + +`dunce` 修正後の `cargo check`: +- `highs-sys` (HiGHS LP ソルバ): ビルド成功 +- `hypura-sys` (llama.cpp + CUDA): CMake 設定成功、MSBuild 実行中 +- 全 Rust クレート (~220): チェック完了 diff --git a/_docs/2026-03-23_initial-commit-readme-bilingual_claude-sonnet-4-6.md b/_docs/2026-03-23_initial-commit-readme-bilingual_claude-sonnet-4-6.md new file mode 100644 index 0000000..89f637e --- /dev/null +++ b/_docs/2026-03-23_initial-commit-readme-bilingual_claude-sonnet-4-6.md @@ -0,0 +1,96 @@ +# 2026-03-23 初期コミット・README 日英併記リライト・_docs 作成 + +**実装AI:** Claude Sonnet 4.6 +**日付:** 2026-03-23 +**カテゴリ:** ドキュメント・Git 管理 + +--- + +## 背景・動機 + +Windows/WSL2 + CUDA 移植完了後、リポジトリを `zapabob/hypura` に公開するにあたり: +1. Git リポジトリを初期化してコミット可能な状態にする +2. README を macOS 限定から全プラットフォーム対応に更新し、日英併記で書き直す +3. 設計決定・実装詳細を `_docs/` に記録する +4. 型定義警告を 0 件にする + +--- + +## 実装内容 + +### 1. 
型警告 0 修正 + +**`src/compute/nvme_backend.rs`** +- `#[cfg(unix)] use std::os::unix::io::IntoRawFd;` を削除 +- `IntoRawFd` は `src/io/compat.rs` に移動済みで `nvme_backend.rs` では不使用 + +**`src/cli/iobench.rs`** +- `fn read_sequential(...)` (dead function) を削除 — 全テスト関数が直接 `read_full` を呼ぶため +- `test_mt_nocache` 内の `struct RawPtr(*mut u8)` インライン構造体ハックを削除 + - Before: `unsafe { struct RawPtr(*mut u8); ... RawPtr(my_buf) }.as_mut_ptr()` + - After: 直接 `my_buf` を使用 + +### 2. `.gitignore` 作成 + +``` +/target/ +vendor/llama.cpp/build/ +vendor/llama.cpp/build-*/ +benchmarks/results/*.json +test-models/ +*.gguf +.DS_Store +``` + +### 3. `.gitmodules` 作成 + +```ini +[submodule "vendor/llama.cpp"] + path = vendor/llama.cpp + url = https://github.com/ggerganov/llama.cpp.git + branch = master +``` + +### 4. Git 初期化・コミット + +```sh +git init +git remote add origin https://github.com/zapabob/hypura.git +git add ... # 79 ファイル +git commit -m "feat: initial commit — cross-platform LLM inference scheduler" +``` + +コミット結果: **79 ファイル、16,482 行** + +### 5. README.md 全面リライト (日英併記) + +**構造:** +``` +[ASCII art + bilingual tagline] +日本語セクション (概要 / なぜ必要か / 動作原理 / NVMe / パフォーマンス / インストール / ...) +--- +English section (Overview / Why / How / NVMe / Performance / Install / ...) +``` + +**主な更新内容:** +- "Apple Silicon のみ" → 4 プラットフォーム対応表(macOS Metal / Windows CUDA / WSL2 CUDA / Linux CUDA) +- Windows NVMe セクション新設: `FILE_FLAG_NO_BUFFERING` + `ReadFile(OVERLAPPED)` = macOS `F_NOCACHE` + `pread` +- プラットフォーム別インストール手順(macOS / WSL2 / Windows ネイティブ) +- CUDA アーキテクチャ説明(sm_86 = RTX 3060 ベース) +- `hypura iobench` コマンド追加 + +### 6. 
`_docs/` 実装ログ作成 + +| ファイル | 内容 | +|---|---| +| `_docs/README.md` | ディレクトリ索引(日英) | +| `_docs/implementation-log.md` | Phase 1〜9 の時系列実装ログ | +| `_docs/windows-wsl2-port.md` | Windows ポート詳細設計(compat API 仕様、CUDA 検出、NVMe フロー) | + +--- + +## 成果物 + +- `git log --oneline`: `de08889 feat: initial commit — cross-platform LLM inference scheduler` +- `git remote -v`: `origin https://github.com/zapabob/hypura.git` +- Rust コードレベル警告: 0 件(cargo check フィルタ確認済み) diff --git a/_docs/2026-03-23_libclang-windows-bindgen-fix_claude-sonnet-4-6.md b/_docs/2026-03-23_libclang-windows-bindgen-fix_claude-sonnet-4-6.md new file mode 100644 index 0000000..11ce1a1 --- /dev/null +++ b/_docs/2026-03-23_libclang-windows-bindgen-fix_claude-sonnet-4-6.md @@ -0,0 +1,120 @@ +# 2026-03-23 Windows 環境での libclang 未検出エラー修正 & pre-generated bindings サポート + +**実装AI:** Claude Sonnet 4.6 +**日付:** 2026-03-23 +**カテゴリ:** バグ修正・Windows ビルド + +--- + +## 症状 + +Windows ネイティブ環境で `cargo build` を実行すると `hypura-sys` のビルドが失敗: + +``` +thread 'main' panicked at ...bindgen... +Unable to find libclang: "couldn't find any valid shared libraries matching: +['clang.dll', 'libclang.dll'], searching paths: [...]" +note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace +warning: build failed, waiting for other jobs to finish... +``` + +--- + +## 根本原因 + +`hypura-sys/build.rs` の bindgen (v0.71.1) は `clang-sys` クレート経由で実行時に +`clang.dll` / `libclang.dll` を動的ロードする。 + +Windows に LLVM/Clang がインストールされておらず、かつ `.cargo/config.toml` に +`LIBCLANG_PATH` が設定されていなかったため、パニックしてビルドが中断していた。 + +--- + +## 修正内容 + +### 1. `.cargo/config.toml` に `LIBCLANG_PATH` を追加 + +**ファイル:** `.cargo/config.toml` + +```toml +[env] +LIBCLANG_PATH = "C:\\Program Files\\LLVM\\bin" +``` + +`winget install LLVM.LLVM` でインストールされる標準パスを指定。 +これにより `bindgen` が `libclang.dll` を自動検出できるようになる。 + +### 2. 
`hypura-sys/build.rs` に pre-generated bindings fallback を追加 + +**ファイル:** `hypura-sys/build.rs` — bindgen セクション (旧行 123〜171) + +bindgen を呼ぶ前に以下の優先順で既存の bindings.rs を探す仕組みを追加: + +1. 環境変数 `HYPURA_PREGENERATED_BINDINGS=/path/to/bindings.rs` が設定されている場合 +2. `hypura-sys/bindings.rs` がソースツリーに存在する場合 +3. 上記いずれも無ければ従来通り bindgen で生成 + +```rust +let pregenerated = env::var("HYPURA_PREGENERATED_BINDINGS") + .map(PathBuf::from) + .ok() + .or_else(|| { + let p = PathBuf::from(&manifest_dir).join("bindings.rs"); + if p.exists() { Some(p) } else { None } + }); + +if let Some(src) = pregenerated { + std::fs::copy(&src, out_path.join("bindings.rs")) + .expect("Failed to copy pre-generated bindings"); + println!("cargo:warning=Using pre-generated bindings from {}", src.display()); +} else { + // ... bindgen::Builder::default()...generate() ... +} +``` + +また、bindgen 失敗時のパニックメッセージを改善: + +``` +Failed to generate bindings — install LLVM and set LIBCLANG_PATH, +or provide HYPURA_PREGENERATED_BINDINGS=/path/to/bindings.rs +``` + +--- + +## 運用フロー (pre-generated bindings をコミットする場合) + +LLVM を一度インストールしてビルドが通った後、以下の手順で bindings.rs をコミットしておくと +LLVM なしの環境 (CI / 他開発者) でもビルド可能になる: + +```sh +# OUT_DIR を調べる +cargo build --message-format=json 2>/dev/null \ + | grep -o '"out_dir":"[^"]*"' | head -1 + +# bindings.rs をソースツリーにコピー +cp /bindings.rs hypura-sys/bindings.rs + +# コミット +git add hypura-sys/bindings.rs +git commit -m "feat(build): commit pre-generated bindings for LLVM-free builds" +``` + +--- + +## 影響範囲 + +| ファイル | 変更種別 | 内容 | +|----------|----------|------| +| `.cargo/config.toml` | 追加 | `[env] LIBCLANG_PATH` セクション | +| `hypura-sys/build.rs` | 変更 | pre-generated bindings fallback ロジック + エラーメッセージ改善 | + +macOS / Metal ビルドへの影響なし。CUDA ビルドへの影響なし。 +既存の bindgen による生成フローは変更なし (libclang が存在する場合は従来通り動作)。 + +--- + +## 前提条件 + +- LLVM インストール: `winget install LLVM.LLVM` +- インストール先: `C:\Program Files\LLVM\` (デフォルト) +- `libclang.dll` の場所: `C:\Program Files\LLVM\bin\libclang.dll` diff --git 
a/_docs/2026-03-23_windows-wsl2-cuda-port_claude-sonnet-4-6.md b/_docs/2026-03-23_windows-wsl2-cuda-port_claude-sonnet-4-6.md new file mode 100644 index 0000000..4484cf5 --- /dev/null +++ b/_docs/2026-03-23_windows-wsl2-cuda-port_claude-sonnet-4-6.md @@ -0,0 +1,176 @@ +# 2026-03-23 Windows/WSL2/CUDA クロスプラットフォーム移植 + +**実装AI:** Claude Sonnet 4.6 +**日付:** 2026-03-23 +**カテゴリ:** アーキテクチャ移植・プラットフォーム対応 + +--- + +## 背景・動機 + +Hypura はもともと macOS/Apple Silicon 専用として設計されており、以下の macOS 固有 API に強依存していた: + +- `F_NOCACHE` (キャッシュバイパス I/O) +- `pread(2)` (オフセット指定読み取り) +- `mmap(MAP_ANON|MAP_PRIVATE)` / `munmap` / `madvise(MADV_FREE)` (匿名メモリ管理) +- `sysctlbyname` (ハードウェア情報取得) +- `posix_memalign` (アライメント付きメモリ確保) +- Metal GPU API + +ユーザーが Windows (RTX 3060+) / WSL2 環境での動作を要望し、**macOS サポートを維持したまま**クロスプラットフォーム対応を実施した。 + +--- + +## 実装内容 + +### 1. `src/io/compat.rs` — プラットフォーム抽象化レイヤー (新規作成) + +全プラットフォーム固有 I/O を単一 API の背後に隠蔽する核心ファイル。 + +``` +上位レイヤー (IoPool / NvmePrefetcher / iobench) + ↓ +src/io/compat.rs [統一 API] + ↓ 条件コンパイル分岐 +macOS impl | Linux/WSL2 impl | Windows impl +``` + +**実装した API:** + +| 関数 | macOS | Linux/WSL2 | Windows | +|---|---|---|---| +| `open_direct_fd` | `F_NOCACHE` + `fcntl` | `O_DIRECT` | `FILE_FLAG_NO_BUFFERING` + `CreateFileW` | +| `read_at_fd` | `pread(2)` | `pread(2)` | `ReadFile` + `OVERLAPPED` | +| `alloc_pages` | `mmap(MAP_ANON)` | `mmap(MAP_ANON)` | `VirtualAlloc` | +| `free_pages` | `munmap` | `munmap` | `VirtualFree(MEM_RELEASE)` | +| `advise_free_pages` | `madvise(MADV_FREE)` | `madvise(MADV_DONTNEED)` | `VirtualFree(MEM_DECOMMIT)` | + +**型エイリアス:** +```rust +#[cfg(unix)] pub type NativeFd = i32; +#[cfg(windows)] pub type NativeFd = isize; // HANDLE +``` + +### 2. 
`src/io/aligned_buffer.rs` — クロスプラットフォーム書き換え + +`posix_memalign`(POSIX 専用)→ `std::alloc::Layout` + `std::alloc::alloc/dealloc`(全 OS 対応) + +```rust +pub struct AlignedBuffer { + ptr: *mut u8, + len: usize, + layout: Layout, +} +``` + +`FILE_FLAG_NO_BUFFERING` はバッファアドレス/サイズがセクターサイズ(4096 バイト)アライメントを要求するため、`AlignedBuffer::new(size, 4096)` がこれを保証する。 + +### 3. `hypura-sys/build.rs` — CMake ビルドの三分岐 + +``` +target_os == "macos" → Metal (GGML_METAL=ON) +非 macOS + CUDA 検出 → CUDA (GGML_CUDA=ON, sm_75;86;89;90) +非 macOS + CUDA なし → CPU のみ +``` + +**CUDA 検出順序:** +1. 環境変数 `CUDA_PATH` +2. `/usr/local/cuda` +3. `/usr/cuda` +4. PATH 内の `nvcc` の親ディレクトリ + +**CUDA アーキテクチャ:** +``` +sm_75 → RTX 20xx (Turing) +sm_86 → RTX 30xx (Ampere): RTX 3060, 3070, 3080, 3090 +sm_89 → RTX 40xx (Ada): RTX 4070, 4080, 4090 +sm_90 → H100 (Hopper) +``` +`HYPURA_CUDA_ARCHITECTURES=86` 環境変数で特定アーキテクチャのみビルド可能。 + +### 4. `hypura-sys/src/hypura_buft.c` — Windows メモリ対応 + +```c +#ifdef _WIN32 +static void *platform_alloc_pages(size_t size) { + return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); +} +static void platform_free_pages(void *addr, size_t size) { + (void)size; if (addr) VirtualFree(addr, 0, MEM_RELEASE); +} +#else +static void *platform_alloc_pages(size_t size) { + void *p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + return (p == MAP_FAILED) ? NULL : p; +} +#endif +``` + +### 5. `src/profiler/cpu.rs` — CPU 検出の三分岐 + +| OS | モデル名取得 | コア数取得 | +|---|---|---| +| macOS | `sysctlbyname("hw.model")` | `sysctlbyname("hw.physicalcpu")` | +| Linux | `/proc/cpuinfo` | `sysinfo` クレート | +| Windows | `sysinfo` クレート | `sysinfo` クレート | + +x86_64: `std::is_x86_feature_detected!("avx2")` / `"avx512f"` で SIMD 検出 + +### 6. 
`src/profiler/gpu.rs` — NVIDIA GPU スペック DB 追加 + +CUDA には Metal のような動的帯域幅クエリ API がないため、既知 GPU の理論値テーブルをハードコード: + +```rust +NvidiaSpec { pattern: "RTX 3080 Ti", bandwidth_gb_s: 912.0, fp16_tflops: 65.0 }, +NvidiaSpec { pattern: "RTX 3080 12GB", bandwidth_gb_s: 912.0, fp16_tflops: 60.0 }, +NvidiaSpec { pattern: "RTX 3080", bandwidth_gb_s: 760.0, fp16_tflops: 59.0 }, +NvidiaSpec { pattern: "RTX 3060", bandwidth_gb_s: 360.0, fp16_tflops: 25.4 }, +// ... RTX 20/30/40/50, A100, H100, L40S +``` + +### 7. `src/scheduler/placement.rs` — OS 別オーバーヘッド定数 + +```rust +#[cfg(target_os = "macos")] const OS_OVERHEAD: u64 = 2 * (1 << 30); // 2 GB +#[cfg(target_os = "windows")] const OS_OVERHEAD: u64 = 4 * (1 << 30); // 4 GB (VRAM ドライバ等) +#[cfg(all(not(...), not(...)))] const OS_OVERHEAD: u64 = 1 * (1 << 30); // 1 GB +#[cfg(target_os = "macos")] const GPU_RUNTIME_OVERHEAD: u64 = 1 << 30; // Metal: 1 GB +#[cfg(not(target_os = "macos"))] const GPU_RUNTIME_OVERHEAD: u64 = 512 * (1 << 20); // CUDA: 512 MB +``` + +### 8. `Cargo.toml` — Windows 条件付き依存関係 + +```toml +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.59", features = [ + "Win32_Foundation", "Win32_Storage_FileSystem", + "Win32_System_IO", "Win32_System_Memory", +] } +``` + +### 9. 
各モジュールの compat 移行 + +| ファイル | 変更内容 | +|---|---| +| `src/compute/nvme_backend.rs` | 全 libc I/O → `compat` モジュール、`NativeFd` 型使用 | +| `src/io/async_reader.rs` | `F_NOCACHE` + `pread` → `compat::open_direct_fd` + `read_at_fd` | +| `src/cli/iobench.rs` | 全テストバリアントを `compat` ベースに書き換え | +| `src/compute/inference.rs` | `sysctl` 呼び出し → `sysinfo` クレートベースに非 macOS 分岐 | +| `src/profiler/storage.rs` | Unix/Windows ストレージ計測を分岐 | +| `src/profiler/mod.rs` | `data_dir()`: Windows `%APPDATA%\Hypura` / Unix `~/.hypura` | + +--- + +## 動作確認 + +- CUDA Toolkit 検出: RTX 3060 (sm_86) で確認 +- `highs-sys` (HiGHS LP ソルバ): Windows MSVC でビルド成功 +- Rust コードレベル警告: 0 件 + +--- + +## 既知の制限事項 + +- `vendor/llama.cpp` サブモジュール初期化後に `git submodule update --init --recursive` が必要 +- AMD ROCm / Intel Arc 未対応(将来対応可能な設計) +- WSL2 側は `posix_fadvise(POSIX_FADV_DONTNEED)` パス(Linux コードパス) diff --git a/_docs/2026-03-24_windows-build-recovery{windows-cuda-port}.md b/_docs/2026-03-24_windows-build-recovery{windows-cuda-port}.md new file mode 100644 index 0000000..05a7114 --- /dev/null +++ b/_docs/2026-03-24_windows-build-recovery{windows-cuda-port}.md @@ -0,0 +1,92 @@ +# 2026-03-24 Windows Build Recovery Log + +## Context +- Repository: `C:\Users\downl\Desktop\hypura-main\hypura-main` +- Branch: `windows-cuda-port` +- Plan: Hypura Windows 復旧・検証計画 + +## Phase 1: Single-lane build environment +- Stopped/verified absence of build-related processes (`cargo`, `rustc`, `cmake`, `msbuild`, `sccache`, `hypura`). 
+- Switched to isolated run IDs and target directories: + - `run-20260324-0124` -> `target-codex-run-20260324-0124` + - `run-20260324-0215` -> `target-codex-run-20260324-0215` + - `run-20260324-0245` -> `target-codex-run-20260324-0245` + +## Phase 2: Repro build and debug evidence +- Reproduced prior failure once: + - `rustversion v1.0.22` build script execute failure + - `os error 5` (Access denied) +- Confirmed executable existence and direct execution capability: + - `target-codex-run-20260324-0215\release\build\rustversion-... \build-script-build.exe` + - Direct run executes and panics with `OUT_DIR not set` (expected when not run under Cargo), proving file is executable. +- Captured `build.rs` debug evidence in `debug-4ee339.log` for run IDs: + - `run-20260324-0215` + - `run-20260324-0245` +- Verified key runtime-related fields in debug log: + - `PROFILE=release` for release runs + - `CMAKE_BUILD_TYPE=Release` + - `CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDLL` + - `LLAMA_BUILD_TOOLS=OFF` + +## Phase 3: PR delta classification +- Shareable code deltas (candidate for PR): + - `hypura-sys/build.rs` + - `src/io/compat.rs` + - `src/profiler/cpu.rs` + - `hypura-sys/Cargo.toml`, `Cargo.lock` (if required by dependency/build changes) +- Local-only / non-PR artifacts: + - `_docs/*` handoff/restart notes + - `debug-4ee339.log` + - `target-codex-*` directories + - `.specstory/` +- Branch/remotes confirmed: + - branch: `windows-cuda-port` + - remotes: `origin`, `upstream` + +## Phase 4: Smoke test (`/`, `/api/tags`, `/api/generate`) +- Started `serve` using existing binary: + - `target-codex\release\hypura.exe serve --port 8080` +- Observed hardware profiling and model loading logs, including CUDA detection and tensor loading. +- Also tried reduced context: + - `--context 1024` +- Result: + - API bind confirmation (`Hypura serving ... Endpoint: http://127.0.0.1:8080`) was not reached. 
+ - `Invoke-WebRequest http://127.0.0.1:8080/` returned connection failure during observed runs. + - Process terminated during/after model load in both attempts. + +## Current blockers +- Intermittent artifact lock behavior still occurs on reused target dirs (`Blocking waiting for file lock on artifact directory`). +- `serve` process exits before API endpoint becomes reachable (requires next-step root-cause capture around post-load stage). + +## Suggested next actions +1. Force strictly fresh target dir per attempt and avoid reusing a locked `target-codex-run-*`. +2. Capture final crash reason for `serve` right after tensor-load completion (stdout/stderr redirection to dedicated file). +3. Once API bind line appears, run smoke in order: + - `/` + - `/api/tags` + - `/api/generate` + +## Follow-up execution (same day) +- Fresh target build attempt executed with: + - `run-20260324-022336` + - `target-codex-run-20260324-022336` + - build logs: `_docs/logs/build-run-20260324-022336.log` and `_docs/logs/build-run-20260324-022336.err.log` +- Serve was re-run with stdout/stderr redirection to dedicated files: + - `_docs/logs/serve-run-serve-024443.out.log` + - `_docs/logs/serve-run-serve-024443.err.log` +- Crash hypothesis update: + - No process crash observed on redirected run. + - `Hypura serving ... Endpoint: http://127.0.0.1:8080` was confirmed in output. + - Earlier "terminated during model load" behavior was not reproduced in this redirected run. + +## Final smoke result (`/` -> `/api/tags` -> `/api/generate`) +- Smoke run ID: `run-smoke-024730` +- Logs: + - `_docs/logs/serve-run-smoke-024730.out.log` + - `_docs/logs/serve-run-smoke-024730.err.log` +- Endpoint checks: + - `/` -> `200` with body `{"status":"ok"}` + - `/api/tags` -> `200` and loaded model listed + - `/api/generate` -> success response (`done: true`) +- Note: + - Generation content quality is not part of this smoke; API transport and completion path were confirmed. 
diff --git a/_docs/README.md b/_docs/README.md new file mode 100644 index 0000000..a911dbe --- /dev/null +++ b/_docs/README.md @@ -0,0 +1,34 @@ +# _docs — Hypura 実装ドキュメント / Implementation Documentation + +このディレクトリには Hypura の設計決定・実装詳細・移植記録が含まれています。 + +This directory contains design decisions, implementation details, and porting records for Hypura. + +--- + +## ファイル一覧 / Files + +### 設計ドキュメント + +| ファイル | 内容 | +|---|---| +| [implementation-log.md](./implementation-log.md) | 全実装の時系列ログ(スキャフォールドから Windows ポートまで)| +| [windows-wsl2-port.md](./windows-wsl2-port.md) | Windows/WSL2 + CUDA ポートの詳細設計ドキュメント | + +### 実装ログ (`yyyy-mm-dd_{内容}_{実装AI}.md`) + +| ファイル | 内容 | 実装AI | +|---|---|---| +| [2026-03-23_windows-wsl2-cuda-port_claude-sonnet-4-6.md](./2026-03-23_windows-wsl2-cuda-port_claude-sonnet-4-6.md) | Windows/WSL2/CUDA クロスプラットフォーム移植 | Claude Sonnet 4.6 | +| [2026-03-23_initial-commit-readme-bilingual_claude-sonnet-4-6.md](./2026-03-23_initial-commit-readme-bilingual_claude-sonnet-4-6.md) | 初期コミット・README 日英併記リライト・型警告 0 | Claude Sonnet 4.6 | +| [2026-03-23_dunce-unc-path-fix_claude-sonnet-4-6.md](./2026-03-23_dunce-unc-path-fix_claude-sonnet-4-6.md) | Windows UNC パス問題 (dunce) + Avast ブロック対処 | Claude Sonnet 4.6 | + +--- + +## 目的 / Purpose + +- 将来の開発者(および LLM)が設計決定の背景を理解できるようにする +- プラットフォーム固有の実装詳細を一箇所に集約する +- ポート作業のアーキテクチャ上の理由を記録する + +These docs exist so future developers (and LLMs) can understand the *why* behind design decisions, have a single place for platform-specific implementation details, and trace the architectural rationale of porting work. 
diff --git a/_docs/implementation-log.md b/_docs/implementation-log.md new file mode 100644 index 0000000..ff1070c --- /dev/null +++ b/_docs/implementation-log.md @@ -0,0 +1,170 @@ +# Hypura 実装ログ / Implementation Log + +時系列順の実装記録。各フェーズの設計決定・根拠・重要な詳細を含む。 + +--- + +## Phase 1: スキャフォールド・FFI + +**実装内容:** +- Cargo ワークスペース構成(`hypura` クレート + `hypura-sys` クレート) +- llama.cpp を `vendor/llama.cpp/` に git submodule として vendored +- `hypura-sys/build.rs` で CMake ビルド統合 +- `hypura-sys/src/lib.rs` で llama.cpp C API の Rust bindgen バインディング +- カスタム GGML バッファ型 `hypura_buft.c` のスキャフォールド + +**設計決定:** +- `hypura-sys` を別クレートに分離: bindgen の再生成と C/Rust 境界を明確に分離するため +- CMake + vendored llama.cpp: システムインストールの llama.cpp に依存しないようにするため + +--- + +## Phase 2: ハードウェアプロファイラ + +**実装内容: `src/profiler/`** + +- `cpu.rs`: CPU モデル名・コア数・SIMD 機能(AVX2/AVX-512)の検出 + - macOS: `sysctlbyname` で `hw.model`、`hw.physicalcpu`、`hw.perflevel0.logicalcpu` + - Linux: `/proc/cpuinfo` でモデル名、`sysinfo` クレートでコア数 + - Windows: `sysinfo` クレートで両方 + - x86_64: `std::is_x86_feature_detected!("avx2")` / `"avx512f"` + +- `gpu.rs`: GPU スペック検出 + - macOS: Metal device API で `recommendedMaxWorkingSetSize`、`maxTransferRate` + - CUDA (non-macOS): NVIDIA GPU スペック DB(RTX 20/30/40/50 + A/H シリーズ) + +- `storage.rs`: NVMe スループット計測(キャッシュバイパス読み取り) + - macOS: `F_NOCACHE` + `pread` + - Linux/WSL2: `posix_fadvise(DONTNEED)` + `pread` + - Windows: `std::io::Read + Seek`(後の Windows ポートで `FILE_FLAG_NO_BUFFERING` に更新予定) + +- `mod.rs`: プロファイル集約、`data_dir()` パス解決 + - macOS/Linux: `~/.hypura/` + - Windows: `%APPDATA%\Hypura\` + +**設計決定:** +- プロファイル結果は JSON でキャッシュ: 毎起動の計測オーバーヘッドを避けるため +- NVIDIA GPU スペック DB: CUDA には Metal のような動的な帯域幅クエリ API がないため + +--- + +## Phase 3: テンソル配置最適化 + +**実装内容: `src/scheduler/placement.rs`** + +- LP(線形計画)+ greedy フォールバックによるテンソル → 階層割り当て +- `good_lp` クレート + HiGHS ソルバを使用 +- テンソルスコアリング: `src/model/tensor_role.rs` でテンソルの役割(norm、attention、MoE expert、FFN等)を分類 +- プラットフォーム別定数: + ```rust + #[cfg(target_os = "macos")] const OS_OVERHEAD: u64 = 2 
* (1 << 30); // 2 GB + #[cfg(target_os = "windows")] const OS_OVERHEAD: u64 = 4 * (1 << 30); // 4 GB + #[cfg(all(not(target_os = "macos"), not(target_os = "windows")))] const OS_OVERHEAD: u64 = 1 * (1 << 30); + #[cfg(target_os = "macos")] const GPU_RUNTIME_OVERHEAD: u64 = 1 << 30; // Metal: 1 GB + #[cfg(not(target_os = "macos"))] const GPU_RUNTIME_OVERHEAD: u64 = 512 * (1 << 20); // CUDA: 512 MB + ``` + +**設計決定:** +- Windows の OS_OVERHEAD を 4 GB に設定: VRAM オーバーレイドライバや DirectX ランタイムが macOS/Linux より多くのシステムメモリを消費するため + +--- + +## Phase 4: NVMe バックエンド + +**実装内容: `src/compute/nvme_backend.rs`** + +- `IoPool`: ワーカースレッド群、各ワーカーが専用のダイレクト I/O ファイルディスクリプタを保持 +- バリアベースの完了同期 +- リージョンをワーカー間で分割して並列読み取り +- カスタム GGML バッファ型: llama.cpp のバッファアロケーションにフック、テンソルローディングをインターセプト +- `hypura_buft.c`: `platform_alloc_pages` / `platform_free_pages` でプラットフォーム抽象化 + +**設計決定:** +- ワーカーごとに個別の fd: macOS の `F_NOCACHE` は fd レベルの属性であり、スレッド間で共有不可のため +- ダイレクト I/O: OS ページキャッシュをバイパスすることで、ページキャッシュが LLM ウェイトで汚染されるのを防ぐ + +--- + +## Phase 5: キャッシュ層 + +**実装内容:** + +- `src/cache/coactivation.rs`: エキスパート共活性化追跡 + - 同一層・クロスレイヤー共活性化行列 + - `~/.hypura/coactivation/` への永続化 + - 投機的プリフェッチへの統合 + +- `src/cache/kv_cache.rs`: ウィンドウ付き KV キャッシュ圧縮 + - GPU バジェットが逼迫時の自動 Q8 選択 + - `llama_memory_seq_rm` によるウィンドウ圧縮 + +- `src/cache/neuron_cache.rs`: 読み込み済みエキスパートスライスの LRU キャッシュ + - Mixtral での 99.5% ヒット率を達成 + +--- + +## Phase 6: 推論エンジン + +**実装内容: `src/compute/inference.rs`** + +- `generate_blocking`: ベースライン推論(mmap またはフル GPU 常駐) +- `generate_with_nvme_scheduling`: 階層推論(NVMe ストリーミング + GPU 常駐) +- サーバー向け: `load_model` / `generate_from_loaded`(モデルを常駐させて複数リクエストを処理) + +**プラットフォーム対応:** +```rust +fn total_physical_memory() -> u64 { + #[cfg(target_os = "macos")] + { /* hw.memsize sysctl */ } + #[cfg(not(target_os = "macos"))] + { sysinfo::System::new_all().total_memory() } +} +``` + +--- + +## Phase 7: Ollama 互換サーバー + +**実装内容: `src/server/routes.rs`, `src/cli/serve.rs`** + +- Axum HTTP フレームワーク +- `POST /api/generate`, `POST /api/chat` でストリーミング 
NDJSON +- `GET /api/tags`, `GET /api/version`, `POST /api/show` +- CORS 対応(tower-http) + +--- + +## Phase 8: `hypura optimize` — TSP エキスパートレイアウト最適化 + +**実装内容: `src/cli/optimize.rs`** + +- Greedy TSP でエキスパートテンソルを共活性化順に並べ替え +- サイドカー `.permutations.json` で元ファイルを変更せずにレイアウトを記録 +- 共活性化行列が蓄積された後に実行することで効果を最大化 + +--- + +## Phase 9: Windows/WSL2/CUDA ポート (2026-03-23) + +**背景:** Hypura は当初 macOS/Apple Silicon 専用として設計されていた。Windows ネイティブ + WSL2 + CUDA(RTX 3060 以上)への対応を実施。 + +**詳細:** `windows-wsl2-port.md` を参照。 + +**主な変更ファイル:** + +| ファイル | 変更内容 | +|---|---| +| `src/io/compat.rs` | 新規作成: プラットフォーム抽象化 API | +| `src/io/mod.rs` | `pub mod compat;` 追加 | +| `src/io/aligned_buffer.rs` | `std::alloc::Layout` ベースに書き換え(`posix_memalign` 廃止)| +| `hypura-sys/build.rs` | CUDA/Metal/CPU 対応の三分岐ビルドロジック | +| `hypura-sys/src/hypura_buft.c` | `#ifdef _WIN32` による `VirtualAlloc/VirtualFree` 対応 | +| `src/profiler/cpu.rs` | macOS sysctl / Linux procfs / Windows sysinfo の三分岐 | +| `src/profiler/gpu.rs` | NVIDIA GPU スペック DB 追加(RTX 20/30/40/50 + A/H シリーズ)| +| `src/profiler/storage.rs` | Windows `std::io::Read+Seek` フォールバック追加 | +| `src/compute/nvme_backend.rs` | 全 libc I/O を `compat` モジュール経由に置換 | +| `src/compute/inference.rs` | sysctl 呼び出しを非 macOS フォールバックに置換 | +| `src/io/async_reader.rs` | `compat` モジュール使用に書き換え | +| `src/cli/iobench.rs` | `compat` モジュール使用に書き換え | +| `src/scheduler/placement.rs` | OS_OVERHEAD / GPU_RUNTIME_OVERHEAD を OS 別に分岐 | +| `Cargo.toml` | `windows-sys` 条件付き依存関係を追加 | diff --git a/_docs/windows-wsl2-port.md b/_docs/windows-wsl2-port.md new file mode 100644 index 0000000..08832a7 --- /dev/null +++ b/_docs/windows-wsl2-port.md @@ -0,0 +1,233 @@ +# Windows/WSL2 + CUDA ポート 詳細設計ドキュメント + +**実施日:** 2026-03-23 +**担当:** LLM-directed development (Claude Sonnet 4.6) + +--- + +## 動機 + +Hypura は当初 macOS/Apple Silicon 専用として設計されていた。具体的には以下の macOS 固有 API に強く依存していた: + +- `F_NOCACHE` フラグ(キャッシュバイパス I/O) +- `pread(2)` システムコール(オフセット指定読み取り) +- `mmap(MAP_ANON|MAP_PRIVATE)` / `munmap`(匿名メモリ) +- 
`madvise(MADV_FREE)`(ページ解放ヒント) +- `sysctlbyname`(ハードウェア情報取得) +- `posix_memalign`(アライメント付きメモリ確保) +- Metal GPU API + +目標: **Windows ネイティブおよび WSL2 上での CUDA (RTX 3060 以上) での動作**を、既存の macOS サポートを破壊することなく実現する。 + +--- + +## 設計方針 + +### 1. 単一の抽象化レイヤー + +すべてのプラットフォーム固有 I/O を `src/io/compat.rs` の単一 API の背後に隠蔽する。上位レイヤー(IoPool、NvmePrefetcher、iobench)はこの API のみを使用する。 + +``` +IoPool / NvmePrefetcher / iobench + ↓ uses +src/io/compat.rs [NativeFd, open_direct_fd, read_at_fd, alloc_pages, ...] + ↓ branches to +macOS impl | Linux/WSL2 impl | Windows impl +``` + +### 2. 条件コンパイルの粒度 + +- `#[cfg(target_os = "macos")]` / `#[cfg(unix)]` / `#[cfg(windows)]` を `compat.rs` 内に集約 +- 上位レイヤーでの `#[cfg]` ブロックは最小限に抑える(`placement.rs` のオーバーヘッド定数のみ) + +### 3. 型安全な fd ラッパー + +```rust +#[cfg(unix)] +pub type NativeFd = i32; + +#[cfg(windows)] +pub type NativeFd = isize; // HANDLE +``` + +`-1` (Unix の無効 fd) と `INVALID_HANDLE_VALUE` (Windows の無効ハンドル) を型レベルで区別。 + +--- + +## プラットフォーム抽象化 API (`src/io/compat.rs`) + +### `open_direct_fd(path: &Path) -> std::io::Result` + +キャッシュバイパスモードでファイルを開く。 + +| OS | 実装 | +|---|---| +| macOS | `open(O_RDONLY)` + `fcntl(F_NOCACHE, 1)` | +| Linux/WSL2 | `open(O_RDONLY \| O_DIRECT)` | +| Windows | `CreateFileW(FILE_FLAG_NO_BUFFERING \| FILE_FLAG_OVERLAPPED)` | + +### `close_fd(fd: NativeFd)` + +ファイルディスクリプタ/ハンドルを閉じる。 + +| OS | 実装 | +|---|---| +| Unix | `libc::close(fd)` | +| Windows | `CloseHandle(fd as HANDLE)` | + +### `read_at_fd(fd: NativeFd, dst: *mut u8, size: usize, offset: u64) -> isize` + +指定オフセットから `size` バイト読み取る(非シーク型、スレッドセーフ)。 + +| OS | 実装 | +|---|---| +| Unix | `libc::pread(fd, dst, size, offset)` | +| Windows | `ReadFile` + `OVERLAPPED { Offset, OffsetHigh }` | + +**Windows の注意点:** `FILE_FLAG_NO_BUFFERING` を使用する場合、バッファのアドレスとサイズはセクターサイズ(通常 512 バイトまたは 4096 バイト)のアライメントが必要。`AlignedBuffer` が 4096 バイトアライメントを保証する。 + +### `alloc_pages(size: usize) -> *mut u8` + +匿名メモリページを確保する(NVMe バッファ用)。 + +| OS | 実装 | +|---|---| +| macOS/Linux | `mmap(NULL, size, PROT_READ\|PROT_WRITE, 
MAP_ANON\|MAP_PRIVATE, -1, 0)` | +| Windows | `VirtualAlloc(NULL, size, MEM_COMMIT \| MEM_RESERVE, PAGE_READWRITE)` | + +### `free_pages(ptr: *mut u8, size: usize)` + +| OS | 実装 | +|---|---| +| Unix | `munmap(ptr, size)` | +| Windows | `VirtualFree(ptr, 0, MEM_RELEASE)` | + +### `advise_free_pages(ptr: *mut u8, size: usize)` + +ページを OS に返却するヒントを与える(レイヤー解放後に呼び出す)。 + +| OS | 実装 | +|---|---| +| macOS | `madvise(ptr, size, MADV_FREE)` | +| Linux/WSL2 | `madvise(ptr, size, MADV_DONTNEED)` | +| Windows | `VirtualFree(ptr, size, MEM_DECOMMIT)` | + +**重要:** `MEM_DECOMMIT` は物理ページをコミット解除するが仮想アドレス範囲は保持する。次回アクセス時にページフォルトが発生し、OS がゼロページを割り当てる。これが `MADV_FREE`/`MADV_DONTNEED` のセマンティクスに最も近い Windows の等価物。 + +--- + +## AlignedBuffer の書き換え + +**変更前:** `posix_memalign`(POSIX 専用) +**変更後:** `std::alloc::Layout` + `std::alloc::alloc` / `dealloc`(クロスプラットフォーム) + +```rust +pub struct AlignedBuffer { + ptr: *mut u8, + len: usize, + layout: Layout, +} + +impl AlignedBuffer { + pub fn new(len: usize, alignment: usize) -> std::io::Result { + let layout = Layout::from_size_align(len, alignment) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?; + let ptr = unsafe { alloc(layout) }; + if ptr.is_null() { + return Err(std::io::Error::new(std::io::ErrorKind::OutOfMemory, "alloc failed")); + } + Ok(Self { ptr, len, layout }) + } +} +``` + +--- + +## CUDA ビルド設定 (`hypura-sys/build.rs`) + +### 検出フロー + +``` +target_os == "macos" + → Metal ビルド (GGML_METAL=ON, framework Metal/Foundation/QuartzCore) +else + → CUDA 検出: + 1. env CUDA_PATH + 2. /usr/local/cuda (Linux/WSL2 デフォルト) + 3. /usr/cuda + 4. 
PATH 内の nvcc の親ディレクトリ + → CUDA 利用可能 + → GGML_CUDA=ON, CMAKE_CUDA_ARCHITECTURES="75;86;89;90" + → CUDA 不可 + → CPU のみ (GGML_CUDA=OFF, GGML_METAL=OFF) +``` + +### CUDA アーキテクチャ + +| GPU シリーズ | sm_ | 代表例 | +|---|---|---| +| RTX 20xx (Turing) | sm_75 | RTX 2060, 2070, 2080 | +| RTX 30xx (Ampere) | sm_86 | RTX 3060, 3070, 3080, 3090 | +| RTX 40xx (Ada) | sm_89 | RTX 4070, 4080, 4090 | +| H100 (Hopper) | sm_90 | H100 SXM/PCIe | +| A100 (Ampere) | sm_80 | A100 | +| L40S (Ada) | sm_89 | L40S | + +デフォルト: `"75;86;89;90"`(RTX 2060 以上のすべてをカバー) + +カスタマイズ: `HYPURA_CUDA_ARCHITECTURES=86` 環境変数で特定のアーキテクチャのみビルド(ビルド時間短縮) + +--- + +## NVIDIA GPU スペック DB (`src/profiler/gpu.rs`) + +CUDA には Metal のような動的な GPU 帯域幅クエリ API がない。そのため、既知の GPU モデルの理論値テーブルをハードコードしている。 + +`lookup_nvidia_gpu(name: &str) -> Option<(bandwidth_bytes_per_sec, fp16_tflops)>` + +名前マッチングは部分文字列検索(`contains`)で行う。未知の GPU は `estimate_nvidia_gpu(vram_bytes)` で VRAM 容量から推定値を返す。 + +**RTX 3060 の仕様:** +- メモリ帯域幅: 360 GB/s +- FP16 演算性能: 25.4 TFLOPS + +--- + +## Windows NVMe ストリーミング 動作フロー + +``` +1. IoPool::new(model_path, num_workers) + → compat::open_direct_fd(model_path) + → Windows: CreateFileW(..., FILE_FLAG_NO_BUFFERING | FILE_FLAG_OVERLAPPED, ...) + → 各ワーカースレッドが専用 HANDLE を保持 + +2. IoPool::read_region(file_offset, size) + → リージョンを num_workers に分割 + → 各ワーカーが担当領域を pread_region() で読み取り + → compat::read_at_fd(handle, dst, size, offset) + → Windows: + OVERLAPPED ol = { .Offset = offset & 0xFFFFFFFF, + .OffsetHigh = offset >> 32 }; + ReadFile(handle, dst, size, &bytes_read, &ol); + GetOverlappedResult(handle, &ol, &bytes_read, TRUE); // 同期待機 + +3. バリアで全ワーカーの完了を待機 + +4. 推論完了後のレイヤー解放: + → compat::advise_free_pages(ptr, size) + → Windows: VirtualFree(ptr, size, MEM_DECOMMIT) + → 物理ページを OS に返却、仮想アドレスは保持 + → 次回アクセス時にゼロページが割り当てられ再読み込み +``` + +--- + +## 既知の制限事項 + +1. **`cargo test --lib` の一部テストが GGUF ファイルを必要とする** — テストモデルは `test-models/` に配置する必要がある(リポジトリには含まれない) + +2. 
**WSL2 での `vendor/llama.cpp` サブモジュール** — 初回ビルド前に `git submodule update --init --recursive` が必要 + +3. **Windows ネイティブでの `FILE_FLAG_NO_BUFFERING` のアライメント要件** — 読み取りサイズとバッファアドレスはセクターサイズの倍数である必要がある。`AlignedBuffer::new(size, 4096)` がこれを保証している。ただし、リードサイズがセクターサイズに満たない場合(最後のチャンク等)は内部でパディングが発生する + +4. **CUDA のみ対応** — AMD ROCm / Intel Arc は未対応(将来対応可能な設計にはなっている) diff --git a/hypura-sys/Cargo.toml b/hypura-sys/Cargo.toml index 333c557..e353302 100644 --- a/hypura-sys/Cargo.toml +++ b/hypura-sys/Cargo.toml @@ -9,3 +9,5 @@ links = "llama" cmake = "0.1" cc = "1" bindgen = "0.71" +dunce = "1" +serde_json = "1" diff --git a/hypura-sys/build.rs b/hypura-sys/build.rs index 6ddc4e0..9555a00 100644 --- a/hypura-sys/build.rs +++ b/hypura-sys/build.rs @@ -1,96 +1,286 @@ use std::env; +use std::fs::OpenOptions; +use std::io::Write; use std::path::PathBuf; fn main() { + // #region agent log + debug_log( + "h1", + "build_main_entry", + serde_json::json!({ + "manifest_dir": env::var("CARGO_MANIFEST_DIR").unwrap_or_default(), + "profile": env::var("PROFILE").unwrap_or_default(), + "target_os": env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(), + "target_env": env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default(), + "opt_level": env::var("OPT_LEVEL").unwrap_or_default(), + "cmake_generator": env::var("CMAKE_GENERATOR").unwrap_or_default(), + }), + ); + // #endregion + let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); let llama_dir = PathBuf::from(&manifest_dir).join("../vendor/llama.cpp"); - let llama_dir = llama_dir.canonicalize().expect( + // dunce::canonicalize strips the \\?\ UNC prefix that std::fs::canonicalize + // adds on Windows, which would otherwise cause MSBuild to reject source paths. 
+ let llama_dir = dunce::canonicalize(&llama_dir).expect( "vendor/llama.cpp not found — run: git submodule update --init --recursive", ); - // Build llama.cpp via cmake - let dst = cmake::Config::new(&llama_dir) + let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + let use_metal = target_os == "macos"; + let use_cuda = !use_metal && cuda_is_available(); + + // ── Build llama.cpp via cmake ──────────────────────────────────────────── + let mut cmake_config = cmake::Config::new(&llama_dir); + cmake_config.profile("Release"); + cmake_config .define("BUILD_SHARED_LIBS", "OFF") + .define("CMAKE_BUILD_TYPE", "Release") .define("LLAMA_BUILD_TESTS", "OFF") .define("LLAMA_BUILD_EXAMPLES", "OFF") + .define("LLAMA_BUILD_TOOLS", "OFF") .define("LLAMA_BUILD_SERVER", "OFF") - .define("GGML_METAL", "ON") - .define("GGML_METAL_EMBED_LIBRARY", "ON") .define("GGML_CPU", "ON") - .define("GGML_BLAS", "OFF") - .define("GGML_OPENMP", "OFF") - .build(); + .define("GGML_BLAS", "OFF"); + if target_os == "windows" { + // Force non-Debug CRT to avoid __imp__CrtDbgReport unresolved symbols. + cmake_config.define("CMAKE_MSVC_RUNTIME_LIBRARY", "MultiThreadedDLL"); + } + // #region agent log + debug_log( + "h2", + "cmake_base_defines", + serde_json::json!({ + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_MSVC_RUNTIME_LIBRARY": if target_os == "windows" { "MultiThreadedDLL" } else { "n/a" }, + "LLAMA_BUILD_TOOLS": "OFF", + "GGML_CPU": "ON", + }), + ); + // #endregion + + if use_metal { + // macOS / Apple Silicon — Metal GPU + cmake_config + .define("GGML_METAL", "ON") + .define("GGML_METAL_EMBED_LIBRARY", "ON") + .define("GGML_CUDA", "OFF") + .define("GGML_OPENMP", "OFF"); + } else if use_cuda { + // Windows / WSL2 / Linux — NVIDIA CUDA + // Target RTX 20xx (sm_75) and up through RTX 50xx / H100 (sm_120). + // "native" detects only the current machine's GPU; a fixed list enables + // building a binary that runs on multiple NVIDIA generations. 
+ let cuda_arches = env::var("HYPURA_CUDA_ARCHITECTURES") + .unwrap_or_else(|_| "75;86;89;90".to_string()); + + cmake_config + .define("GGML_METAL", "OFF") + .define("GGML_CUDA", "ON") + .define("GGML_OPENMP", "ON") + .define("CMAKE_CUDA_ARCHITECTURES", cuda_arches); + + if let Some(nvcc) = find_nvcc() { + cmake_config.define("CMAKE_CUDA_COMPILER", nvcc.display().to_string()); + } + if let Some(cuda_root) = get_cuda_root() { + cmake_config.define("CUDAToolkit_ROOT", cuda_root.display().to_string()); + } + // #region agent log + debug_log( + "h3", + "cuda_path_resolution", + serde_json::json!({ + "cuda_root": get_cuda_root().map(|p| p.display().to_string()), + "nvcc": find_nvcc().map(|p| p.display().to_string()), + "arches": env::var("HYPURA_CUDA_ARCHITECTURES").ok(), + }), + ); + // #endregion + } else { + // CPU-only fallback + cmake_config + .define("GGML_METAL", "OFF") + .define("GGML_CUDA", "OFF") + .define("GGML_OPENMP", "ON"); + } + + // #region agent log + debug_log( + "h2", + "cmake_profile_selected", + serde_json::json!({ + "profile_api": "Release", + "note": "CMAKE_BUILD_TYPE is ignored by multi-config generators on Windows, profile() drives --config", + "cargo_profile_env": env::var("PROFILE").unwrap_or_default(), + "cargo_encoded_rustflags": env::var("CARGO_ENCODED_RUSTFLAGS").unwrap_or_default(), + }), + ); + // #endregion + let dst = cmake_config.build(); let lib_dir = dst.join("lib"); + // #region agent log + debug_log( + "h4", + "cmake_build_output", + serde_json::json!({ + "dst": dst.display().to_string(), + "lib_dir_exists": lib_dir.exists(), + }), + ); + // #endregion + // #region agent log + let cache_path = dst.join("build").join("CMakeCache.txt"); + let cache_snippet = std::fs::read_to_string(&cache_path) + .ok() + .map(|s| { + s.lines() + .filter(|l| { + l.starts_with("CMAKE_BUILD_TYPE:") + || l.starts_with("CMAKE_CONFIGURATION_TYPES:") + || l.starts_with("CMAKE_MSVC_RUNTIME_LIBRARY:") + }) + .map(|l| l.to_string()) + .take(8) + .collect::>() 
+ }) + .unwrap_or_default(); + debug_log( + "h6", + "cmake_cache_runtime_settings", + serde_json::json!({ + "cache_path": cache_path.display().to_string(), + "cache_lines": cache_snippet, + }), + ); + // #endregion - // Link the static libraries produced by cmake + // ── Link the static libraries ──────────────────────────────────────────── println!("cargo:rustc-link-search=native={}", lib_dir.display()); println!("cargo:rustc-link-lib=static=llama"); println!("cargo:rustc-link-lib=static=ggml"); println!("cargo:rustc-link-lib=static=ggml-base"); println!("cargo:rustc-link-lib=static=ggml-cpu"); - println!("cargo:rustc-link-lib=static=ggml-metal"); - // Link macOS frameworks - println!("cargo:rustc-link-lib=framework=Metal"); - println!("cargo:rustc-link-lib=framework=Foundation"); - println!("cargo:rustc-link-lib=framework=MetalKit"); - println!("cargo:rustc-link-lib=framework=Accelerate"); + if use_metal { + println!("cargo:rustc-link-lib=static=ggml-metal"); + println!("cargo:rustc-link-lib=framework=Metal"); + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=MetalKit"); + println!("cargo:rustc-link-lib=framework=Accelerate"); + println!("cargo:rustc-link-lib=c++"); + } else if use_cuda { + println!("cargo:rustc-link-lib=static=ggml-cuda"); + if let Some(lib_path) = get_cuda_lib_path() { + println!("cargo:rustc-link-search=native={}", lib_path.display()); + } + println!("cargo:rustc-link-lib=cuda"); + println!("cargo:rustc-link-lib=cublas"); + println!("cargo:rustc-link-lib=cudart"); + // #region agent log + debug_log( + "h5", + "rust_link_cuda_libs", + serde_json::json!({ + "link_libs": ["llama","ggml","ggml-base","ggml-cpu","ggml-cuda","cuda","cublas","cudart"], + "cuda_lib_path": get_cuda_lib_path().map(|p| p.display().to_string()), + }), + ); + // #endregion + if target_os == "linux" { + println!("cargo:rustc-link-lib=stdc++"); + } + // Windows: MSVC links its C++ runtime automatically. 
+ } else if target_os == "linux" { + println!("cargo:rustc-link-lib=stdc++"); + } - // C++ standard library - println!("cargo:rustc-link-lib=c++"); + // Propagate feature flags to Rust code via cfg() + if use_metal { + println!("cargo:rustc-cfg=hypura_metal"); + } else if use_cuda { + println!("cargo:rustc-cfg=hypura_cuda"); + } - // Compile our custom buffer type C shim + // ── Compile the custom GGML buffer type C shim ─────────────────────────── let src_dir = PathBuf::from(&manifest_dir).join("src"); let include_ggml_internal = llama_dir.join("ggml/src"); - cc::Build::new() + + let mut cc_build = cc::Build::new(); + cc_build .file(src_dir.join("hypura_buft.c")) - .include(&llama_dir.join("include")) - .include(&llama_dir.join("ggml/include")) + .include(llama_dir.join("include")) + .include(llama_dir.join("ggml/include")) .include(&include_ggml_internal) - .include(&src_dir) - .flag("-std=c11") - .compile("hypura_buft"); + .include(&src_dir); + + // MSVC doesn't accept -std=c11; GCC/Clang do. 
+ if target_os != "windows" { + cc_build.flag("-std=c11"); + } + cc_build.compile("hypura_buft"); + println!("cargo:rerun-if-changed=src/hypura_buft.c"); println!("cargo:rerun-if-changed=src/hypura_buft.h"); - // Generate Rust bindings via bindgen - let include_llama = llama_dir.join("include"); - let include_ggml = llama_dir.join("ggml/include"); - - let bindings = bindgen::Builder::default() - .header( - PathBuf::from(&manifest_dir) - .join("wrapper.h") - .to_str() - .unwrap() - .to_string(), - ) - .clang_arg(format!("-I{}", include_llama.display())) - .clang_arg(format!("-I{}", include_ggml.display())) - .clang_arg(format!("-I{}", src_dir.display())) - .allowlist_function("llama_.*") - .allowlist_function("ggml_.*") - .allowlist_function("gguf_.*") - .allowlist_function("hypura_.*") - .allowlist_type("llama_.*") - .allowlist_type("ggml_.*") - .allowlist_type("gguf_.*") - .allowlist_type("hypura_.*") - .allowlist_var("LLAMA_.*") - .allowlist_var("GGML_.*") - .allowlist_var("GGUF_.*") - .derive_debug(true) - .derive_default(true) - .generate() - .expect("Failed to generate bindings"); - + // ── Generate Rust bindings via bindgen ─────────────────────────────────── let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bindings.rs")) - .expect("Failed to write bindings"); - // Rebuild if llama.cpp sources change + // Pre-generated bindings fallback: avoids needing libclang on every machine. + // Priority: HYPURA_PREGENERATED_BINDINGS env var > hypura-sys/bindings.rs in source tree. 
+ let pregenerated = env::var("HYPURA_PREGENERATED_BINDINGS") + .map(PathBuf::from) + .ok() + .or_else(|| { + let p = PathBuf::from(&manifest_dir).join("bindings.rs"); + if p.exists() { Some(p) } else { None } + }); + + if let Some(src) = pregenerated { + std::fs::copy(&src, out_path.join("bindings.rs")) + .expect("Failed to copy pre-generated bindings"); + println!("cargo:warning=Using pre-generated bindings from {}", src.display()); + } else { + let include_llama = llama_dir.join("include"); + let include_ggml = llama_dir.join("ggml/include"); + + let bindings = bindgen::Builder::default() + .header( + PathBuf::from(&manifest_dir) + .join("wrapper.h") + .to_str() + .unwrap() + .to_string(), + ) + .clang_arg(format!("-I{}", include_llama.display())) + .clang_arg(format!("-I{}", include_ggml.display())) + .clang_arg(format!("-I{}", src_dir.display())) + .allowlist_function("llama_.*") + .allowlist_function("ggml_.*") + .allowlist_function("gguf_.*") + .allowlist_function("hypura_.*") + .allowlist_type("llama_.*") + .allowlist_type("ggml_.*") + .allowlist_type("gguf_.*") + .allowlist_type("hypura_.*") + .allowlist_var("LLAMA_.*") + .allowlist_var("GGML_.*") + .allowlist_var("GGUF_.*") + // MSVC/C bind differences can make bindgen layout asserts flaky on Windows. 
+ .layout_tests(false) + .derive_debug(true) + .derive_default(true) + .generate() + .expect("Failed to generate bindings — install LLVM and set LIBCLANG_PATH, \ + or provide HYPURA_PREGENERATED_BINDINGS=/path/to/bindings.rs"); + + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Failed to write bindings"); + } + println!("cargo:rerun-if-changed=wrapper.h"); println!( "cargo:rerun-if-changed={}", @@ -105,3 +295,113 @@ fn main() { llama_dir.join("ggml").display() ); } + +fn debug_log(hypothesis_id: &str, message: &str, data: serde_json::Value) { + let log_path = env::var("CARGO_MANIFEST_DIR") + .ok() + .and_then(|p| PathBuf::from(p).parent().map(|pp| pp.join("debug-4ee339.log"))) + .unwrap_or_else(|| PathBuf::from("debug-4ee339.log")); + let payload = serde_json::json!({ + "sessionId": "4ee339", + "runId": env::var("HYPURA_DEBUG_RUN_ID").unwrap_or_else(|_| "pre-fix".to_string()), + "hypothesisId": hypothesis_id, + "location": "hypura-sys/build.rs", + "message": message, + "data": data, + "timestamp": std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0), + }); + if let Ok(mut f) = OpenOptions::new() + .create(true) + .append(true) + .open(log_path) + { + let _ = writeln!(f, "{}", payload); + } +} + +// ── CUDA detection helpers ──────────────────────────────────────────────────── + +fn cuda_is_available() -> bool { + // Explicit opt-out + if env::var("HYPURA_NO_CUDA").is_ok() { + return false; + } + // Explicit opt-in (useful in CI or when auto-detection fails) + if env::var("HYPURA_CUDA").is_ok() { + return true; + } + get_cuda_root().is_some() +} + +/// Return the CUDA toolkit root, trying common locations. 
+fn get_cuda_root() -> Option<PathBuf> {
+    // Set by the Windows CUDA installer or by the user
+    if let Ok(p) = env::var("CUDA_PATH") {
+        let path = PathBuf::from(p);
+        if path.exists() {
+            return Some(path);
+        }
+    }
+
+    // Linux / WSL2 default
+    for candidate in &["/usr/local/cuda", "/usr/cuda"] {
+        let p = PathBuf::from(candidate);
+        if p.exists() {
+            return Some(p);
+        }
+    }
+
+    // If nvcc is on PATH, try to derive the root from it
+    if let Some(nvcc) = find_nvcc() {
+        if let Some(bin) = nvcc.parent() {
+            if let Some(root) = bin.parent() {
+                return Some(root.to_path_buf());
+            }
+        }
+    }
+
+    None
+}
+
+fn get_cuda_lib_path() -> Option<PathBuf> {
+    let root = get_cuda_root()?;
+    for sub in &["lib64", "lib/x64", "lib"] {
+        let p = root.join(sub);
+        if p.exists() {
+            return Some(p);
+        }
+    }
+    None
+}
+
+fn find_nvcc() -> Option<PathBuf> {
+    // Check well-known paths first to avoid PATH-injection
+    let candidates = [
+        "/usr/local/cuda/bin/nvcc",
+        "/usr/cuda/bin/nvcc",
+        // Windows: CUDA_PATH is checked above; if we reach here, fall back to PATH
+    ];
+    for c in &candidates {
+        let p = PathBuf::from(c);
+        if p.exists() {
+            return Some(p);
+        }
+    }
+
+    // Last-resort: check that `nvcc` is runnable
+    let ok = std::process::Command::new("nvcc")
+        .arg("--version")
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::null())
+        .status()
+        .map(|s| s.success())
+        .unwrap_or(false);
+    if ok {
+        return Some(PathBuf::from("nvcc")); // rely on PATH
+    }
+
+    None
+}
diff --git a/hypura-sys/src/hypura_buft.c b/hypura-sys/src/hypura_buft.c
index 04f11bd..d92aa2c 100644
--- a/hypura-sys/src/hypura_buft.c
+++ b/hypura-sys/src/hypura_buft.c
@@ -2,9 +2,36 @@
 #include "ggml-backend-impl.h"
 #include <stdlib.h>
 #include <string.h>
-#include <sys/mman.h>
-/* ---------- Context structs ---------- */
+/* ── Platform-specific anonymous memory ──────────────────────────────────── */
+
+#ifdef _WIN32
+# include <windows.h>
+
+static void *platform_alloc_pages(size_t size) {
+    return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+} + +static void platform_free_pages(void *addr, size_t size) { + (void)size; + if (addr) VirtualFree(addr, 0, MEM_RELEASE); +} + +#else +# include + +static void *platform_alloc_pages(size_t size) { + void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + return (p == MAP_FAILED) ? NULL : p; +} + +static void platform_free_pages(void *addr, size_t size) { + if (addr) munmap(addr, size); +} +#endif + +/* ── Context structs ─────────────────────────────────────────────────────── */ typedef struct { hypura_on_tensor_loaded_t on_tensor_loaded; @@ -14,7 +41,7 @@ typedef struct { } hypura_buft_context; typedef struct { - void *base; /* original mmap'd buffer (loading phase) */ + void *base; /* anonymous page buffer (loading phase) */ size_t size; hypura_buft_context *buft_ctx; void *pool_base; /* pool buffer (inference phase, expert-streaming) */ @@ -22,16 +49,16 @@ typedef struct { int pool_active; /* 0 = loading phase, 1 = pool phase */ } hypura_buffer_context; -/* ---------- Buffer vtable ---------- */ +/* ── Buffer vtable ───────────────────────────────────────────────────────── */ static void hypura_buf_free(ggml_backend_buffer_t buffer) { hypura_buffer_context *ctx = (hypura_buffer_context *)buffer->context; if (ctx) { if (ctx->base && ctx->size > 0) { - munmap(ctx->base, ctx->size); + platform_free_pages(ctx->base, ctx->size); } if (ctx->pool_base && ctx->pool_size > 0) { - munmap(ctx->pool_base, ctx->pool_size); + platform_free_pages(ctx->pool_base, ctx->pool_size); } free(ctx); } @@ -95,7 +122,7 @@ static void hypura_buf_clear(ggml_backend_buffer_t buffer, uint8_t value) { (void)buffer; (void)value; } -/* ---------- Buffer type vtable ---------- */ +/* ── Buffer type vtable ──────────────────────────────────────────────────── */ static const char *hypura_buft_get_name(ggml_backend_buffer_type_t buft) { (void)buft; @@ -104,22 +131,19 @@ static const char *hypura_buft_get_name(ggml_backend_buffer_type_t buft) { static 
ggml_backend_buffer_t hypura_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { /* Page-align the allocation for direct I/O compatibility. - * Use mmap instead of posix_memalign: on macOS unified memory, mmap pages - * are lazily committed on first access. This avoids Metal OOM from a large - * virtual reservation — uncommitted mmap pages don't count against Metal's - * working set the same way heap allocations do. Pages loaded via pread get - * committed; released pages (MADV_FREE) return to uncommitted state. */ + * Use platform_alloc_pages (mmap/VirtualAlloc): pages are lazily committed + * on first access, so a large virtual reservation doesn't immediately + * consume physical memory or GPU working set. */ size_t aligned_size = (size + 4095) & ~(size_t)4095; - void *base = mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0); - if (base == MAP_FAILED) { + void *base = platform_alloc_pages(aligned_size); + if (!base) { return NULL; } hypura_buffer_context *buf_ctx = (hypura_buffer_context *)calloc(1, sizeof(hypura_buffer_context)); if (!buf_ctx) { - free(base); + platform_free_pages(base, aligned_size); return NULL; } buf_ctx->base = base; @@ -127,15 +151,15 @@ static ggml_backend_buffer_t hypura_buft_alloc_buffer(ggml_backend_buffer_type_t buf_ctx->buft_ctx = (hypura_buft_context *)buft->context; struct ggml_backend_buffer_i iface = { - .free_buffer = hypura_buf_free, - .get_base = hypura_buf_get_base, - .init_tensor = hypura_buf_init_tensor, + .free_buffer = hypura_buf_free, + .get_base = hypura_buf_get_base, + .init_tensor = hypura_buf_init_tensor, .memset_tensor = hypura_buf_memset_tensor, - .set_tensor = hypura_buf_set_tensor, - .get_tensor = hypura_buf_get_tensor, - .cpy_tensor = hypura_buf_cpy_tensor, - .clear = hypura_buf_clear, - .reset = NULL, + .set_tensor = hypura_buf_set_tensor, + .get_tensor = hypura_buf_get_tensor, + .cpy_tensor = hypura_buf_cpy_tensor, + .clear = hypura_buf_clear, + .reset = NULL, }; 
ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, iface, buf_ctx, aligned_size); @@ -162,7 +186,7 @@ static bool hypura_buft_is_host(ggml_backend_buffer_type_t buft) { return true; /* critical: CPU backend requires is_host=true */ } -/* ---------- Public API ---------- */ +/* ── Public API ──────────────────────────────────────────────────────────── */ ggml_backend_buffer_type_t hypura_buft_create( hypura_on_tensor_loaded_t on_tensor_loaded, @@ -207,7 +231,7 @@ void *hypura_buffer_get_base_ptr(ggml_backend_buffer_t buffer) { return ctx ? ctx->base : NULL; } -/* --- Pool buffer API --- */ +/* ── Pool buffer API ─────────────────────────────────────────────────────── */ int hypura_buffer_init_pool(ggml_backend_buffer_t buffer, size_t pool_size) { if (!buffer) return -1; @@ -215,9 +239,8 @@ int hypura_buffer_init_pool(ggml_backend_buffer_t buffer, size_t pool_size) { if (!ctx) return -1; size_t aligned = (pool_size + 4095) & ~(size_t)4095; - void *pool = mmap(NULL, aligned, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0); - if (pool == MAP_FAILED) return -1; + void *pool = platform_alloc_pages(aligned); + if (!pool) return -1; ctx->pool_base = pool; ctx->pool_size = aligned; @@ -230,7 +253,7 @@ void hypura_buffer_release_loading_buffer(ggml_backend_buffer_t buffer) { if (!ctx) return; if (ctx->base && ctx->size > 0) { - munmap(ctx->base, ctx->size); + platform_free_pages(ctx->base, ctx->size); ctx->base = NULL; ctx->size = 0; } diff --git a/memory/MEMORY.md b/memory/MEMORY.md new file mode 100644 index 0000000..ed27883 --- /dev/null +++ b/memory/MEMORY.md @@ -0,0 +1,9 @@ +# Hypura Project Memory + +This directory contains persistent memory files tracking significant work done on the Hypura project. 
+ +## Index + +| File | Name | Description | +|------|------|-------------| +| [project_windows_port.md](./project_windows_port.md) | Windows/WSL2 + CUDA port | Cross-platform porting work — macOS/Metal → Windows/WSL2/CUDA | diff --git a/memory/project_windows_port.md b/memory/project_windows_port.md new file mode 100644 index 0000000..006a0ca --- /dev/null +++ b/memory/project_windows_port.md @@ -0,0 +1,41 @@ +--- +name: Windows/WSL2 + CUDA port +description: Cross-platform porting work — macOS/Metal → Windows/WSL2/CUDA +type: project +--- + +Completed cross-platform port of Hypura from macOS/Apple Silicon to Windows/WSL2 with CUDA (RTX 3060+) support. + +**Why:** User requested Windows/WSL2 support with RTX 3060 as the base GPU target. + +**How to apply:** When touching platform-specific code, follow the established patterns below. + +## Architecture: `src/io/compat.rs` +New platform abstraction module providing: +- `NativeFd` type alias (i32 on Unix, isize/HANDLE on Windows) +- `open_direct_fd(path)` — cache-bypass file open +- `close_fd(fd)` — close handle +- `read_at_fd(fd, dst, size, offset)` — positional read (pread / ReadFile) +- `alloc_pages(size)` / `free_pages(ptr, size)` — anonymous memory (mmap / VirtualAlloc) +- `advise_free_pages(ptr, size)` — MADV_FREE / MEM_DECOMMIT + +## Key changes made +- `hypura-sys/build.rs`: CUDA detection (CUDA_PATH, /usr/local/cuda, nvcc), sm_75;86;89;90 architectures +- `hypura-sys/src/hypura_buft.c`: `#ifdef _WIN32` VirtualAlloc/VirtualFree replacing mmap/munmap +- `src/profiler/cpu.rs`: sysctl on macOS only; sysinfo + /proc/cpuinfo on Linux; AVX2/AVX512 via is_x86_feature_detected! 
+- `src/profiler/gpu.rs`: Metal on macOS; CUDA backend + NVIDIA GPU spec DB (RTX 20/30/40/50, A/H series) +- `src/profiler/storage.rs`: mount point detection per-platform; F_NOCACHE on macOS, posix_fadvise on Linux, std::io on Windows +- `src/profiler/mod.rs`: APPDATA on Windows, ~/.hypura elsewhere; cross-platform os_version/machine_model +- `src/compute/inference.rs`: total_physical_memory() uses sysinfo on non-macOS +- `src/compute/nvme_backend.rs`: all libc I/O replaced with compat module +- `src/io/aligned_buffer.rs`: rewritten with std::alloc::Layout (works everywhere) +- `src/io/async_reader.rs`: rewritten with compat module +- `src/cli/iobench.rs`: rewritten with compat module +- `src/scheduler/placement.rs`: OS_OVERHEAD: macOS=2GB, Windows=4GB, Linux=1GB; GPU_RUNTIME_OVERHEAD: macOS=1GB, others=512MB +- `Cargo.toml`: windows-sys 0.59 added as conditional Windows dependency + +## CUDA architectures targeted +sm_75 (RTX 20xx), sm_86 (RTX 3060 base target), sm_89 (RTX 40xx), sm_90 (H100) +Override via env: HYPURA_CUDA_ARCHITECTURES="75;86;89;90" +Disable CUDA: HYPURA_NO_CUDA=1 +Force CUDA: HYPURA_CUDA=1 diff --git a/src/cli/iobench.rs b/src/cli/iobench.rs index 06a4874..71dcf36 100644 --- a/src/cli/iobench.rs +++ b/src/cli/iobench.rs @@ -1,12 +1,12 @@ -use std::ffi::c_void; -use std::os::unix::io::AsRawFd; use std::path::Path; use std::sync::{Arc, Barrier}; use std::time::Instant; +use hypura::io::aligned_buffer::AlignedBuffer; +use hypura::io::compat::{self, NativeFd}; use hypura::model::gguf::GgufFile; -const BLOCK_SIZE: usize = 4 * 1024 * 1024; // 4 MiB pread chunks (matches typical tensor size) +const BLOCK_SIZE: usize = 4 * 1024 * 1024; // 4 MiB read chunks (matches typical tensor size) const PAGE_SIZE: usize = 4096; pub fn run(model_path: &str, read_gb: f64) -> anyhow::Result<()> { @@ -32,13 +32,13 @@ pub fn run(model_path: &str, read_gb: f64) -> anyhow::Result<()> { ); println!(); - // Run F_NOCACHE variants first to avoid page cache contamination from 
variant A. + // Run cache-bypass variants first to avoid page cache contamination from variant A. let bw_b = test_nocache_sequential(path, data_start, test_bytes)?; - let bw_c = test_nocache_madvfree_cycle(path, data_start, test_bytes)?; + let bw_c = test_nocache_advfree_cycle(path, data_start, test_bytes)?; let bw_d2 = test_mt_nocache(path, data_start, test_bytes, 2)?; let bw_d4 = test_mt_nocache(path, data_start, test_bytes, 4)?; - let bw_e2 = test_mt_nocache_madvfree(path, data_start, test_bytes, 2)?; - let bw_e4 = test_mt_nocache_madvfree(path, data_start, test_bytes, 4)?; + let bw_e2 = test_mt_nocache_advfree(path, data_start, test_bytes, 2)?; + let bw_e4 = test_mt_nocache_advfree(path, data_start, test_bytes, 4)?; let bw_f = test_scattered_reads(path, &gguf, test_bytes)?; // Variant A last (populates page cache) let bw_a = test_raw_sequential(path, data_start, test_bytes)?; @@ -48,7 +48,7 @@ pub fn run(model_path: &str, read_gb: f64) -> anyhow::Result<()> { let fmt = |label: &str, bw: f64| { let pct = (bw / bw_a - 1.0) * 100.0; let sign = if pct >= 0.0 { "+" } else { "" }; - if (pct.abs()) < 0.5 { + if pct.abs() < 0.5 { println!(" {label:<42} {:.2} GB/s", bw / 1e9); } else { println!( @@ -59,35 +59,35 @@ pub fn run(model_path: &str, read_gb: f64) -> anyhow::Result<()> { } }; - fmt("A. Raw sequential pread (baseline)", bw_a); - fmt("B. pread + F_NOCACHE", bw_b); + fmt("A. Raw sequential read (baseline)", bw_a); + fmt("B. Cache-bypass sequential read", bw_b); println!(); - println!(" C. F_NOCACHE + MADV_FREE cycle:"); + println!(" C. Cache-bypass + advise-free cycle:"); for (i, &bw) in bw_c.iter().enumerate() { fmt(&format!(" Pass {} (re-read after release)", i + 1), bw); } println!(); - fmt("D. Multi-threaded F_NOCACHE (2 threads)", bw_d2); - fmt(" Multi-threaded F_NOCACHE (4 threads)", bw_d4); + fmt("D. Multi-threaded cache-bypass (2 threads)", bw_d2); + fmt(" Multi-threaded cache-bypass (4 threads)", bw_d4); println!(); - fmt("E. 
MT + MADV_FREE (2 threads)", bw_e2); - fmt(" MT + MADV_FREE (4 threads)", bw_e4); + fmt("E. MT + advise-free (2 threads)", bw_e2); + fmt(" MT + advise-free (4 threads)", bw_e4); println!(); fmt("F. Scattered per-tensor reads", bw_f); // Diagnosis println!(); let nocache_impact = (1.0 - bw_b / bw_a) * 100.0; - let madvfree_impact = (1.0 - bw_c[0] / bw_b) * 100.0; + let advfree_impact = (1.0 - bw_c[0] / bw_b) * 100.0; let scatter_impact = (1.0 - bw_f / bw_b) * 100.0; let mt_gain = (bw_d4 / bw_b - 1.0) * 100.0; println!(" Diagnosis:"); - if madvfree_impact > 30.0 { - println!(" >> MADV_FREE re-fault is a major bottleneck ({madvfree_impact:.0}% throughput loss)"); + if advfree_impact > 30.0 { + println!(" >> Page-release re-fault is a major bottleneck ({advfree_impact:.0}% throughput loss)"); } if nocache_impact > 20.0 { - println!(" >> F_NOCACHE is a significant bottleneck ({nocache_impact:.0}% throughput loss)"); + println!(" >> Cache-bypass is a significant bottleneck ({nocache_impact:.0}% throughput loss)"); } if scatter_impact > 30.0 { println!(" >> Per-tensor scattered reads cost {scatter_impact:.0}% throughput vs sequential"); @@ -102,163 +102,122 @@ pub fn run(model_path: &str, read_gb: f64) -> anyhow::Result<()> { Ok(()) } -/// Allocate a page-aligned buffer via posix_memalign. -fn alloc_aligned(size: usize) -> *mut u8 { - let mut ptr: *mut c_void = std::ptr::null_mut(); - unsafe { - libc::posix_memalign(&mut ptr, PAGE_SIZE, size); - } - ptr as *mut u8 -} +// ── Low-level helpers ───────────────────────────────────────────────────────── /// Read `size` bytes from `fd` at `file_offset` into `dst`. Handles partial reads. 
-fn pread_full(fd: i32, dst: *mut u8, size: usize, file_offset: u64) { - let mut read = 0usize; - while read < size { - let n = unsafe { - libc::pread( - fd, - dst.add(read) as *mut c_void, - size - read, - (file_offset + read as u64) as libc::off_t, - ) - }; - if n <= 0 { - break; - } - read += n as usize; +fn read_full(fd: NativeFd, dst: *mut u8, size: usize, file_offset: u64) { + let mut done = 0usize; + while done < size { + let n = compat::read_at_fd(fd, unsafe { dst.add(done) }, size - done, file_offset + done as u64); + if n <= 0 { break; } + done += n as usize; } } -/// Sequential pread through a region in BLOCK_SIZE chunks. Returns bytes read. -fn pread_sequential(fd: i32, buf: *mut u8, file_start: u64, total: usize) -> usize { - let mut offset = 0; - while offset < total { - let chunk = BLOCK_SIZE.min(total - offset); - pread_full(fd, unsafe { buf.add(offset) }, chunk, file_start + offset as u64); - offset += chunk; - } - offset -} +// ── Variant A: raw sequential read (baseline) ───────────────────────────────── -/// Variant A: raw sequential pread, no F_NOCACHE, no MADV_FREE. 
fn test_raw_sequential(path: &Path, data_start: u64, test_bytes: usize) -> anyhow::Result<f64> {
-    let file = std::fs::File::open(path)?;
-    let fd = file.as_raw_fd();
-    let buf = alloc_aligned(BLOCK_SIZE);
+    use std::io::{Read, Seek, SeekFrom};
+    let mut file = std::fs::File::open(path)?;
+    file.seek(SeekFrom::Start(data_start))?;
+    let mut buf = vec![0u8; BLOCK_SIZE];

     // Warmup
-    pread_sequential(fd, buf, data_start, BLOCK_SIZE.min(test_bytes));
+    let _ = file.read(&mut buf[..BLOCK_SIZE.min(test_bytes)]);
+    file.seek(SeekFrom::Start(data_start))?;

     let start = Instant::now();
     let mut total = 0usize;
-    let mut off = 0usize;
-    while off < test_bytes {
-        let chunk = BLOCK_SIZE.min(test_bytes - off);
-        pread_full(fd, buf, chunk, data_start + off as u64);
-        total += chunk;
-        off += chunk;
+    while total < test_bytes {
+        let chunk = BLOCK_SIZE.min(test_bytes - total);
+        let n = file.read(&mut buf[..chunk])?;
+        if n == 0 { break; }
+        total += n;
     }
     let elapsed = start.elapsed().as_secs_f64();
-
-    unsafe { libc::free(buf as *mut c_void) };
     Ok(total as f64 / elapsed)
 }

-/// Variant B: pread + F_NOCACHE.
-fn test_nocache_sequential(
-    path: &Path,
-    data_start: u64,
-    test_bytes: usize,
-) -> anyhow::Result<f64> {
-    let file = std::fs::File::open(path)?;
-    let fd = file.as_raw_fd();
-    unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) };
-    let buf = alloc_aligned(BLOCK_SIZE);
+// ── Variant B: cache-bypass sequential read ───────────────────────────────────
+
+fn test_nocache_sequential(path: &Path, data_start: u64, test_bytes: usize) -> anyhow::Result<f64> {
+    let fd = compat::open_direct_fd(path)?;
+    let mut buf = AlignedBuffer::new(BLOCK_SIZE, PAGE_SIZE)?;

     let start = Instant::now();
     let mut total = 0usize;
     let mut off = 0usize;
     while off < test_bytes {
         let chunk = BLOCK_SIZE.min(test_bytes - off);
-        pread_full(fd, buf, chunk, data_start + off as u64);
+        read_full(fd, buf.as_mut_ptr(), chunk, data_start + off as u64);
         total += chunk;
         off += chunk;
     }
     let elapsed = start.elapsed().as_secs_f64();
-
-    unsafe { libc::free(buf as *mut c_void) };
+    compat::close_fd(fd);
     Ok(total as f64 / elapsed)
 }

-/// Variant C: F_NOCACHE + MADV_FREE cycle (3 passes).
-/// Returns throughput for each re-read pass.
-fn test_nocache_madvfree_cycle(
+// ── Variant C: cache-bypass + advise-free cycle ───────────────────────────────
+
+fn test_nocache_advfree_cycle(
     path: &Path,
     data_start: u64,
     test_bytes: usize,
 ) -> anyhow::Result<Vec<f64>> {
-    let file = std::fs::File::open(path)?;
-    let fd = file.as_raw_fd();
-    unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) };
+    let fd = compat::open_direct_fd(path)?;

-    // Full-size buffer (like Hypura's NVMe buffer)
-    let buf = alloc_aligned(test_bytes);
+    // Allocate a full-size buffer (like Hypura's NVMe buffer)
+    let buf_ptr = compat::alloc_pages(test_bytes);
+    anyhow::ensure!(!buf_ptr.is_null(), "alloc_pages failed for {test_bytes} bytes");

     // Prime: initial read to commit pages
-    pread_sequential(fd, buf, data_start, test_bytes);
+    let mut off = 0usize;
+    while off < test_bytes {
+        let chunk = BLOCK_SIZE.min(test_bytes - off);
+        read_full(fd, unsafe { buf_ptr.add(off) }, chunk, data_start + off as u64);
+        off += chunk;
+    }

     let mut results = Vec::new();
-
     for _ in 0..3 {
-        // Release pages (matches release_layer in nvme_backend.rs)
-        unsafe {
-            libc::madvise(buf as *mut c_void, test_bytes, libc::MADV_FREE);
-        }
-
-        // Force page reclaim: allocate + touch a pressure buffer
-        let pressure_size = 8usize << 30; // 8 GB
-        let pressure = alloc_aligned(pressure_size);
-        if !pressure.is_null() {
-            // Touch every page to force OS to reclaim MADV_FREE pages
-            for i in (0..pressure_size).step_by(PAGE_SIZE) {
-                unsafe { *pressure.add(i) = 1 };
-            }
-            unsafe { libc::free(pressure as *mut c_void) };
-        }
+        // Release pages back to OS (matches release_layer in nvme_backend.rs)
+        compat::advise_free_pages(buf_ptr, test_bytes);

         // Re-read (timed)
         let start = Instant::now();
-        pread_sequential(fd, buf, data_start, test_bytes);
+        let mut off2 = 0usize;
+        while off2 < test_bytes {
+            let chunk = BLOCK_SIZE.min(test_bytes - off2);
+            read_full(fd, unsafe { buf_ptr.add(off2) }, chunk, data_start + off2 as u64);
+            off2 += chunk;
+        }
         let elapsed =
start.elapsed().as_secs_f64(); - results.push(test_bytes as f64 / elapsed); } - unsafe { libc::free(buf as *mut c_void) }; + compat::free_pages(buf_ptr, test_bytes); + compat::close_fd(fd); Ok(results) } -/// Variant D: multi-threaded pread with F_NOCACHE. +// ── Variant D: multi-threaded cache-bypass ──────────────────────────────────── + fn test_mt_nocache( path: &Path, data_start: u64, test_bytes: usize, num_threads: usize, ) -> anyhow::Result { - let buf = alloc_aligned(test_bytes); - let buf_addr = buf as usize; // safe to send across threads + let buf_ptr = compat::alloc_pages(test_bytes); + anyhow::ensure!(!buf_ptr.is_null(), "alloc_pages failed"); + let buf_addr = buf_ptr as usize; let barrier = Arc::new(Barrier::new(num_threads + 1)); - let chunk_per_thread = (test_bytes + num_threads - 1) / num_threads; let handles: Vec<_> = (0..num_threads) .map(|i| { - let file = std::fs::File::open(path).unwrap(); - let fd = file.as_raw_fd(); - unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) }; - std::mem::forget(file); - + let fd = compat::open_direct_fd(path).expect("open_direct_fd failed"); let barrier = barrier.clone(); let start_off = i * chunk_per_thread; let end_off = (start_off + chunk_per_thread).min(test_bytes); @@ -267,67 +226,60 @@ fn test_mt_nocache( std::thread::spawn(move || { barrier.wait(); let my_buf = (buf_addr + start_off) as *mut u8; - pread_sequential(fd, my_buf, data_start + start_off as u64, thread_bytes); - unsafe { libc::close(fd) }; + let mut off = 0usize; + while off < thread_bytes { + let chunk = BLOCK_SIZE.min(thread_bytes - off); + read_full(fd, unsafe { my_buf.add(off) }, chunk, data_start + (start_off + off) as u64); + off += chunk; + } + compat::close_fd(fd); }) }) .collect(); barrier.wait(); let start = Instant::now(); - for h in handles { - h.join().unwrap(); - } + for h in handles { h.join().unwrap(); } let elapsed = start.elapsed().as_secs_f64(); - unsafe { libc::free(buf as *mut c_void) }; + compat::free_pages(buf_ptr, test_bytes); 
Ok(test_bytes as f64 / elapsed) } -/// Variant E: multi-threaded + MADV_FREE cycle. -fn test_mt_nocache_madvfree( +// ── Variant E: multi-threaded + advise-free ─────────────────────────────────── + +fn test_mt_nocache_advfree( path: &Path, data_start: u64, test_bytes: usize, num_threads: usize, ) -> anyhow::Result { - let buf = alloc_aligned(test_bytes); + let buf_ptr = compat::alloc_pages(test_bytes); + anyhow::ensure!(!buf_ptr.is_null(), "alloc_pages failed"); // Prime { - let file = std::fs::File::open(path)?; - let fd = file.as_raw_fd(); - unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) }; - pread_sequential(fd, buf, data_start, test_bytes); + let fd = compat::open_direct_fd(path)?; + let mut off = 0usize; + while off < test_bytes { + let chunk = BLOCK_SIZE.min(test_bytes - off); + read_full(fd, unsafe { buf_ptr.add(off) }, chunk, data_start + off as u64); + off += chunk; + } + compat::close_fd(fd); } // Release - unsafe { - libc::madvise(buf as *mut c_void, test_bytes, libc::MADV_FREE); - } - - // Pressure to force reclaim - let pressure_size = 8usize << 30; - let pressure = alloc_aligned(pressure_size); - if !pressure.is_null() { - for i in (0..pressure_size).step_by(PAGE_SIZE) { - unsafe { *pressure.add(i) = 1 }; - } - unsafe { libc::free(pressure as *mut c_void) }; - } + compat::advise_free_pages(buf_ptr, test_bytes); // Multi-threaded re-read - let buf_addr = buf as usize; + let buf_addr = buf_ptr as usize; let barrier = Arc::new(Barrier::new(num_threads + 1)); let chunk_per_thread = (test_bytes + num_threads - 1) / num_threads; let handles: Vec<_> = (0..num_threads) .map(|i| { - let file = std::fs::File::open(path).unwrap(); - let fd = file.as_raw_fd(); - unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) }; - std::mem::forget(file); - + let fd = compat::open_direct_fd(path).expect("open_direct_fd failed"); let barrier = barrier.clone(); let start_off = i * chunk_per_thread; let end_off = (start_off + chunk_per_thread).min(test_bytes); @@ -336,35 +288,36 @@ fn 
test_mt_nocache_madvfree( std::thread::spawn(move || { barrier.wait(); let my_buf = (buf_addr + start_off) as *mut u8; - pread_sequential(fd, my_buf, data_start + start_off as u64, thread_bytes); - unsafe { libc::close(fd) }; + let mut off = 0usize; + while off < thread_bytes { + let chunk = BLOCK_SIZE.min(thread_bytes - off); + read_full(fd, unsafe { my_buf.add(off) }, chunk, data_start + (start_off + off) as u64); + off += chunk; + } + compat::close_fd(fd); }) }) .collect(); barrier.wait(); let start = Instant::now(); - for h in handles { - h.join().unwrap(); - } + for h in handles { h.join().unwrap(); } let elapsed = start.elapsed().as_secs_f64(); - unsafe { libc::free(buf as *mut c_void) }; + compat::free_pages(buf_ptr, test_bytes); Ok(test_bytes as f64 / elapsed) } -/// Variant F: scattered per-tensor reads (simulates Hypura's actual pattern). +// ── Variant F: scattered per-tensor reads ──────────────────────────────────── + fn test_scattered_reads( path: &Path, gguf: &GgufFile, max_bytes: usize, ) -> anyhow::Result { - let file = std::fs::File::open(path)?; - let fd = file.as_raw_fd(); - unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) }; - let buf = alloc_aligned(BLOCK_SIZE); + let fd = compat::open_direct_fd(path)?; + let mut buf = AlignedBuffer::new(BLOCK_SIZE, PAGE_SIZE)?; - // Build list of (file_offset, size) from GGUF tensors let mut regions: Vec<(u64, usize)> = gguf .tensors .iter() @@ -376,16 +329,13 @@ fn test_scattered_reads( let mut total = 0usize; for &(file_off, size) in ®ions { - if total + size > max_bytes { - break; - } - // Read into the same buffer (we don't care about the data, just I/O throughput) + if total + size > max_bytes { break; } let read_size = size.min(BLOCK_SIZE); - pread_full(fd, buf, read_size, file_off); + read_full(fd, buf.as_mut_ptr(), read_size, file_off); total += read_size; } let elapsed = start.elapsed().as_secs_f64(); - unsafe { libc::free(buf as *mut c_void) }; + compat::close_fd(fd); Ok(total as f64 / elapsed) } diff 
--git a/src/compute/inference.rs b/src/compute/inference.rs
index 14b1071..441d89e 100644
--- a/src/compute/inference.rs
+++ b/src/compute/inference.rs
@@ -1115,31 +1115,56 @@ pub fn generate_with_nvme_scheduling(
     })
 }

-/// Query total physical RAM via sysctl (macOS).
+/// Query total physical RAM.
+///
+/// macOS: `hw.memsize` sysctl.
+/// Linux/Windows: `sysinfo` crate (no privileged API needed).
 fn total_physical_memory() -> u64 {
-    unsafe {
-        let mut size: u64 = 0;
-        let mut len = std::mem::size_of::<u64>();
-        let name = b"hw.memsize\0";
-        libc::sysctlbyname(
-            name.as_ptr() as *const i8,
-            &mut size as *mut u64 as *mut libc::c_void,
-            &mut len as *mut usize,
-            std::ptr::null_mut(),
-            0,
-        );
-        if size == 0 { 32 * (1 << 30) } else { size }
+    #[cfg(target_os = "macos")]
+    {
+        let total = unsafe {
+            let mut size: u64 = 0;
+            let mut len = std::mem::size_of::<u64>();
+            let name = b"hw.memsize\0";
+            libc::sysctlbyname(
+                name.as_ptr() as *const i8,
+                &mut size as *mut u64 as *mut libc::c_void,
+                &mut len as *mut usize,
+                std::ptr::null_mut(),
+                0,
+            );
+            size
+        };
+        if total == 0 { 32 * (1 << 30) } else { total }
+    }
+    #[cfg(not(target_os = "macos"))]
+    {
+        let mut sys = sysinfo::System::new();
+        sys.refresh_memory();
+        let total = sys.total_memory();
+        if total == 0 { 16 * (1 << 30) } else { total }
     }
 }

 fn num_performance_cores() -> i32 {
-    crate::profiler::cpu::sysctl_u32("hw.perflevel0.logicalcpu")
-        .map(|n| n as i32)
-        .unwrap_or_else(|_| {
-            std::thread::available_parallelism()
-                .map(|n| (n.get() / 2).max(1) as i32)
-                .unwrap_or(4)
-        })
+    // macOS: use hw.perflevel0 (P-cores only)
+    #[cfg(target_os = "macos")]
+    {
+        crate::profiler::cpu::sysctl_u32("hw.perflevel0.logicalcpu")
+            .map(|n| n as i32)
+            .unwrap_or_else(|_| {
+                std::thread::available_parallelism()
+                    .map(|n| (n.get() / 2).max(1) as i32)
+                    .unwrap_or(4)
+            })
+    }
+    // Non-macOS: use half of logical CPUs as a conservative estimate for I/O threads
+    #[cfg(not(target_os = "macos"))]
+    {
std::thread::available_parallelism() + .map(|n| (n.get() / 2).max(1) as i32) + .unwrap_or(4) + } } #[cfg(test)] diff --git a/src/compute/nvme_backend.rs b/src/compute/nvme_backend.rs index f512ad5..7fcdb1b 100644 --- a/src/compute/nvme_backend.rs +++ b/src/compute/nvme_backend.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::ffi::{c_void, CStr, CString}; -use std::os::unix::io::AsRawFd; +use crate::io::compat::{self, NativeFd}; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Condvar, Mutex}; @@ -74,8 +74,8 @@ pub struct IoPool { tx: Option>, /// Worker thread handles. handles: Vec>, - /// Per-worker file descriptors (for cleanup). - worker_fds: Vec, + /// Per-worker file descriptors / handles (for cleanup). + worker_fds: Vec, /// Throughput tracking: total bytes loaded by all workers. pub bytes_loaded: Arc, /// Throughput tracking: total load time in nanoseconds. @@ -98,12 +98,7 @@ impl IoPool { let mut handles = Vec::with_capacity(num_workers); for i in 0..num_workers { - let file = std::fs::File::open(model_path)?; - let fd = file.as_raw_fd(); - unsafe { - libc::fcntl(fd, libc::F_NOCACHE, 1); - } - std::mem::forget(file); + let fd = compat::open_direct_fd(model_path)?; worker_fds.push(fd); let rx = rx.clone(); @@ -155,18 +150,16 @@ impl Drop for IoPool { let _ = handle.join(); } - // Close per-worker fds + // Close per-worker fds / handles for fd in &self.worker_fds { - unsafe { - libc::close(*fd); - } + compat::close_fd(*fd); } } } -/// I/O worker thread: pulls tasks from shared channel, executes pread. +/// I/O worker thread: pulls tasks from shared channel, executes pread / ReadFile. fn io_worker( - fd: i32, + fd: NativeFd, rx: Arc>>, state: Arc, bytes_loaded: Arc, @@ -213,18 +206,13 @@ fn io_worker( } } -/// Perform pread I/O for a single region. Standalone function used by IoPool workers. 
-fn pread_region(fd: i32, base: *mut u8, offset: usize, size: usize, file_offset: u64) { +/// Perform positional I/O for a single region (pread on Unix, ReadFile on Windows). +fn pread_region(fd: NativeFd, base: *mut u8, offset: usize, size: usize, file_offset: u64) { let dst = unsafe { base.add(offset) }; let mut read = 0usize; while read < size { - let n = unsafe { - libc::pread( - fd, - dst.add(read) as *mut c_void, - size - read, - (file_offset + read as u64) as libc::off_t, - ) + let n = { + compat::read_at_fd(fd, unsafe { dst.add(read) }, size - read, file_offset + read as u64) }; if n <= 0 { break; @@ -611,9 +599,7 @@ impl PrefetchState { for &(offset, size, _) in regions { let ptr = unsafe { base.add(offset) }; - unsafe { - libc::madvise(ptr as *mut c_void, size, libc::MADV_FREE); - } + compat::advise_free_pages(ptr, size); } // Invalidate neuron cache entries for this layer @@ -1006,13 +992,7 @@ impl PrefetchState { { let offset = ev_layout.expert_buffer_offset(ev_e); let ptr = unsafe { base.add(offset) }; - unsafe { - libc::madvise( - ptr as *mut c_void, - ev_layout.expert_stride, - libc::MADV_FREE, - ); - } + compat::advise_free_pages(ptr, ev_layout.expert_stride); } } } @@ -1255,25 +1235,15 @@ impl PrefetchState { return None; } - // Allocate resident buffer - let base = unsafe { - libc::mmap( - std::ptr::null_mut(), - total_size, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_ANON | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if base == libc::MAP_FAILED { + // Allocate resident buffer (anonymous pages, lazily committed) + let base = compat::alloc_pages(total_size); + if base.is_null() { tracing::warn!( "Failed to allocate resident FFN buffer ({:.1} GB)", total_size as f64 / 1e9, ); return None; } - let base = base as *mut u8; tracing::info!( "Resident FFN buffer: {:.1} GB for {} layers (layers 0-{})", @@ -1644,12 +1614,7 @@ impl Drop for PrefetchState { } // Release resident FFN buffer if !self.resident_ffn_base.is_null() && self.resident_ffn_size > 0 
{ - unsafe { - libc::munmap( - self.resident_ffn_base as *mut libc::c_void, - self.resident_ffn_size, - ); - } + compat::free_pages(self.resident_ffn_base, self.resident_ffn_size); } } } @@ -1717,14 +1682,11 @@ impl ExpertPool { pub fn release_layer(&mut self, layer_idx: u32) { if let Some(slots) = self.layer_slots.remove(&layer_idx) { for slot in slots { - // MADV_FREE the slot's pages - unsafe { - libc::madvise( - self.pool_base.add(slot * self.slot_size) as *mut c_void, - self.slot_size, - libc::MADV_FREE, - ); - } + // Release the slot's physical pages back to the OS + compat::advise_free_pages( + unsafe { self.pool_base.add(slot * self.slot_size) }, + self.slot_size, + ); self.free_slots.push(slot); } } @@ -1820,22 +1782,13 @@ impl HypuraBuftController { } let aligned = (max_ffn_size + 4095) & !4095; - let ptr = unsafe { - libc::mmap( - std::ptr::null_mut(), - aligned, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_ANON | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ptr == libc::MAP_FAILED { + let ptr = compat::alloc_pages(aligned); + if ptr.is_null() { tracing::warn!("Failed to allocate FFN loading scratch ({aligned} bytes)"); return; } - *self.loading_scratch.lock().unwrap() = ptr as *mut u8; + *self.loading_scratch.lock().unwrap() = ptr; self.loading_scratch_size .store(aligned, Ordering::Relaxed); tracing::info!( @@ -1858,7 +1811,7 @@ impl HypuraBuftController { let mut scratch = self.loading_scratch.lock().unwrap(); let size = self.loading_scratch_size.swap(0, Ordering::Relaxed); if !scratch.is_null() && size > 0 { - unsafe { libc::munmap(*scratch as *mut libc::c_void, size) }; + compat::free_pages(*scratch, size); *scratch = std::ptr::null_mut(); } } diff --git a/src/io/aligned_buffer.rs b/src/io/aligned_buffer.rs index 4ede922..6081d52 100644 --- a/src/io/aligned_buffer.rs +++ b/src/io/aligned_buffer.rs @@ -1,33 +1,36 @@ +use std::alloc::{Layout, alloc, dealloc}; use std::ops::{Deref, DerefMut}; -/// Page-aligned buffer for direct I/O (F_NOCACHE). 
-/// Allocated via `posix_memalign`, freed via `libc::free`. +/// Page-aligned buffer for direct I/O. +/// +/// Uses Rust's global allocator (`std::alloc`) with an explicit alignment, +/// which works on macOS, Linux, and Windows without POSIX-specific APIs. pub struct AlignedBuffer { ptr: *mut u8, len: usize, + layout: Layout, } unsafe impl Send for AlignedBuffer {} unsafe impl Sync for AlignedBuffer {} impl AlignedBuffer { - /// Allocate `len` bytes aligned to `alignment` (must be a power of 2, typically 4096). + /// Allocate `len` bytes aligned to `alignment` (must be a power of 2, ≥ 1). pub fn new(len: usize, alignment: usize) -> std::io::Result { if len == 0 { return Ok(Self { ptr: std::ptr::null_mut(), len: 0, + layout: Layout::new::(), }); } - let mut ptr: *mut libc::c_void = std::ptr::null_mut(); - let ret = unsafe { libc::posix_memalign(&mut ptr, alignment, len) }; - if ret != 0 { - return Err(std::io::Error::from_raw_os_error(ret)); + let layout = Layout::from_size_align(len, alignment) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?; + let ptr = unsafe { alloc(layout) }; + if ptr.is_null() { + return Err(std::io::Error::from(std::io::ErrorKind::OutOfMemory)); } - Ok(Self { - ptr: ptr as *mut u8, - len, - }) + Ok(Self { ptr, len, layout }) } pub fn len(&self) -> usize { @@ -63,7 +66,7 @@ impl DerefMut for AlignedBuffer { impl Drop for AlignedBuffer { fn drop(&mut self) { if !self.ptr.is_null() { - unsafe { libc::free(self.ptr as *mut libc::c_void) } + unsafe { dealloc(self.ptr, self.layout) } } } } diff --git a/src/io/async_reader.rs b/src/io/async_reader.rs index 7f085cb..800326a 100644 --- a/src/io/async_reader.rs +++ b/src/io/async_reader.rs @@ -1,4 +1,3 @@ -use std::os::unix::io::AsRawFd; use std::path::{Path, PathBuf}; use std::time::Instant; @@ -6,6 +5,7 @@ use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; use crate::io::aligned_buffer::AlignedBuffer; +use crate::io::compat::{self, NativeFd}; /// A request to read 
a region from disk. #[derive(Debug, Clone)] @@ -30,7 +30,7 @@ pub struct ReadResult { pub stats: ReadStats, } -/// Double-buffered async disk reader with F_NOCACHE. +/// Double-buffered async disk reader with cache bypass. /// /// Spawns a background task that reads into alternating aligned buffers. /// Consumer receives completed reads via an mpsc channel. @@ -42,7 +42,7 @@ pub struct NvmePrefetcher { } impl NvmePrefetcher { - /// Open a file for prefetching with F_NOCACHE. + /// Open a file for prefetching with OS cache bypass. /// `buffer_size` is the size of each double-buffer (e.g., 4 MiB). pub fn open(path: impl AsRef, buffer_size: usize) -> std::io::Result { let file_path = path.as_ref().to_path_buf(); @@ -52,21 +52,17 @@ impl NvmePrefetcher { let path_clone = file_path.clone(); let handle = tokio::task::spawn_blocking(move || { - let file = match std::fs::File::open(&path_clone) { + let fd: NativeFd = match compat::open_direct_fd(&path_clone) { Ok(f) => f, Err(e) => { - tracing::error!("NvmePrefetcher: failed to open {}: {e}", path_clone.display()); + tracing::error!( + "NvmePrefetcher: failed to open {}: {e}", + path_clone.display() + ); return; } }; - let fd = file.as_raw_fd(); - - // Bypass filesystem cache - unsafe { - libc::fcntl(fd, libc::F_NOCACHE, 1); - } - while let Some(req) = request_rx.blocking_recv() { let mut buf = match AlignedBuffer::new(buffer_size.max(req.length), 4096) { Ok(b) => b, @@ -81,14 +77,12 @@ impl NvmePrefetcher { let to_read = req.length; while total_read < to_read { - let n = unsafe { - libc::pread( - fd, - buf[total_read..].as_mut_ptr() as *mut libc::c_void, - to_read - total_read, - (req.offset + total_read as u64) as libc::off_t, - ) - }; + let n = compat::read_at_fd( + fd, + buf[total_read..].as_mut_ptr(), + to_read - total_read, + req.offset + total_read as u64, + ); if n <= 0 { break; } @@ -117,6 +111,8 @@ impl NvmePrefetcher { break; // Consumer dropped } } + + compat::close_fd(fd); }); Ok(Self { @@ -139,37 +135,3 @@ 
impl NvmePrefetcher { self.result_rx.recv().await } } - -#[cfg(test)] -mod tests { - use super::*; - use std::io::Write; - - #[tokio::test] - async fn test_prefetch_read() { - // Create a temp file with known data - let dir = tempfile::tempdir().unwrap(); - let path = dir.path().join("test_data.bin"); - { - let mut f = std::fs::File::create(&path).unwrap(); - let data = vec![0xA5u8; 1 << 20]; // 1 MiB - f.write_all(&data).unwrap(); - f.sync_all().unwrap(); - } - - let mut prefetcher = NvmePrefetcher::open(&path, 1 << 20).unwrap(); - - prefetcher - .submit(ReadRequest { - offset: 0, - length: 4096, - tag: "test".into(), - }) - .unwrap(); - - let result = prefetcher.recv().await.unwrap(); - assert_eq!(result.stats.bytes_read, 4096); - assert_eq!(result.data[0], 0xA5); - assert!(result.stats.throughput_mbps > 0.0); - } -} diff --git a/src/io/compat.rs b/src/io/compat.rs new file mode 100644 index 0000000..2660bbd --- /dev/null +++ b/src/io/compat.rs @@ -0,0 +1,225 @@ +/// Cross-platform I/O primitives for the NVMe streaming backend. +/// +/// Provides: +/// - `NativeFd` — file descriptor (Unix) or HANDLE (Windows) +/// - `open_direct_fd` — open with OS-cache bypass +/// - `close_fd` — close native handle +/// - `read_at_fd` — positional read (pread on Unix, ReadFile on Windows) +/// - `alloc_pages` / `free_pages` — anonymous page-backed memory + +// ───────────────────────────────────────────────────────────── +// Type alias +// ───────────────────────────────────────────────────────────── + +/// Unix: raw file descriptor (i32). +/// Windows: Win32 HANDLE stored as isize. +#[cfg(unix)] +pub type NativeFd = i32; + +#[cfg(windows)] +pub type NativeFd = isize; + +// ───────────────────────────────────────────────────────────── +// Unix implementation +// ───────────────────────────────────────────────────────────── + +#[cfg(unix)] +mod imp { + use super::NativeFd; + use std::os::unix::io::IntoRawFd; + + /// Open `path` for direct (cache-bypass) sequential reads. 
+ /// + /// - macOS: `F_NOCACHE` via `fcntl` + /// - Linux: `POSIX_FADV_DONTNEED` advisory + `POSIX_FADV_SEQUENTIAL` + pub fn open_direct_fd(path: &std::path::Path) -> std::io::Result { + let file = std::fs::File::open(path)?; + let fd = file.into_raw_fd(); + + #[cfg(target_os = "macos")] + unsafe { + // Disable the unified buffer cache for this fd. + libc::fcntl(fd, libc::F_NOCACHE, 1); + } + + #[cfg(target_os = "linux")] + unsafe { + // Ask the kernel to drop cached pages after reads (best-effort). + libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_DONTNEED); + libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_SEQUENTIAL); + } + + Ok(fd) + } + + pub fn close_fd(fd: NativeFd) { + unsafe { + libc::close(fd); + } + } + + /// Positional read — equivalent to `pread(2)`. + /// Returns bytes read (≥0) or -1 on error. + pub fn read_at_fd(fd: NativeFd, dst: *mut u8, size: usize, file_offset: u64) -> isize { + unsafe { + libc::pread( + fd, + dst as *mut libc::c_void, + size, + file_offset as libc::off_t, + ) + } + } + + /// Allocate `size` bytes of anonymous, lazily-committed memory. + /// Returns null on failure. + pub fn alloc_pages(size: usize) -> *mut u8 { + let p = unsafe { + libc::mmap( + std::ptr::null_mut(), + size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_ANON | libc::MAP_PRIVATE, + -1, + 0, + ) + }; + if p == libc::MAP_FAILED { + std::ptr::null_mut() + } else { + p as *mut u8 + } + } + + pub fn free_pages(ptr: *mut u8, size: usize) { + if !ptr.is_null() { + unsafe { + libc::munmap(ptr as *mut libc::c_void, size); + } + } + } +} + +// ───────────────────────────────────────────────────────────── +// Windows implementation +// ───────────────────────────────────────────────────────────── + +#[cfg(windows)] +mod imp { + use super::NativeFd; + use std::os::windows::io::IntoRawHandle; + + pub fn open_direct_fd(path: &std::path::Path) -> std::io::Result { + // Keep Windows path simple and robust across windows-sys versions. 
+ // We intentionally avoid CreateFileW flags here to prevent API + // signature/feature drift from breaking compilation. + let file = std::fs::File::open(path)?; + Ok(file.into_raw_handle() as NativeFd) + } + + pub fn close_fd(fd: NativeFd) { + unsafe { + windows_sys::Win32::Foundation::CloseHandle(fd as _); + } + } + + pub fn read_at_fd(fd: NativeFd, dst: *mut u8, size: usize, file_offset: u64) -> isize { + use windows_sys::Win32::Storage::FileSystem::ReadFile; + use windows_sys::Win32::System::IO::OVERLAPPED; + + let mut overlapped: OVERLAPPED = unsafe { std::mem::zeroed() }; + // OVERLAPPED.Anonymous.Anonymous.{Offset, OffsetHigh} + overlapped.Anonymous.Anonymous.Offset = file_offset as u32; + overlapped.Anonymous.Anonymous.OffsetHigh = (file_offset >> 32) as u32; + + let read_size = size.min(u32::MAX as usize) as u32; + let mut bytes_read: u32 = 0; + + let ok = unsafe { + ReadFile( + fd as _, + dst as *mut _, + read_size, + &mut bytes_read, + &mut overlapped as *mut _, + ) + }; + + if ok == 0 { + let err = unsafe { windows_sys::Win32::Foundation::GetLastError() }; + const ERROR_HANDLE_EOF: u32 = 38; + if err == ERROR_HANDLE_EOF { + return 0; + } + return -1; + } + bytes_read as isize + } + + pub fn alloc_pages(size: usize) -> *mut u8 { + use windows_sys::Win32::System::Memory::{ + MEM_COMMIT, MEM_RESERVE, PAGE_READWRITE, VirtualAlloc, + }; + unsafe { + VirtualAlloc( + std::ptr::null(), + size, + MEM_COMMIT | MEM_RESERVE, + PAGE_READWRITE, + ) as *mut u8 + } + } + + pub fn free_pages(ptr: *mut u8, _size: usize) { + use windows_sys::Win32::System::Memory::{MEM_RELEASE, VirtualFree}; + if !ptr.is_null() { + unsafe { + VirtualFree(ptr as *mut _, 0, MEM_RELEASE); + } + } + } +} + +// ───────────────────────────────────────────────────────────── +// Page advisory (hint to OS that pages can be recycled) +// ───────────────────────────────────────────────────────────── + +/// Advise the OS that the pages at `[ptr, ptr+size)` are no longer needed +/// and can be 
reclaimed. The virtual address range remains valid. +/// +/// - Unix: `madvise(MADV_FREE)` (macOS) / `madvise(MADV_DONTNEED)` (Linux) +/// - Windows: `VirtualFree(MEM_DECOMMIT)` — decommits physical pages but +/// keeps the virtual reservation alive. +#[cfg(unix)] +pub fn advise_free_pages(ptr: *mut u8, size: usize) { + if ptr.is_null() || size == 0 { + return; + } + unsafe { + #[cfg(target_os = "macos")] + libc::madvise(ptr as *mut libc::c_void, size, libc::MADV_FREE); + + #[cfg(target_os = "linux")] + libc::madvise(ptr as *mut libc::c_void, size, libc::MADV_DONTNEED); + } +} + +#[cfg(windows)] +pub fn advise_free_pages(ptr: *mut u8, size: usize) { + if ptr.is_null() || size == 0 { + return; + } + unsafe { + // Decommit physical backing pages while keeping the virtual range reserved. + windows_sys::Win32::System::Memory::VirtualFree( + ptr as *mut _, + size, + windows_sys::Win32::System::Memory::MEM_DECOMMIT, + ); + } +} + +// ───────────────────────────────────────────────────────────── +// Re-export +// ───────────────────────────────────────────────────────────── +pub use imp::{alloc_pages, close_fd, free_pages, open_direct_fd, read_at_fd}; diff --git a/src/io/mod.rs b/src/io/mod.rs index 433ff8c..da880a2 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -1,4 +1,5 @@ pub mod aligned_buffer; pub mod async_reader; +pub mod compat; pub mod expert_layout; pub mod wear_monitor; diff --git a/src/profiler/cpu.rs b/src/profiler/cpu.rs index 18c1e9c..bacb53e 100644 --- a/src/profiler/cpu.rs +++ b/src/profiler/cpu.rs @@ -1,18 +1,13 @@ -use std::ffi::CStr; - use crate::profiler::types::CpuProfile; pub fn profile_cpu() -> anyhow::Result { - let model_name = sysctl_string("machdep.cpu.brand_string") - .unwrap_or_else(|_| "Unknown".to_string()); - - let total_cores = sysctl_u32("hw.ncpu").unwrap_or(1); - let cores_performance = sysctl_u32("hw.perflevel0.physicalcpu").unwrap_or(total_cores); - let cores_efficiency = sysctl_u32("hw.perflevel1.physicalcpu").unwrap_or(0); + let 
model_name = get_cpu_model_name(); + let (cores_performance, cores_efficiency) = get_core_counts(); - let is_apple_silicon = cfg!(target_arch = "aarch64") && model_name.contains("Apple"); + let is_apple_silicon = cfg!(all(target_os = "macos", target_arch = "aarch64")) + && model_name.contains("Apple"); - let int8_gflops = estimate_int8_gflops(&model_name); + let int8_gflops = estimate_int8_gflops(&model_name, cores_performance); Ok(CpuProfile { model_name, @@ -20,15 +15,98 @@ pub fn profile_cpu() -> anyhow::Result { cores_efficiency, has_amx: is_apple_silicon, has_neon: cfg!(target_arch = "aarch64"), - has_avx512: false, // Not on Apple Silicon - has_avx2: false, // Not on Apple Silicon + has_avx512: detect_avx512(), + has_avx2: detect_avx2(), int8_gflops, }) } -fn estimate_int8_gflops(model_name: &str) -> f64 { - // Ordered specific-first so "M4 Max" matches before "M4" - let specs: &[(&str, f64)] = &[ +// ── CPU model name ──────────────────────────────────────────────────────────── + +#[cfg(target_os = "macos")] +fn get_cpu_model_name() -> String { + sysctl_string("machdep.cpu.brand_string").unwrap_or_else(|_| "Unknown".to_string()) +} + +#[cfg(target_os = "linux")] +fn get_cpu_model_name() -> String { + // /proc/cpuinfo is authoritative on Linux / WSL2 + if let Ok(info) = std::fs::read_to_string("/proc/cpuinfo") { + for line in info.lines() { + if line.starts_with("model name") { + if let Some(name) = line.splitn(2, ':').nth(1) { + return name.trim().to_string(); + } + } + } + } + // Fallback: sysinfo brand string + let mut sys = sysinfo::System::new(); + sys.refresh_cpu_all(); + sys.cpus() + .first() + .map(|c| c.brand().to_string()) + .unwrap_or_else(|| "Unknown".to_string()) +} + +#[cfg(target_os = "windows")] +fn get_cpu_model_name() -> String { + let mut sys = sysinfo::System::new(); + sys.refresh_cpu_all(); + sys.cpus() + .first() + .map(|c| c.brand().to_string()) + .unwrap_or_else(|| "Unknown".to_string()) +} + +// ── Core counts 
─────────────────────────────────────────────────────────────── + +#[cfg(target_os = "macos")] +fn get_core_counts() -> (u32, u32) { + let total = sysctl_u32("hw.ncpu").unwrap_or(1); + let perf = sysctl_u32("hw.perflevel0.physicalcpu").unwrap_or(total); + let eff = sysctl_u32("hw.perflevel1.physicalcpu").unwrap_or(0); + (perf, eff) +} + +#[cfg(not(target_os = "macos"))] +fn get_core_counts() -> (u32, u32) { + let mut sys = sysinfo::System::new(); + sys.refresh_cpu_all(); + let physical = sys.physical_core_count().unwrap_or( + std::thread::available_parallelism() + .map(|n| n.get() / 2) + .unwrap_or(2), + ) as u32; + // Windows/Linux don't expose P/E core split in a portable way + (physical, 0) +} + +// ── ISA extension detection ─────────────────────────────────────────────────── + +#[cfg(target_arch = "x86_64")] +fn detect_avx2() -> bool { + std::is_x86_feature_detected!("avx2") +} +#[cfg(not(target_arch = "x86_64"))] +fn detect_avx2() -> bool { + false +} + +#[cfg(target_arch = "x86_64")] +fn detect_avx512() -> bool { + std::is_x86_feature_detected!("avx512f") +} +#[cfg(not(target_arch = "x86_64"))] +fn detect_avx512() -> bool { + false +} + +// ── INT8 GFLOPS estimate ────────────────────────────────────────────────────── + +fn estimate_int8_gflops(model_name: &str, physical_cores: u32) -> f64 { + // Apple Silicon lookup (ordered specific-first so "M2 Max" > "M2") + let apple_specs: &[(&str, f64)] = &[ ("M5 Ultra", 44.0), ("M5 Max", 22.0), ("M5 Pro", 11.0), @@ -50,22 +128,35 @@ fn estimate_int8_gflops(model_name: &str) -> f64 { ("M1 Pro", 4.0), ("M1", 2.0), ]; - - for (pattern, gflops) in specs { + for (pattern, gflops) in apple_specs { if model_name.contains(pattern) { return *gflops; } } - tracing::warn!("Unknown CPU model '{model_name}', using conservative INT8 GFLOPS estimate"); - 2.0 + // x86: estimate from ISA width and core count + // AVX-512 VNNI: ~16 INT8 ops/cycle/core; AVX2 VPDPBUSD: ~8; baseline: ~4 + let cores = physical_cores.max(1) as f64; + if 
detect_avx512() { + // e.g., Intel Cascade Lake / Ice Lake: ~16 GFLOPS INT8/core @ 4 GHz + cores * 16.0 + } else if detect_avx2() { + // Ryzen 5000, Intel 10th gen+: ~8 GFLOPS INT8/core @ 4 GHz + cores * 8.0 + } else { + // Older or unknown CPU + cores * 4.0 + } } +// ── macOS sysctl helpers (compiled only on macOS) ───────────────────────────── + +#[cfg(target_os = "macos")] pub(crate) fn sysctl_string(name: &str) -> anyhow::Result { + use std::ffi::CStr; let c_name = std::ffi::CString::new(name)?; let mut size: libc::size_t = 0; - // First call to get size let ret = unsafe { libc::sysctlbyname(c_name.as_ptr(), std::ptr::null_mut(), &mut size, std::ptr::null_mut(), 0) }; @@ -88,6 +179,7 @@ pub(crate) fn sysctl_string(name: &str) -> anyhow::Result { Ok(cstr.to_string_lossy().to_string()) } +#[cfg(target_os = "macos")] pub(crate) fn sysctl_u32(name: &str) -> anyhow::Result { let c_name = std::ffi::CString::new(name)?; let mut value: u32 = 0; @@ -106,6 +198,19 @@ pub(crate) fn sysctl_u32(name: &str) -> anyhow::Result { Ok(value) } +/// Stub sysctl functions for non-macOS — callers already have `unwrap_or` fallbacks. 
+#[cfg(not(target_os = "macos"))] +pub(crate) fn sysctl_string(_name: &str) -> anyhow::Result { + anyhow::bail!("sysctl is not available on this platform") +} + +#[cfg(not(target_os = "macos"))] +pub(crate) fn sysctl_u32(_name: &str) -> anyhow::Result { + anyhow::bail!("sysctl is not available on this platform") +} + +// ── Tests ───────────────────────────────────────────────────────────────────── + #[cfg(test)] mod tests { use super::*; @@ -121,10 +226,20 @@ mod tests { } #[test] - fn test_lookup_ordering() { - assert_eq!(estimate_int8_gflops("Apple M2 Max"), 12.0); - assert_eq!(estimate_int8_gflops("Apple M2 Pro"), 6.0); - assert_eq!(estimate_int8_gflops("Apple M2"), 3.0); - assert_eq!(estimate_int8_gflops("Apple M1 Max"), 10.0); + fn test_apple_silicon_lookup() { + // Only meaningful on macOS; on other platforms model_name won't match + assert_eq!(estimate_int8_gflops("Apple M2 Max", 12), 12.0); + assert_eq!(estimate_int8_gflops("Apple M1", 8), 2.0); + } + + #[test] + fn test_avx_detection_x86() { + #[cfg(target_arch = "x86_64")] + { + // AVX2 is present on virtually all CPUs from 2013+, but we can't + // assert true here — just make sure it doesn't panic. 
+ let _ = detect_avx2(); + let _ = detect_avx512(); + } } } diff --git a/src/profiler/gpu.rs b/src/profiler/gpu.rs index 78c1e5d..26765de 100644 --- a/src/profiler/gpu.rs +++ b/src/profiler/gpu.rs @@ -2,37 +2,18 @@ use std::ffi::CStr; use crate::profiler::types::{GpuBackend, GpuProfile}; -struct AppleSiliconSpec { - pattern: &'static str, - bandwidth_gb_s: f64, - fp16_tflops: f64, +pub fn profile_gpu() -> anyhow::Result> { + #[cfg(target_os = "macos")] + return profile_gpu_metal(); + + #[cfg(not(target_os = "macos"))] + return profile_gpu_cuda_or_cpu(); } -// Ordered specific-first: "M2 Max" before "M2" -const APPLE_SILICON_SPECS: &[AppleSiliconSpec] = &[ - AppleSiliconSpec { pattern: "M5 Ultra", bandwidth_gb_s: 900.0, fp16_tflops: 40.0 }, - AppleSiliconSpec { pattern: "M5 Max", bandwidth_gb_s: 600.0, fp16_tflops: 20.0 }, - AppleSiliconSpec { pattern: "M5 Pro", bandwidth_gb_s: 300.0, fp16_tflops: 10.0 }, - AppleSiliconSpec { pattern: "M5", bandwidth_gb_s: 120.0, fp16_tflops: 4.5 }, - AppleSiliconSpec { pattern: "M4 Ultra", bandwidth_gb_s: 819.0, fp16_tflops: 36.0 }, - AppleSiliconSpec { pattern: "M4 Max", bandwidth_gb_s: 546.0, fp16_tflops: 18.0 }, - AppleSiliconSpec { pattern: "M4 Pro", bandwidth_gb_s: 273.0, fp16_tflops: 9.0 }, - AppleSiliconSpec { pattern: "M4", bandwidth_gb_s: 120.0, fp16_tflops: 4.0 }, - AppleSiliconSpec { pattern: "M3 Ultra", bandwidth_gb_s: 800.0, fp16_tflops: 28.0 }, - AppleSiliconSpec { pattern: "M3 Max", bandwidth_gb_s: 400.0, fp16_tflops: 14.0 }, - AppleSiliconSpec { pattern: "M3 Pro", bandwidth_gb_s: 150.0, fp16_tflops: 7.0 }, - AppleSiliconSpec { pattern: "M3", bandwidth_gb_s: 100.0, fp16_tflops: 3.5 }, - AppleSiliconSpec { pattern: "M2 Ultra", bandwidth_gb_s: 800.0, fp16_tflops: 27.2 }, - AppleSiliconSpec { pattern: "M2 Max", bandwidth_gb_s: 400.0, fp16_tflops: 13.6 }, - AppleSiliconSpec { pattern: "M2 Pro", bandwidth_gb_s: 200.0, fp16_tflops: 6.8 }, - AppleSiliconSpec { pattern: "M2", bandwidth_gb_s: 100.0, fp16_tflops: 3.6 }, - 
AppleSiliconSpec { pattern: "M1 Ultra", bandwidth_gb_s: 800.0, fp16_tflops: 20.8 }, - AppleSiliconSpec { pattern: "M1 Max", bandwidth_gb_s: 400.0, fp16_tflops: 10.4 }, - AppleSiliconSpec { pattern: "M1 Pro", bandwidth_gb_s: 200.0, fp16_tflops: 5.2 }, - AppleSiliconSpec { pattern: "M1", bandwidth_gb_s: 68.25, fp16_tflops: 2.6 }, -]; +// ── macOS / Metal ───────────────────────────────────────────────────────────── -pub fn profile_gpu() -> anyhow::Result> { +#[cfg(target_os = "macos")] +fn profile_gpu_metal() -> anyhow::Result> { let (name, vram_bytes) = match query_metal_device() { Ok(v) => v, Err(e) => { @@ -42,12 +23,9 @@ pub fn profile_gpu() -> anyhow::Result> { }; let (bandwidth, tflops) = match lookup_apple_silicon(&name) { - Some(spec) => ( - (spec.bandwidth_gb_s * 1e9) as u64, - spec.fp16_tflops, - ), + Some(spec) => ((spec.bandwidth_gb_s * 1e9) as u64, spec.fp16_tflops), None => { - tracing::warn!("Unknown GPU '{name}', using conservative estimates"); + tracing::warn!("Unknown Apple GPU '{name}', using conservative estimates"); (68_250_000_000u64, 2.6) } }; @@ -61,21 +39,17 @@ pub fn profile_gpu() -> anyhow::Result> { })) } +#[cfg(target_os = "macos")] fn query_metal_device() -> anyhow::Result<(String, u64)> { - unsafe { - hypura_sys::llama_backend_init(); - } + unsafe { hypura_sys::llama_backend_init() }; let result = (|| -> anyhow::Result<(String, u64)> { - // The Metal backend registers as "MTL", not "Metal" let reg_count = unsafe { hypura_sys::ggml_backend_reg_count() }; let mut reg = std::ptr::null_mut(); for i in 0..reg_count { let r = unsafe { hypura_sys::ggml_backend_reg_get(i) }; - if r.is_null() { - continue; - } + if r.is_null() { continue; } let name_ptr = unsafe { hypura_sys::ggml_backend_reg_name(r) }; if !name_ptr.is_null() { let name = unsafe { CStr::from_ptr(name_ptr) }.to_string_lossy(); @@ -93,7 +67,6 @@ fn query_metal_device() -> anyhow::Result<(String, u64)> { let device = unsafe { hypura_sys::ggml_backend_reg_dev_get(reg, 0) }; 
anyhow::ensure!(!device.is_null(), "Metal device is null"); - // Get device description (GPU name) let desc_ptr = unsafe { hypura_sys::ggml_backend_dev_description(device) }; let name = if desc_ptr.is_null() { "Unknown Metal GPU".to_string() @@ -101,58 +74,238 @@ fn query_metal_device() -> anyhow::Result<(String, u64)> { unsafe { CStr::from_ptr(desc_ptr) }.to_string_lossy().to_string() }; - // Get device memory let mut free: usize = 0; let mut total: usize = 0; - unsafe { - hypura_sys::ggml_backend_dev_memory(device, &mut free, &mut total); - } + unsafe { hypura_sys::ggml_backend_dev_memory(device, &mut free, &mut total) }; Ok((name, total as u64)) })(); - unsafe { - hypura_sys::llama_backend_free(); + unsafe { hypura_sys::llama_backend_free() }; + result +} + +// ── Non-macOS: CUDA / CPU-only ──────────────────────────────────────────────── + +#[cfg(not(target_os = "macos"))] +fn profile_gpu_cuda_or_cpu() -> anyhow::Result> { + // Query via the ggml CUDA backend if available + let gpu = query_cuda_device(); + if let Some((name, vram_bytes)) = gpu { + let (bandwidth, tflops) = lookup_nvidia_gpu(&name) + .unwrap_or_else(|| estimate_nvidia_gpu(vram_bytes)); + return Ok(Some(GpuProfile { + name, + vram_bytes, + bandwidth_bytes_per_sec: bandwidth, + fp16_tflops: tflops, + backend: GpuBackend::Cuda, + })); } + tracing::debug!("No CUDA GPU detected; running CPU-only"); + Ok(None) +} + +#[cfg(not(target_os = "macos"))] +fn query_cuda_device() -> Option<(String, u64)> { + unsafe { hypura_sys::llama_backend_init() }; + + let result = (|| -> Option<(String, u64)> { + let reg_count = unsafe { hypura_sys::ggml_backend_reg_count() }; + let mut cuda_reg = std::ptr::null_mut(); + for i in 0..reg_count { + let r = unsafe { hypura_sys::ggml_backend_reg_get(i) }; + if r.is_null() { continue; } + let name_ptr = unsafe { hypura_sys::ggml_backend_reg_name(r) }; + if !name_ptr.is_null() { + let name = unsafe { CStr::from_ptr(name_ptr) }.to_string_lossy(); + if name.contains("CUDA") || 
name.contains("NVIDIA") { + cuda_reg = r; + break; + } + } + } + if cuda_reg.is_null() { + return None; + } + + let dev_count = unsafe { hypura_sys::ggml_backend_reg_dev_count(cuda_reg) }; + if dev_count == 0 { + return None; + } + + let device = unsafe { hypura_sys::ggml_backend_reg_dev_get(cuda_reg, 0) }; + if device.is_null() { + return None; + } + + let desc_ptr = unsafe { hypura_sys::ggml_backend_dev_description(device) }; + let name = if desc_ptr.is_null() { + "Unknown NVIDIA GPU".to_string() + } else { + unsafe { CStr::from_ptr(desc_ptr) }.to_string_lossy().to_string() + }; + + let mut free: usize = 0; + let mut total: usize = 0; + unsafe { hypura_sys::ggml_backend_dev_memory(device, &mut free, &mut total) }; + + Some((name, total as u64)) + })(); + + unsafe { hypura_sys::llama_backend_free() }; result } +// ── NVIDIA GPU spec database ────────────────────────────────────────────────── +// (bandwidth_gb_s, fp16_tflops) — ordered most-specific first to avoid +// "RTX 3060" matching before "RTX 3060 Ti". 
+ +struct NvidiaSpec { + pattern: &'static str, + bandwidth_gb_s: f64, + fp16_tflops: f64, +} + +const NVIDIA_SPECS: &[NvidiaSpec] = &[ + // ── Blackwell (RTX 50xx) ──────────────────────────────────────────────── + NvidiaSpec { pattern: "RTX 5090", bandwidth_gb_s: 1792.0, fp16_tflops: 838.0 }, + NvidiaSpec { pattern: "RTX 5080", bandwidth_gb_s: 960.0, fp16_tflops: 464.0 }, + NvidiaSpec { pattern: "RTX 5070 Ti", bandwidth_gb_s: 896.0, fp16_tflops: 228.0 }, + NvidiaSpec { pattern: "RTX 5070", bandwidth_gb_s: 672.0, fp16_tflops: 176.0 }, + NvidiaSpec { pattern: "RTX 5060 Ti", bandwidth_gb_s: 576.0, fp16_tflops: 129.0 }, + NvidiaSpec { pattern: "RTX 5060", bandwidth_gb_s: 448.0, fp16_tflops: 92.0 }, + // ── Ada Lovelace (RTX 40xx) ───────────────────────────────────────────── + NvidiaSpec { pattern: "RTX 4090", bandwidth_gb_s: 1008.0, fp16_tflops: 165.2 }, + NvidiaSpec { pattern: "RTX 4080 Super",bandwidth_gb_s: 736.0, fp16_tflops: 103.9 }, + NvidiaSpec { pattern: "RTX 4080", bandwidth_gb_s: 717.0, fp16_tflops: 97.5 }, + NvidiaSpec { pattern: "RTX 4070 Ti Super", bandwidth_gb_s: 672.0, fp16_tflops: 88.9 }, + NvidiaSpec { pattern: "RTX 4070 Ti", bandwidth_gb_s: 504.0, fp16_tflops: 80.8 }, + NvidiaSpec { pattern: "RTX 4070 Super",bandwidth_gb_s: 504.0, fp16_tflops: 71.2 }, + NvidiaSpec { pattern: "RTX 4070", bandwidth_gb_s: 504.0, fp16_tflops: 58.0 }, + NvidiaSpec { pattern: "RTX 4060 Ti", bandwidth_gb_s: 288.0, fp16_tflops: 45.2 }, + NvidiaSpec { pattern: "RTX 4060", bandwidth_gb_s: 272.0, fp16_tflops: 30.1 }, + NvidiaSpec { pattern: "RTX 4050", bandwidth_gb_s: 192.0, fp16_tflops: 24.2 }, + // ── Ampere (RTX 30xx) ─────────────────────────────────────────────────── + NvidiaSpec { pattern: "RTX 3090 Ti", bandwidth_gb_s: 1008.0, fp16_tflops: 80.0 }, + NvidiaSpec { pattern: "RTX 3090", bandwidth_gb_s: 936.0, fp16_tflops: 71.0 }, + NvidiaSpec { pattern: "RTX 3080 Ti", bandwidth_gb_s: 912.0, fp16_tflops: 65.0 }, + NvidiaSpec { pattern: "RTX 3080 12GB", bandwidth_gb_s: 
912.0, fp16_tflops: 60.0 }, + NvidiaSpec { pattern: "RTX 3080", bandwidth_gb_s: 760.0, fp16_tflops: 59.0 }, + NvidiaSpec { pattern: "RTX 3070 Ti", bandwidth_gb_s: 608.0, fp16_tflops: 43.5 }, + NvidiaSpec { pattern: "RTX 3070", bandwidth_gb_s: 448.0, fp16_tflops: 32.0 }, + NvidiaSpec { pattern: "RTX 3060 Ti", bandwidth_gb_s: 448.0, fp16_tflops: 29.4 }, + NvidiaSpec { pattern: "RTX 3060 12GB", bandwidth_gb_s: 360.0, fp16_tflops: 25.4 }, + NvidiaSpec { pattern: "RTX 3060", bandwidth_gb_s: 360.0, fp16_tflops: 25.4 }, // base target + NvidiaSpec { pattern: "RTX 3050", bandwidth_gb_s: 224.0, fp16_tflops: 16.0 }, + // ── Turing (RTX 20xx) ─────────────────────────────────────────────────── + NvidiaSpec { pattern: "RTX 2080 Ti", bandwidth_gb_s: 616.0, fp16_tflops: 53.8 }, + NvidiaSpec { pattern: "RTX 2080 Super",bandwidth_gb_s: 496.0, fp16_tflops: 43.6 }, + NvidiaSpec { pattern: "RTX 2080", bandwidth_gb_s: 448.0, fp16_tflops: 40.5 }, + NvidiaSpec { pattern: "RTX 2070 Super",bandwidth_gb_s: 448.0, fp16_tflops: 36.9 }, + NvidiaSpec { pattern: "RTX 2070", bandwidth_gb_s: 448.0, fp16_tflops: 28.9 }, + NvidiaSpec { pattern: "RTX 2060 Super",bandwidth_gb_s: 448.0, fp16_tflops: 26.6 }, + NvidiaSpec { pattern: "RTX 2060", bandwidth_gb_s: 336.0, fp16_tflops: 21.2 }, + // ── Data centre / professional ────────────────────────────────────────── + NvidiaSpec { pattern: "H200", bandwidth_gb_s: 4800.0, fp16_tflops: 1979.0 }, + NvidiaSpec { pattern: "H100 SXM", bandwidth_gb_s: 3350.0, fp16_tflops: 1979.0 }, + NvidiaSpec { pattern: "H100", bandwidth_gb_s: 2000.0, fp16_tflops: 1979.0 }, + NvidiaSpec { pattern: "A100 SXM 80GB", bandwidth_gb_s: 2000.0, fp16_tflops: 312.0 }, + NvidiaSpec { pattern: "A100 80GB", bandwidth_gb_s: 1935.0, fp16_tflops: 312.0 }, + NvidiaSpec { pattern: "A100", bandwidth_gb_s: 1555.0, fp16_tflops: 312.0 }, + NvidiaSpec { pattern: "L40S", bandwidth_gb_s: 864.0, fp16_tflops: 366.0 }, + NvidiaSpec { pattern: "L40", bandwidth_gb_s: 864.0, fp16_tflops: 181.0 }, + 
NvidiaSpec { pattern: "A40", bandwidth_gb_s: 696.0, fp16_tflops: 149.7 }, +]; + +fn lookup_nvidia_gpu(name: &str) -> Option<(u64, f64)> { + NVIDIA_SPECS.iter().find(|s| name.contains(s.pattern)).map(|s| { + ((s.bandwidth_gb_s * 1e9) as u64, s.fp16_tflops) + }) +} + +/// Conservative estimate when the GPU model isn't in our database. +fn estimate_nvidia_gpu(vram_bytes: u64) -> (u64, f64) { + // Very rough: ~1 TB/s bandwidth per 24 GB VRAM, ~100 TFLOPS FP16 per 24 GB + let gb = vram_bytes as f64 / 1e9; + let bw = (gb / 24.0 * 1_000_000_000_000.0) as u64; + let tflops = gb / 24.0 * 100.0; + (bw.max(200_000_000_000), tflops.max(5.0)) +} + +// ── Apple Silicon spec database ─────────────────────────────────────────────── + +#[cfg(target_os = "macos")] +struct AppleSiliconSpec { + pattern: &'static str, + bandwidth_gb_s: f64, + fp16_tflops: f64, +} + +#[cfg(target_os = "macos")] +const APPLE_SILICON_SPECS: &[AppleSiliconSpec] = &[ + AppleSiliconSpec { pattern: "M5 Ultra", bandwidth_gb_s: 900.0, fp16_tflops: 40.0 }, + AppleSiliconSpec { pattern: "M5 Max", bandwidth_gb_s: 600.0, fp16_tflops: 20.0 }, + AppleSiliconSpec { pattern: "M5 Pro", bandwidth_gb_s: 300.0, fp16_tflops: 10.0 }, + AppleSiliconSpec { pattern: "M5", bandwidth_gb_s: 120.0, fp16_tflops: 4.5 }, + AppleSiliconSpec { pattern: "M4 Ultra", bandwidth_gb_s: 819.0, fp16_tflops: 36.0 }, + AppleSiliconSpec { pattern: "M4 Max", bandwidth_gb_s: 546.0, fp16_tflops: 18.0 }, + AppleSiliconSpec { pattern: "M4 Pro", bandwidth_gb_s: 273.0, fp16_tflops: 9.0 }, + AppleSiliconSpec { pattern: "M4", bandwidth_gb_s: 120.0, fp16_tflops: 4.0 }, + AppleSiliconSpec { pattern: "M3 Ultra", bandwidth_gb_s: 800.0, fp16_tflops: 28.0 }, + AppleSiliconSpec { pattern: "M3 Max", bandwidth_gb_s: 400.0, fp16_tflops: 14.0 }, + AppleSiliconSpec { pattern: "M3 Pro", bandwidth_gb_s: 150.0, fp16_tflops: 7.0 }, + AppleSiliconSpec { pattern: "M3", bandwidth_gb_s: 100.0, fp16_tflops: 3.5 }, + AppleSiliconSpec { pattern: "M2 Ultra", bandwidth_gb_s: 
800.0, fp16_tflops: 27.2 }, + AppleSiliconSpec { pattern: "M2 Max", bandwidth_gb_s: 400.0, fp16_tflops: 13.6 }, + AppleSiliconSpec { pattern: "M2 Pro", bandwidth_gb_s: 200.0, fp16_tflops: 6.8 }, + AppleSiliconSpec { pattern: "M2", bandwidth_gb_s: 100.0, fp16_tflops: 3.6 }, + AppleSiliconSpec { pattern: "M1 Ultra", bandwidth_gb_s: 800.0, fp16_tflops: 20.8 }, + AppleSiliconSpec { pattern: "M1 Max", bandwidth_gb_s: 400.0, fp16_tflops: 10.4 }, + AppleSiliconSpec { pattern: "M1 Pro", bandwidth_gb_s: 200.0, fp16_tflops: 5.2 }, + AppleSiliconSpec { pattern: "M1", bandwidth_gb_s: 68.25, fp16_tflops: 2.6 }, +]; + +#[cfg(target_os = "macos")] fn lookup_apple_silicon(name: &str) -> Option<&'static AppleSiliconSpec> { - APPLE_SILICON_SPECS.iter().find(|spec| name.contains(spec.pattern)) + APPLE_SILICON_SPECS.iter().find(|s| name.contains(s.pattern)) } +// ── Tests ───────────────────────────────────────────────────────────────────── + #[cfg(test)] mod tests { use super::*; #[test] - fn test_lookup_specificity() { - let spec = lookup_apple_silicon("Apple M2 Max").unwrap(); - assert_eq!(spec.pattern, "M2 Max"); - assert!((spec.fp16_tflops - 13.6).abs() < 0.01); - - let spec = lookup_apple_silicon("Apple M2").unwrap(); - assert_eq!(spec.pattern, "M2"); + fn test_lookup_rtx3060() { + let spec = lookup_nvidia_gpu("NVIDIA GeForce RTX 3060"); + assert!(spec.is_some()); + let (bw, tflops) = spec.unwrap(); + assert!(bw > 300_000_000_000); // > 300 GB/s + assert!(tflops > 20.0); + } - let spec = lookup_apple_silicon("Apple M1 Pro").unwrap(); - assert_eq!(spec.pattern, "M1 Pro"); + #[test] + fn test_lookup_rtx4090() { + let (bw, tflops) = lookup_nvidia_gpu("NVIDIA GeForce RTX 4090").unwrap(); + assert!(tflops > 100.0); + assert!(bw > 900_000_000_000); } #[test] fn test_lookup_unknown() { - assert!(lookup_apple_silicon("NVIDIA RTX 4090").is_none()); + // Unknown GPU falls back to estimate + let (bw, tflops) = estimate_nvidia_gpu(12 * 1024 * 1024 * 1024); + assert!(bw > 0); + 
assert!(tflops > 0.0); } + #[cfg(target_os = "macos")] #[test] - fn test_profile_gpu_returns_some() { - let gpu = profile_gpu().unwrap(); - // On Apple Silicon, we should always get a GPU - if cfg!(target_arch = "aarch64") { - assert!(gpu.is_some()); - let gpu = gpu.unwrap(); - assert!(!gpu.name.is_empty()); - assert!(gpu.vram_bytes > 0); - } + fn test_lookup_apple() { + let spec = lookup_apple_silicon("Apple M2 Max").unwrap(); + assert!((spec.fp16_tflops - 13.6).abs() < 0.01); } } diff --git a/src/profiler/mod.rs b/src/profiler/mod.rs index 719ffe2..6feab01 100644 --- a/src/profiler/mod.rs +++ b/src/profiler/mod.rs @@ -4,8 +4,6 @@ pub mod memory; pub mod storage; pub mod types; -use std::path::PathBuf; - use chrono::Utc; use crate::profiler::types::{HardwareProfile, SystemInfo}; @@ -13,7 +11,7 @@ use crate::profiler::types::{HardwareProfile, SystemInfo}; /// Run the full hardware profiling suite. pub fn run_full_profile() -> anyhow::Result { tracing::info!("Profiling CPU..."); - let cpu = cpu::profile_cpu()?; + let cpu_profile = cpu::profile_cpu()?; tracing::info!("Profiling memory..."); let memory_profile = memory::profile_memory()?; @@ -24,14 +22,11 @@ pub fn run_full_profile() -> anyhow::Result { tracing::info!("Profiling storage..."); let storage = storage::profile_storage()?; - let total_cores = cpu::sysctl_u32("hw.ncpu").unwrap_or(1); - let machine_model = cpu::sysctl_string("hw.model").unwrap_or_else(|_| "Unknown".into()); - let system = SystemInfo { os: format!("{} {}", std::env::consts::OS, os_version()), arch: std::env::consts::ARCH.to_string(), - machine_model, - total_cores, + machine_model: machine_model(), + total_cores: total_cpu_count(), }; Ok(HardwareProfile { @@ -40,21 +35,24 @@ pub fn run_full_profile() -> anyhow::Result { memory: memory_profile, gpu, storage, - cpu, + cpu: cpu_profile, }) } -/// Returns the path to `~/.hypura/`, creating it if necessary. 
-pub fn profile_dir() -> anyhow::Result { - let dir = dirs_path(); +/// Returns the path to the Hypura data directory, creating it if necessary. +/// +/// - Windows: `%APPDATA%\Hypura` +/// - macOS / Linux: `~/.hypura` +pub fn profile_dir() -> anyhow::Result { + let dir = data_dir(); if !dir.exists() { std::fs::create_dir_all(&dir)?; } Ok(dir) } -/// Save a hardware profile to `~/.hypura/hardware_profile.json`. -pub fn save_profile(profile: &HardwareProfile) -> anyhow::Result { +/// Save a hardware profile to `/hardware_profile.json`. +pub fn save_profile(profile: &HardwareProfile) -> anyhow::Result { let dir = profile_dir()?; let path = dir.join("hardware_profile.json"); let json = serde_json::to_string_pretty(profile)?; @@ -64,7 +62,7 @@ pub fn save_profile(profile: &HardwareProfile) -> anyhow::Result { /// Load a cached hardware profile, if one exists. pub fn load_cached_profile() -> anyhow::Result> { - let path = dirs_path().join("hardware_profile.json"); + let path = data_dir().join("hardware_profile.json"); if !path.exists() { return Ok(None); } @@ -79,11 +77,83 @@ pub fn is_profile_stale(profile: &HardwareProfile) -> bool { age.num_days() > 30 } -fn dirs_path() -> PathBuf { - let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".into()); - PathBuf::from(home).join(".hypura") +// ── Helpers ─────────────────────────────────────────────────────────────────── + +/// Platform-specific data directory. +fn data_dir() -> std::path::PathBuf { + #[cfg(target_os = "windows")] + { + let appdata = std::env::var("APPDATA").unwrap_or_else(|_| "C:\\Users\\Default\\AppData\\Roaming".into()); + std::path::PathBuf::from(appdata).join("Hypura") + } + #[cfg(not(target_os = "windows"))] + { + let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".into()); + std::path::PathBuf::from(home).join(".hypura") + } +} + +/// Total logical CPU count. 
+fn total_cpu_count() -> u32 { + #[cfg(target_os = "macos")] + { + cpu::sysctl_u32("hw.ncpu").unwrap_or(1) + } + #[cfg(not(target_os = "macos"))] + { + let mut sys = sysinfo::System::new(); + sys.refresh_cpu_all(); + sys.cpus().len() as u32 + } } +/// Human-readable machine / system model string. +fn machine_model() -> String { + #[cfg(target_os = "macos")] + { + cpu::sysctl_string("hw.model").unwrap_or_else(|_| "Unknown".into()) + } + #[cfg(target_os = "linux")] + { + // WSL2 exposes DMI product name in /sys + if let Ok(model) = std::fs::read_to_string("/sys/class/dmi/id/product_name") { + let trimmed = model.trim().to_string(); + if !trimmed.is_empty() && trimmed != "None" { + return trimmed; + } + } + "Unknown Linux machine".to_string() + } + #[cfg(target_os = "windows")] + { + // Could query WMI here; for now return a static string + "Windows PC".to_string() + } +} + +/// OS version string. fn os_version() -> String { - cpu::sysctl_string("kern.osproductversion").unwrap_or_else(|_| "unknown".into()) + #[cfg(target_os = "macos")] + { + cpu::sysctl_string("kern.osproductversion").unwrap_or_else(|_| "unknown".into()) + } + #[cfg(target_os = "linux")] + { + // /etc/os-release is the standard on modern Linux distros + if let Ok(content) = std::fs::read_to_string("/etc/os-release") { + for line in content.lines() { + if let Some(val) = line.strip_prefix("PRETTY_NAME=") { + return val.trim_matches('"').to_string(); + } + } + } + // Fallback: kernel version + let mut uname = sysinfo::System::kernel_version() + .unwrap_or_else(|| "unknown".to_string()); + uname + } + #[cfg(target_os = "windows")] + { + sysinfo::System::os_version().unwrap_or_else(|| "unknown".to_string()) + } } diff --git a/src/profiler/storage.rs b/src/profiler/storage.rs index a712df1..509aad1 100644 --- a/src/profiler/storage.rs +++ b/src/profiler/storage.rs @@ -1,10 +1,9 @@ use std::io::Write; -use std::os::unix::io::AsRawFd; use std::time::Instant; use crate::profiler::types::{BandwidthCurve, 
StorageProfile, StorageType}; -const BLOCK_SIZES: &[usize] = &[4096, 65536, 131072, 1_048_576, 4_194_304]; +const BLOCK_SIZES: &[usize] = &[4096, 65536, 131_072, 1_048_576, 4_194_304]; const SEQUENTIAL_PASSES: usize = 3; const RANDOM_IOPS_READS: usize = 10_000; @@ -14,8 +13,8 @@ pub fn profile_storage() -> anyhow::Result> { for disk in disks.list() { let mount = disk.mount_point().to_string_lossy().to_string(); - // Only benchmark the root volume (or Data volume on APFS) - if mount != "/" && mount != "/System/Volumes/Data" { + + if !is_primary_volume(&mount) { continue; } @@ -36,7 +35,7 @@ pub fn profile_storage() -> anyhow::Result> { profiles.push(StorageProfile { device_path, mount_point: mount, - device_type: StorageType::NvmePcie, // All internal Apple Silicon storage is NVMe + device_type: detect_storage_type(disk), capacity_bytes, free_bytes, sequential_read, @@ -45,33 +44,53 @@ pub fn profile_storage() -> anyhow::Result> { wear_level: None, }); - break; // Only benchmark the first root volume + break; // Benchmark only the first matching volume } anyhow::ensure!(!profiles.is_empty(), "No storage devices found to benchmark"); Ok(profiles) } +/// Returns true for the volume that should be benchmarked. 
+fn is_primary_volume(mount: &str) -> bool { + #[cfg(target_os = "macos")] + { + mount == "/" || mount == "/System/Volumes/Data" + } + #[cfg(target_os = "linux")] + { + // On Linux / WSL2, benchmark the root filesystem + mount == "/" + } + #[cfg(target_os = "windows")] + { + // On Windows, take the first fixed drive (usually C:\) + mount.ends_with('\\') && mount.len() == 3 + } +} + +fn detect_storage_type(disk: &sysinfo::Disk) -> StorageType { + use sysinfo::DiskKind; + match disk.kind() { + DiskKind::SSD | DiskKind::Unknown(_) => StorageType::NvmePcie, + DiskKind::HDD => StorageType::Sata, + } +} + fn benchmark_storage(mount_point: &str, free_bytes: u64) -> anyhow::Result<(BandwidthCurve, u64)> { - // Size temp file: 1 GiB if space allows, 256 MiB otherwise let file_size: usize = if free_bytes > 5 * (1 << 30) { 1 << 30 // 1 GiB } else { 256 << 20 // 256 MiB }; - // Create temp file with data - let temp_dir = if mount_point == "/System/Volumes/Data" { - std::env::temp_dir() - } else { - std::path::PathBuf::from(mount_point).join("tmp") - }; + let temp_dir = pick_temp_dir(mount_point); let temp_path = temp_dir.join(".hypura_bench_tmp"); // Write test data { let mut f = std::fs::File::create(&temp_path)?; - let pattern = vec![0xA5u8; 1 << 20]; // 1 MiB pattern + let pattern = vec![0xA5u8; 1 << 20]; // 1 MiB chunks let chunks = file_size / pattern.len(); for _ in 0..chunks { f.write_all(&pattern)?; @@ -85,12 +104,26 @@ fn benchmark_storage(mount_point: &str, free_bytes: u64) -> anyhow::Result<(Band Ok((sequential, iops)) })(); - // Clean up let _ = std::fs::remove_file(&temp_path); - result } +fn pick_temp_dir(mount_point: &str) -> std::path::PathBuf { + #[cfg(target_os = "macos")] + if mount_point == "/System/Volumes/Data" { + return std::env::temp_dir(); + } + + let candidate = std::path::PathBuf::from(mount_point); + if candidate.exists() { + candidate + } else { + std::env::temp_dir() + } +} + +// ── Sequential read benchmark 
───────────────────────────────────────────────── + fn benchmark_sequential( path: &std::path::Path, file_size: usize, @@ -106,149 +139,191 @@ fn benchmark_sequential( let mut trial_bandwidths = Vec::with_capacity(SEQUENTIAL_PASSES); for _ in 0..SEQUENTIAL_PASSES { - let file = std::fs::File::open(path)?; - let fd = file.as_raw_fd(); - - // Bypass filesystem cache - let ret = unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) }; - anyhow::ensure!(ret != -1, "F_NOCACHE failed: {}", std::io::Error::last_os_error()); - - let mut buf = AlignedBuffer::new(block_size, 4096)?; - let mut total_read: usize = 0; - - let start = Instant::now(); - while total_read < file_size { - let to_read = block_size.min(file_size - total_read); - let n = unsafe { - libc::pread( - fd, - buf.as_mut_ptr() as *mut libc::c_void, - to_read, - total_read as libc::off_t, - ) - }; - if n <= 0 { - break; - } - total_read += n as usize; - } - let elapsed = start.elapsed().as_secs_f64(); - - if elapsed > 0.0 { - let bandwidth = (total_read as f64 / elapsed) as u64; - trial_bandwidths.push(bandwidth); + let bw = read_sequential_pass(path, file_size, block_size)?; + if bw > 0 { + trial_bandwidths.push(bw); } } if !trial_bandwidths.is_empty() { - trial_bandwidths.sort(); + trial_bandwidths.sort_unstable(); let median = trial_bandwidths[trial_bandwidths.len() / 2]; points.push((block_size as u64, median)); peak_sequential = peak_sequential.max(median); } } - Ok(BandwidthCurve { - points, - peak_sequential, - }) + Ok(BandwidthCurve { points, peak_sequential }) +} + +/// Platform-specific sequential read pass. 
+fn read_sequential_pass( + path: &std::path::Path, + file_size: usize, + block_size: usize, +) -> anyhow::Result { + #[cfg(unix)] + { + read_sequential_unix(path, file_size, block_size) + } + #[cfg(windows)] + { + read_sequential_windows(path, file_size, block_size) + } } -fn benchmark_random_4k( +#[cfg(unix)] +fn read_sequential_unix( path: &std::path::Path, file_size: usize, + block_size: usize, ) -> anyhow::Result { + use crate::io::aligned_buffer::AlignedBuffer; + use std::os::unix::io::AsRawFd; + let file = std::fs::File::open(path)?; let fd = file.as_raw_fd(); - let ret = unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1) }; - anyhow::ensure!(ret != -1, "F_NOCACHE failed: {}", std::io::Error::last_os_error()); - - let mut buf = AlignedBuffer::new(4096, 4096)?; - let max_offset = (file_size / 4096) as u64; + // macOS: disable unified buffer cache; Linux: use advisory fadvise + #[cfg(target_os = "macos")] + unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1); } + #[cfg(target_os = "linux")] + unsafe { libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_DONTNEED); } - // Simple LCG for pseudo-random offsets (avoids rand crate dependency) - let mut rng_state: u64 = 0xDEAD_BEEF_CAFE_BABEu64; + let mut buf = AlignedBuffer::new(block_size, 4096)?; + let mut total_read: usize = 0; let start = Instant::now(); - for _ in 0..RANDOM_IOPS_READS { - rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); - let block_idx = (rng_state >> 32) % max_offset; - let offset = block_idx * 4096; - + while total_read < file_size { + let to_read = block_size.min(file_size - total_read); let n = unsafe { libc::pread( fd, buf.as_mut_ptr() as *mut libc::c_void, - 4096, - offset as libc::off_t, + to_read, + total_read as libc::off_t, ) }; - if n <= 0 { - break; - } + if n <= 0 { break; } + total_read += n as usize; } let elapsed = start.elapsed().as_secs_f64(); - let iops = if elapsed > 0.0 { - (RANDOM_IOPS_READS as f64 / elapsed) as u64 + if elapsed > 0.0 { + 
Ok((total_read as f64 / elapsed) as u64) } else { - 0 - }; + Ok(0) + } +} + +#[cfg(windows)] +fn read_sequential_windows( + path: &std::path::Path, + file_size: usize, + block_size: usize, +) -> anyhow::Result { + use std::io::{Read, Seek, SeekFrom}; + use crate::io::aligned_buffer::AlignedBuffer; + + let mut file = std::fs::File::open(path)?; + file.seek(SeekFrom::Start(0))?; + let mut buf = AlignedBuffer::new(block_size, 4096)?; + let mut total_read: usize = 0; - Ok(iops) + let start = Instant::now(); + while total_read < file_size { + let to_read = block_size.min(file_size - total_read); + let n = file.read(&mut buf[..to_read])?; + if n == 0 { break; } + total_read += n; + } + let elapsed = start.elapsed().as_secs_f64(); + + if elapsed > 0.0 { + Ok((total_read as f64 / elapsed) as u64) + } else { + Ok(0) + } } -/// Page-aligned buffer for direct I/O. -struct AlignedBuffer { - ptr: *mut u8, - _len: usize, +// ── Random 4K IOPS benchmark ────────────────────────────────────────────────── + +fn benchmark_random_4k(path: &std::path::Path, file_size: usize) -> anyhow::Result { + #[cfg(unix)] + return benchmark_random_4k_unix(path, file_size); + #[cfg(windows)] + return benchmark_random_4k_windows(path, file_size); } -impl AlignedBuffer { - fn new(size: usize, alignment: usize) -> anyhow::Result { - let mut ptr: *mut libc::c_void = std::ptr::null_mut(); - let ret = unsafe { libc::posix_memalign(&mut ptr, alignment, size) }; - anyhow::ensure!(ret == 0, "posix_memalign failed: error code {ret}"); - Ok(Self { - ptr: ptr as *mut u8, - _len: size, - }) - } +#[cfg(unix)] +fn benchmark_random_4k_unix(path: &std::path::Path, file_size: usize) -> anyhow::Result { + use crate::io::aligned_buffer::AlignedBuffer; + use std::os::unix::io::AsRawFd; + + let file = std::fs::File::open(path)?; + let fd = file.as_raw_fd(); + + #[cfg(target_os = "macos")] + unsafe { libc::fcntl(fd, libc::F_NOCACHE, 1); } + #[cfg(target_os = "linux")] + unsafe { libc::posix_fadvise(fd, 0, 0, 
libc::POSIX_FADV_RANDOM); } + + let mut buf = AlignedBuffer::new(4096, 4096)?; + let max_offset = (file_size / 4096) as u64; + + let mut rng: u64 = 0xDEAD_BEEF_CAFE_BABEu64; + let start = Instant::now(); + for _ in 0..RANDOM_IOPS_READS { + rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + let block_idx = (rng >> 32) % max_offset; + let offset = block_idx * 4096; - fn as_mut_ptr(&mut self) -> *mut u8 { - self.ptr + let n = unsafe { + libc::pread(fd, buf.as_mut_ptr() as *mut libc::c_void, 4096, offset as libc::off_t) + }; + if n <= 0 { break; } } + let elapsed = start.elapsed().as_secs_f64(); + + Ok(if elapsed > 0.0 { (RANDOM_IOPS_READS as f64 / elapsed) as u64 } else { 0 }) } -impl Drop for AlignedBuffer { - fn drop(&mut self) { - if !self.ptr.is_null() { - unsafe { - libc::free(self.ptr as *mut libc::c_void); - } - } +#[cfg(windows)] +fn benchmark_random_4k_windows(path: &std::path::Path, file_size: usize) -> anyhow::Result { + use std::io::{Read, Seek, SeekFrom}; + use crate::io::aligned_buffer::AlignedBuffer; + + let mut file = std::fs::File::open(path)?; + let mut buf = AlignedBuffer::new(4096, 4096)?; + let max_offset = (file_size / 4096) as u64; + + let mut rng: u64 = 0xDEAD_BEEF_CAFE_BABEu64; + let start = Instant::now(); + for _ in 0..RANDOM_IOPS_READS { + rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + let block_idx = (rng >> 32) % max_offset; + let offset = block_idx * 4096; + file.seek(SeekFrom::Start(offset))?; + let _ = file.read(&mut buf[..4096]); } + let elapsed = start.elapsed().as_secs_f64(); + + Ok(if elapsed > 0.0 { (RANDOM_IOPS_READS as f64 / elapsed) as u64 } else { 0 }) } +// ── Tests ───────────────────────────────────────────────────────────────────── + #[cfg(test)] mod tests { use super::*; - #[test] - fn test_aligned_buffer() { - let mut buf = AlignedBuffer::new(4096, 4096).unwrap(); - assert!(!buf.as_mut_ptr().is_null()); - assert_eq!(buf.as_mut_ptr() as usize % 4096, 0); - } 
- #[test] fn test_profile_storage() { let profiles = profile_storage().unwrap(); assert!(!profiles.is_empty()); let p = &profiles[0]; assert!(p.capacity_bytes > 0); - assert!(p.sequential_read.peak_sequential > 100_000_000); // > 100 MB/s + assert!(p.sequential_read.peak_sequential > 50_000_000); // > 50 MB/s assert!(p.random_read_iops > 0); } } diff --git a/src/scheduler/placement.rs b/src/scheduler/placement.rs index 5206db7..83227b1 100644 --- a/src/scheduler/placement.rs +++ b/src/scheduler/placement.rs @@ -9,8 +9,26 @@ use crate::profiler::types::HardwareProfile; use crate::scheduler::prefetch::build_prefetch_schedule; use crate::scheduler::types::*; -const OS_OVERHEAD: u64 = 2 * (1 << 30); // 2 GiB reserved for macOS -const GPU_RUNTIME_OVERHEAD: u64 = 1 << 30; // 1 GiB reserved for compute buffers + Metal overhead (actual usage ~362 MiB) +/// RAM reserved for the OS and background processes. +/// +/// macOS: ~2 GiB (kernel + system agents). +/// Windows: ~4 GiB (kernel + system processes tend to use more). +/// Linux / WSL2: ~1 GiB conservative estimate. +#[cfg(target_os = "macos")] +const OS_OVERHEAD: u64 = 2 * (1 << 30); // 2 GiB +#[cfg(target_os = "windows")] +const OS_OVERHEAD: u64 = 4 * (1 << 30); // 4 GiB +#[cfg(all(not(target_os = "macos"), not(target_os = "windows")))] +const OS_OVERHEAD: u64 = 1 * (1 << 30); // 1 GiB (Linux / WSL2) + +/// GPU runtime overhead: CUDA/Metal framework + compute buffer pool. +/// +/// CUDA: ~0.5 GiB driver + context overhead on Ampere+. +/// Metal: ~1 GiB. 
+#[cfg(target_os = "macos")] +const GPU_RUNTIME_OVERHEAD: u64 = 1 << 30; // 1 GiB (Metal) +#[cfg(not(target_os = "macos"))] +const GPU_RUNTIME_OVERHEAD: u64 = 512 * (1 << 20); // 512 MiB (CUDA) const SYNC_OVERHEAD_PER_LAYER_US: f64 = 50.0; // 50μs CPU-GPU sync per layer const MOE_CACHE_HIT_RATE: f64 = 0.965; // From PowerInfer-2, matches estimator.rs diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1d3da8b..177c758 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1d3da8b8a80ecc784cc27effb3e6f37a2062f1ae +Subproject commit 177c75852a6a6fd6810ce23f50c77e4a60ea6828