diff --git a/InferenceWeb.Tests/BackendCatalogTests.cs b/InferenceWeb.Tests/BackendCatalogTests.cs
index 401c8d0..d646838 100644
--- a/InferenceWeb.Tests/BackendCatalogTests.cs
+++ b/InferenceWeb.Tests/BackendCatalogTests.cs
@@ -1,6 +1,4 @@
-using InferenceEngine;
-using InferenceWeb;
-using TensorSharp.GGML;
+using TensorSharp.GGML;
namespace InferenceWeb.Tests;
@@ -121,3 +119,5 @@ public void ShouldStoreWeightQuantized_GgmlBackendsKeepQuantizedWeights()
Assert.True(shouldStoreQuantized);
}
}
+
+
diff --git a/InferenceWeb.Tests/GlobalUsings.cs b/InferenceWeb.Tests/GlobalUsings.cs
new file mode 100644
index 0000000..1f7dd10
--- /dev/null
+++ b/InferenceWeb.Tests/GlobalUsings.cs
@@ -0,0 +1,3 @@
+global using TensorSharp.Models;
+global using TensorSharp.Runtime;
+global using TensorSharp.Server;
diff --git a/InferenceWeb.Tests/ImageProcessorTests.cs b/InferenceWeb.Tests/ImageProcessorTests.cs
index 92dd8ca..d82af18 100644
--- a/InferenceWeb.Tests/ImageProcessorTests.cs
+++ b/InferenceWeb.Tests/ImageProcessorTests.cs
@@ -1,5 +1,4 @@
-using InferenceEngine;
-
+
namespace InferenceWeb.Tests;
public class ImageProcessorTests
@@ -95,3 +94,4 @@ private static string WriteEmbeddedJpeg()
return path;
}
}
+
diff --git a/InferenceWeb.Tests/InferenceWeb.Tests.csproj b/InferenceWeb.Tests/InferenceWeb.Tests.csproj
index ca583c8..90960bd 100644
--- a/InferenceWeb.Tests/InferenceWeb.Tests.csproj
+++ b/InferenceWeb.Tests/InferenceWeb.Tests.csproj
@@ -15,7 +15,8 @@
-
-
+
+
+
diff --git a/InferenceWeb.Tests/KVCacheTests.cs b/InferenceWeb.Tests/KVCacheTests.cs
index c33c2cd..7bb4eea 100644
--- a/InferenceWeb.Tests/KVCacheTests.cs
+++ b/InferenceWeb.Tests/KVCacheTests.cs
@@ -1,5 +1,4 @@
-using InferenceWeb;
-
+
namespace InferenceWeb.Tests;
public class KVCacheTests
@@ -129,3 +128,4 @@ public void FindTokenPrefixLength_ThinkingModelWithContentInContext()
Assert.Equal(8, common); // Full cached is prefix
}
}
+
diff --git a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
index ffc8a6a..64e34ab 100644
--- a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
+++ b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
@@ -1,5 +1,4 @@
-using System.Buffers.Binary;
-using InferenceEngine;
+using System.Buffers.Binary;
namespace InferenceWeb.Tests;
@@ -168,3 +167,4 @@ private static float Dot(float[] lhs, float[] rhs, int rhsOffset, int length)
return sum;
}
}
+
diff --git a/InferenceWeb.Tests/MediaHelperTests.cs b/InferenceWeb.Tests/MediaHelperTests.cs
index 4be19b3..c2263a0 100644
--- a/InferenceWeb.Tests/MediaHelperTests.cs
+++ b/InferenceWeb.Tests/MediaHelperTests.cs
@@ -1,5 +1,4 @@
-using InferenceEngine;
-
+
namespace InferenceWeb.Tests;
public class MediaHelperTests
@@ -71,3 +70,4 @@ public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride()
}
}
}
+
diff --git a/InferenceWeb.Tests/ModelServiceHistoryTests.cs b/InferenceWeb.Tests/ModelServiceHistoryTests.cs
index 4a01b1e..09d4bb8 100644
--- a/InferenceWeb.Tests/ModelServiceHistoryTests.cs
+++ b/InferenceWeb.Tests/ModelServiceHistoryTests.cs
@@ -1,6 +1,4 @@
-using InferenceEngine;
-using InferenceWeb;
-
+
namespace InferenceWeb.Tests;
public class ModelServiceHistoryTests
@@ -70,3 +68,5 @@ public void PrepareHistoryForInference_NormalizesEarlierVideoTurns()
}
}
}
+
+
diff --git a/InferenceWeb.Tests/StructuredOutputTests.cs b/InferenceWeb.Tests/StructuredOutputTests.cs
index 658b4ec..3af4bce 100644
--- a/InferenceWeb.Tests/StructuredOutputTests.cs
+++ b/InferenceWeb.Tests/StructuredOutputTests.cs
@@ -1,6 +1,4 @@
-using System.Text.Json;
-using InferenceEngine;
-using InferenceWeb;
+using System.Text.Json;
namespace InferenceWeb.Tests;
@@ -206,3 +204,5 @@ public void JsonSchemaNormalizationSupportsDefsAndAnyOf()
Assert.Equal("""{"item":{"name":"Ada","age":30}}""", normalized.NormalizedContent);
}
}
+
+
diff --git a/InferenceWeb.Tests/WebUiChatPolicyTests.cs b/InferenceWeb.Tests/WebUiChatPolicyTests.cs
index eaebe36..0ba2a5c 100644
--- a/InferenceWeb.Tests/WebUiChatPolicyTests.cs
+++ b/InferenceWeb.Tests/WebUiChatPolicyTests.cs
@@ -1,5 +1,4 @@
-using InferenceWeb;
-
+
namespace InferenceWeb.Tests;
public class WebUiChatPolicyTests
@@ -31,3 +30,4 @@ public void TryValidateChatRequest_RejectsPerTurnBackendSelection()
Assert.Equal(WebUiChatPolicy.ModelSelectionLockedMessage, error);
}
}
+
diff --git a/README.md b/README.md
index 8e05731..742ebf8 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# TensorSharp
+# TensorSharp
@@ -10,8 +10,8 @@ A C# inference engine for running large language models (LLMs) locally using GGU
## Features
-- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H
-- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5
+- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H, Mistral 3
+- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5 / Mistral 3
- **Thinking / reasoning mode** -- structured chain-of-thought output with `<think>` / `<|channel|>thought` / `<|channel|>analysis` tags (Qwen 3, Qwen 3.5, Gemma 4, GPT OSS, Nemotron-H)
- **Tool calling / function calling** -- models can invoke user-defined tools; multi-turn tool-call conversations supported across all three API styles
- **Quantized model support** -- loads GGUF files with Q4_K_M, Q8_0, F16, MXFP4, and other quantization formats; performs native quantized matmul without dequantizing to FP32, including memory-efficient pure C# CPU loading for large GGUFs
@@ -38,6 +38,7 @@ A C# inference engine for running large language models (LLMs) locally using GGU
| Qwen 3.5 | Qwen3.5-9B, Qwen3.5-35B-A3B | Image | Yes | Yes |
| GPT OSS | gpt-oss-20b (MoE) | Text only | Yes | No |
| Nemotron-H | Nemotron-H-8B, Nemotron-H-47B (Hybrid SSM-Transformer, MoE) | Text only | Yes | Yes |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | Image | No | No |
See [Model Architecture Cards](docs/model_cards.md) for detailed documentation of each architecture.
@@ -58,6 +59,8 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
| GPT OSS | gpt-oss-20b (MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+| Mistral 3 | mistral3-mmproj (Pixtral vision projector) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
## Compute Backends
@@ -72,36 +75,37 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
```
TensorSharp/
-├── TensorSharp/ # Core tensor library (CPU operations, SIMD)
-├── TensorSharp.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library)
+├── TensorSharp.Core/ # Core tensor library (Tensor, Ops, memory, device abstraction)
+├── TensorSharp.Runtime/ # GGUF, tokenizers, templates, sampling, protocol parsing
+├── TensorSharp.Models/ # Model architectures and multimodal encoders/injectors
+├── TensorSharp.Backends.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library)
├── TensorSharp.GGML.Native/ # Native C++ bridge to ggml (builds libGgmlOps)
-├── AdvUtils/ # Utility library
-├── InferenceEngine/ # Model loading, tokenization, and inference logic
-│ ├── Models/
-│ │ ├── Gemma3/
-│ │ ├── Gemma4/ # Vision encoder, audio encoder, MoE, fused GPU decode
-│ │ ├── GptOss/ # MoE, attention sinks, SiLUAlphaLimit, Yarn RoPE
-│ │ ├── Nemotron/ # Hybrid Mamba2 SSM + attention + MoE FFN
-│ │ ├── Qwen3/
-│ │ └── Qwen35/
-│ ├── GgufReader.cs # GGUF file parser
-│ ├── ModelBase.cs # Base class for all model architectures
-│ ├── ChatTemplate.cs # Chat template rendering (hardcoded + Jinja2 from GGUF)
-│ ├── Jinja2Template.cs # Jinja2 template renderer
-│ ├── OutputParser.cs # Extracts thinking, content, and tool calls from model output
-│ ├── SamplingConfig.cs # Sampling parameter configuration
-│ ├── TokenSampler.cs # Token sampling (greedy, top-k, top-p, min-p, penalties)
-│ └── MediaHelper.cs # Video frame extraction, audio decoding
-├── InferenceConsole/ # CLI application
-├── InferenceWeb/ # Web chatbot + API server (ASP.NET Core)
+├── TensorSharp.Server/ # Web chatbot + API server (ASP.NET Core)
│ ├── ModelService.cs # Model lifecycle management
│ ├── InferenceQueue.cs # FIFO request queue with position tracking
│ ├── wwwroot/index.html # Chat UI
│ ├── testdata/ # Integration test suites (bash + Python)
│ └── API_EXAMPLES.md # Detailed API documentation
+├── TensorSharp.Cli/ # CLI application
+├── AdvUtils/ # Utility library
└── ExternalProjects/ # Third-party dependencies (ggml)
```
+## NuGet Packages
+
+The repository is now split along package boundaries so consumers can depend on only the layers they actually need.
+
+| Project | NuGet package | Public namespace | Responsibility |
+|---|---|---|---|
+| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor primitives, ops, allocators, storage, and device abstraction |
+| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF parsing, tokenizers, prompt rendering, sampling, and output protocol parsing |
+| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`, architecture implementations, multimodal encoders, and model-side execution helpers |
+| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML-backed execution and native interop |
+| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core server, OpenAI/Ollama adapters, queueing, and web UI |
+| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | Console host and debugging / batch tooling |
+
+This split keeps engine users off the web stack, keeps API-layer changes from leaking into core/runtime packages, and makes future benchmark or eval-harness projects easier to publish independently.
+
## Prerequisites
- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
@@ -121,10 +125,10 @@ dotnet build TensorSharp.slnx
```bash
# Console application
-dotnet build InferenceConsole/InferenceConsole.csproj
+dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj
# Web application
-dotnet build InferenceWeb/InferenceWeb.csproj
+dotnet build TensorSharp.Server/TensorSharp.Server.csproj
```
### Build the native GGML library
@@ -166,7 +170,7 @@ TENSORSHARP_GGML_NATIVE_BUILD_PARALLEL_LEVEL=2 bash build-linux.sh --cuda
You can also request a CUDA-enabled native build from `dotnet build`:
```bash
-TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
+TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release
```
On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory.
@@ -176,38 +180,38 @@ On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `bui
### Console Application
```bash
-cd InferenceConsole/bin
+cd TensorSharp.Cli/bin
# Text inference
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
--max-tokens 200 --backend ggml_metal
# Text inference on Linux + NVIDIA GPU
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
--max-tokens 200 --backend ggml_cuda
# Image inference (Gemma 3/4, Qwen 3.5)
-./InferenceConsole --model <model.gguf> --image photo.png --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --image photo.png --backend ggml_metal
# Video inference (Gemma 4)
-./InferenceConsole --model <model.gguf> --video clip.mp4 --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --video clip.mp4 --backend ggml_metal
# Audio inference (Gemma 4)
-./InferenceConsole --model <model.gguf> --audio speech.wav --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --audio speech.wav --backend ggml_metal
# Thinking / reasoning mode
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal --think
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal --think
# Tool calling
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
--tools tools.json
# With sampling parameters
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
--temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
# Batch processing (JSONL)
-./InferenceConsole --model <model.gguf> --input-jsonl requests.jsonl \
+./TensorSharp.Cli --model <model.gguf> --input-jsonl requests.jsonl \
--output results.txt --backend ggml_metal
```
@@ -253,13 +257,13 @@ Each line is a JSON object with `messages`, optional `prompt`, and optional samp
### Web Application
```bash
-cd InferenceWeb/bin
+cd TensorSharp.Server/bin
# Set environment variables and run
-MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb
+MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server
# Linux + NVIDIA GPU
-MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
+MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server
```
Open `http://localhost:5000` in your browser. The web interface supports:
@@ -284,7 +288,7 @@ Open `http://localhost:5000` in your browser. The web interface supports:
### HTTP APIs
-InferenceWeb exposes three API styles. See [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md) for full documentation with curl and Python examples.
+TensorSharp.Server exposes three API styles. See [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md) for full documentation with curl and Python examples.
**Ollama-compatible API:**
@@ -403,17 +407,27 @@ Gemma 4 models support image, video, and audio inputs. Place the multimodal proj
These models support image inputs with their respective multimodal projector files.
+### Mistral 3
+
+Mistral 3 supports image inputs via the Pixtral vision encoder. Place the multimodal projector (`mistral3-mmproj.gguf`) in the same directory as the model file for automatic loading.
+
+- **Images:** PNG, JPEG
+
## Architecture
TensorSharp is structured as a layered system:
-1. **TensorSharp** provides the core `Tensor` type, storage abstraction, and an extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
+1. **TensorSharp.Core** provides the core `Tensor` type, storage abstraction, and the extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
-2. **TensorSharp.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
+2. **TensorSharp.Runtime** owns runtime-facing contracts and services: GGUF parsing, tokenization (SentencePiece / BPE), chat template rendering, configurable token sampling, output parsing, and reusable contracts such as `IModelArchitecture`, `IPromptRenderer`, `IOutputProtocolParser`, `IMultimodalInjector`, `IKVCachePolicy`, and `IBackendExecutionPlan`.
-3. **InferenceEngine** implements model-specific logic: GGUF parsing, tokenization (SentencePiece BPE), chat template rendering (Jinja2 from GGUF metadata with hardcoded fallbacks), configurable token sampling, output parsing (thinking extraction, tool-call extraction), and the forward pass for each architecture (including hybrid SSM-Transformer models like Nemotron-H with Mamba2 layers). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
+3. **TensorSharp.Models** implements `ModelBase` plus the concrete architectures and multimodal helpers (Gemma 3/4, Qwen 3/3.5, GPT OSS, Nemotron-H, Mistral 3). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
-4. **InferenceConsole** and **InferenceWeb** are application layers that handle I/O and user interaction. InferenceWeb provides Ollama-compatible and OpenAI-compatible REST APIs alongside a browser-based chat UI, with a FIFO inference queue to serialize concurrent requests.
+4. **TensorSharp.Backends.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
+
+5. **TensorSharp.Server** is the HTTP/application layer. It provides Ollama-compatible and OpenAI-compatible REST APIs, the browser-based chat UI, upload handling, and the FIFO inference queue.
+
+6. **TensorSharp.Cli** is the console/application layer for local prompts, multimodal experiments, prompt inspection, and JSONL batch workflows.
### Performance Optimizations
@@ -426,16 +440,16 @@ TensorSharp is structured as a layered system:
## Testing
-Integration tests for InferenceWeb are in `InferenceWeb/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
+Integration tests for TensorSharp.Server are in `TensorSharp.Server/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
```bash
-# Start InferenceWeb, then run:
-python3 InferenceWeb/testdata/test_multiturn.py
+# Start TensorSharp.Server, then run:
+python3 TensorSharp.Server/testdata/test_multiturn.py
# or
-bash InferenceWeb/testdata/test_multiturn.sh
+bash TensorSharp.Server/testdata/test_multiturn.sh
```
-See [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md) for the full test matrix.
+See [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md) for the full test matrix.
## Author
@@ -444,3 +458,4 @@ Zhongkai Fu
## License
See [LICENSE](LICENSE) for details.
+
diff --git a/README_zh-cn.md b/README_zh-cn.md
index fdbf14e..9fb6ce8 100644
--- a/README_zh-cn.md
+++ b/README_zh-cn.md
@@ -1,411 +1,426 @@
-# TensorSharp
-
-
-
-
-
-[English](README.md) | [中文](README_zh-cn.md)
-
-一个用于在本地运行大型语言模型(LLM)的 C# 推理引擎,使用 GGUF 模型文件。TensorSharp 提供控制台应用、基于 Web 的聊天界面,以及兼容 Ollama/OpenAI 的 HTTP API 以便程序化调用。
-
-## 功能特性
-
-- **多架构支持** —— Gemma 4、Gemma 3、Qwen 3、Qwen 3.5、GPT OSS、Nemotron-H
-- **多模态推理** —— 图像、视频和音频输入(Gemma 4);图像输入(Gemma 3 / Qwen 3.5)
-- **思维链 / 推理模式** —— 通过 `<think>` / `<|channel|>thought` / `<|channel|>analysis` 标签输出结构化的思维链推理(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)
-- **工具调用 / 函数调用** —— 模型可调用用户定义的工具;所有三种 API 风格均支持多轮工具调用对话
-- **量化模型支持** —— 加载 Q4_K_M、Q8_0、F16、MXFP4 等量化格式的 GGUF 文件;执行原生量化矩阵乘法(matmul),无需反量化到 FP32,并且纯 C# CPU 后端在加载大型 GGUF 时也会保持量化权重压缩状态
-- **GPU 加速** —— 通过 GGML 支持 Apple Metal(macOS)和 GGML CUDA(Linux/NVIDIA);Gemma 4 在 Metal 上支持整模型融合 GPU decode(相对逐算子调度约提升 2.6 倍)
-- **优化后的纯 C# CPU 后端** —— 为 GEMM、RMSNorm、RoPE、softmax、融合激活等推理热点路径提供托管快速路径和 SIMD 内核
-- **兼容 Ollama 与 OpenAI API** —— 可作为现有工具链的即插即用替代端点
-- **可配置采样** —— temperature、top-k、top-p、min-p、重复/存在/频率惩罚、seed、停止序列
-- **聊天模板** —— 从 GGUF 元数据自动加载(Jinja2),并为不同架构提供硬编码回退模板
-- **请求队列** —— FIFO 推理队列确保单请求执行以保障 KV 缓存稳定性,并为客户端提供实时排队位置反馈
-- **批处理** —— 控制台应用支持 JSONL 输入
-- **流式输出** —— 按 token 输出(Web 通过 SSE,控制台通过 stdout)
-- **混合 SSM-Transformer** —— Nemotron-H 在单个模型中混合 Mamba2 SSM 层、纯注意力层和 MoE FFN 层
-- **专家混合(MoE)** —— 支持 Gemma 4 MoE 变体(例如 gemma-4-26B-A4B)、GPT OSS MoE(例如 gpt-oss-20b)、Nemotron-H MoE FFN 层
-- **消息编辑** —— 在 Web 聊天界面中编辑或删除历史消息,并从该位置重新生成回复
-- **大文件上传** —— Web 界面支持最大 500 MB 的视频/音频上传
-
-## 支持的模型架构
-
-| 架构 | 示例模型 | 多模态 | 思维链 | 工具调用 |
-|---|---|---|---|---|
-| Gemma 4 | gemma-4-E4B、gemma-4-31B、gemma-4-26B-A4B(MoE) | 图像、视频、音频 | 支持 | 支持 |
-| Gemma 3 | gemma-3-4b | 图像 | 不支持 | 不支持 |
-| Qwen 3 | Qwen3-4B | 仅文本 | 支持 | 支持 |
-| Qwen 3.5 | Qwen3.5-9B、Qwen3.5-35B-A3B | 图像 | 支持 | 支持 |
-| GPT OSS | gpt-oss-20b(MoE) | 仅文本 | 支持 | 不支持 |
-| Nemotron-H | Nemotron-H-8B、Nemotron-H-47B(混合 SSM-Transformer,MoE) | 仅文本 | 支持 | 支持 |
-
-各架构的详细文档见[模型架构卡片](docs/model_cards_cn.md)。
-
-## 模型下载(GGUF)
-
-TensorSharp 使用 GGUF 格式模型文件。以下是各架构对应的 Hugging Face 下载链接。请根据硬件条件选择合适的量化版本(Q4_K_M 适合低内存,Q8_0 适合更高质量等)。
-
-| 架构 | 模型 | GGUF 下载 |
-|---|---|---|
-| Gemma 4 | gemma-4-E4B-it | [ggml-org/gemma-4-E4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF) |
-| Gemma 4 | gemma-4-31B-it | [ggml-org/gemma-4-31B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-31B-it-GGUF) |
-| Gemma 4 | gemma-4-26B-A4B-it(MoE) | [ggml-org/gemma-4-26B-A4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-26B-A4B-it-GGUF) |
-| Gemma 4 | gemma-4-mmproj(多模态投影器) | 包含在上述 GGUF 仓库中 |
-| Gemma 3 | gemma-3-4b-it | [google/gemma-3-4b-it-qat-q4_0-gguf](https://huggingface.co/google/gemma-3-4b-it-qat-q4_0-gguf) |
-| Qwen 3 | Qwen3-4B | [Qwen/Qwen3-4B-GGUF](https://huggingface.co/Qwen/Qwen3-4B-GGUF) |
-| Qwen 3.5 | Qwen3.5-9B | [unsloth/Qwen3.5-9B-GGUF](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) |
-| Qwen 3.5 | Qwen3.5-35B-A3B | [ggml-org/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/ggml-org/Qwen3.5-35B-A3B-GGUF) |
-| GPT OSS | gpt-oss-20b(MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
-| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
-| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
-
-## 计算后端
-
-| 后端 | 参数 | 说明 |
-|---|---|---|
-| GGML Metal | `--backend ggml_metal` | 通过 Apple Metal(macOS)进行 GPU 加速。推荐用于 Apple Silicon。 |
-| GGML CUDA | `--backend ggml_cuda` | 通过 GGML CUDA 在 Linux + NVIDIA GPU 上进行加速。 |
-| GGML CPU | `--backend ggml_cpu` | 使用原生 GGML 与优化内核进行 CPU 推理。 |
-| 纯 C# CPU | `--backend cpu` | 无原生依赖的可移植 CPU 推理。 |
-
-## 项目结构
-
-```text
-TensorSharp/
-├── TensorSharp/ # 核心张量库(CPU 运算、SIMD)
-├── TensorSharp.GGML/ # GGML 后端绑定(通过原生库支持 Metal/CUDA/CPU)
-├── TensorSharp.GGML.Native/ # 到 ggml 的原生 C++ 桥接(构建 libGgmlOps)
-├── AdvUtils/ # 工具库
-├── InferenceEngine/ # 模型加载、分词和推理逻辑
-│ ├── Models/
-│ │ ├── Gemma3/
-│ │ ├── Gemma4/ # 视觉编码器、音频编码器、MoE、融合 GPU decode
-│ │ ├── GptOss/ # MoE、注意力沉降、SiLUAlphaLimit、Yarn RoPE
-│ │ ├── Nemotron/ # 混合 Mamba2 SSM + 注意力 + MoE FFN
-│ │ ├── Qwen3/
-│ │ └── Qwen35/
-│ ├── GgufReader.cs # GGUF 文件解析器
-│ ├── ModelBase.cs # 各模型架构基类
-│ ├── ChatTemplate.cs # 聊天模板渲染(硬编码 + 来自 GGUF 的 Jinja2)
-│ ├── Jinja2Template.cs # Jinja2 模板渲染器
-│ ├── OutputParser.cs # 从模型输出中提取思维链、内容和工具调用
-│ ├── SamplingConfig.cs # 采样参数配置
-│ ├── TokenSampler.cs # Token 采样(greedy、top-k、top-p、min-p、惩罚项)
-│ └── MediaHelper.cs # 视频抽帧、音频解码
-├── InferenceConsole/ # CLI 应用
-├── InferenceWeb/ # Web 聊天 + API 服务(ASP.NET Core)
-│ ├── ModelService.cs # 模型生命周期管理
-│ ├── InferenceQueue.cs # 带排队位置追踪的 FIFO 请求队列
-│ ├── wwwroot/index.html # 聊天界面
-│ ├── testdata/ # 集成测试套件(bash + Python)
-│ └── API_EXAMPLES.md # 详细 API 文档
-└── ExternalProjects/ # 第三方依赖(ggml)
-```
-
-## 前置要求
-
-- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
-- **macOS(Metal 后端):** 用于构建原生 GGML 库的 CMake 3.20+ 与 Xcode 命令行工具
-- **Linux(GGML CPU / CUDA 后端):** CMake 3.20+;若使用 `ggml_cuda`,还需要 NVIDIA 驱动和 CUDA Toolkit 12.x 或其他兼容版本
-- GGUF 模型文件(例如来自 [Hugging Face](https://huggingface.co))
-
-## 构建
-
-### 构建整个解决方案
-
-```bash
-dotnet build TensorSharp.slnx
-```
-
-### 构建单独应用
-
-```bash
-# 控制台应用
-dotnet build InferenceConsole/InferenceConsole.csproj
-
-# Web 应用
-dotnet build InferenceWeb/InferenceWeb.csproj
-```
-
-### 构建原生 GGML 库
-
-如果原生库不存在,首次执行 `dotnet build` 时会自动构建。也可以手动构建:
-
-```bash
-cd TensorSharp.GGML.Native
-```
-
-macOS:
-
-```bash
-bash build-macos.sh
-```
-
-Linux(仅 CPU):
-
-```bash
-bash build-linux.sh
-```
-
-Linux(启用 GGML_CUDA):
-
-```bash
-bash build-linux.sh --cuda
-```
-
-也可以在 `dotnet build` 时通过环境变量请求 CUDA 版本的原生库:
-
-```bash
-TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
-```
-
-在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。
-
-## 使用方法
-
-### 控制台应用
-
-```bash
-cd InferenceConsole/bin
-
-# 文本推理
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
- --max-tokens 200 --backend ggml_metal
-
-# Linux + NVIDIA GPU 文本推理
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
- --max-tokens 200 --backend ggml_cuda
-
-# 图像推理(Gemma 3/4,Qwen 3.5)
-./InferenceConsole --model <model.gguf> --image photo.png --backend ggml_metal
-
-# 视频推理(Gemma 4)
-./InferenceConsole --model <model.gguf> --video clip.mp4 --backend ggml_metal
-
-# 音频推理(Gemma 4)
-./InferenceConsole --model <model.gguf> --audio speech.wav --backend ggml_metal
-
-# 思维链 / 推理模式
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal --think
-
-# 工具调用
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
- --tools tools.json
-
-# 使用采样参数
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
- --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
-
-# 批处理(JSONL)
-./InferenceConsole --model <model.gguf> --input-jsonl requests.jsonl \
- --output results.txt --backend ggml_metal
-```
-
-**命令行参数:**
-
-| 参数 | 说明 |
-|---|---|
-| `--model <path>` | GGUF 模型文件路径(必填) |
-| `--input <file>` | 包含用户提示词的文本文件 |
-| `--input-jsonl <file>` | JSONL 批量请求文件(每行一个 JSON) |
-| `--multi-turn-jsonl <file>` | 用于多轮对话模拟(含 KV 缓存复用)的 JSONL 文件 |
-| `--output <file>` | 将生成文本写入该文件 |
-| `--image <file>` | 用于视觉推理的图像文件 |
-| `--video <file>` | 用于视频推理的视频文件 |
-| `--audio <file>` | 音频文件(WAV、MP3、OGG)用于音频推理 |
-| `--mmproj <file>` | 多模态投影器 GGUF 文件路径 |
-| `--max-tokens <n>` | 最大生成 token 数(默认:100) |
-| `--backend <name>` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda` |
-| `--think` | 启用思维链/推理模式 |
-| `--tools <file>` | 包含工具/函数定义的 JSON 文件 |
-| `--temperature <t>` | 采样温度(0 = 贪心) |
-| `--top-k <n>` | Top-K 过滤(0 = 关闭) |
-| `--top-p <p>` | Nucleus 采样阈值(1.0 = 关闭) |
-| `--min-p <p>` | 最小概率过滤(0 = 关闭) |
-| `--repeat-penalty <p>` | 重复惩罚(1.0 = 无) |
-| `--presence-penalty <p>` | 存在惩罚(0 = 关闭) |
-| `--frequency-penalty <p>` | 频率惩罚(0 = 关闭) |
-| `--seed <n>` | 随机种子(-1 = 非确定性) |
-| `--stop <seq>` | 停止序列(可重复指定) |
-| `--test` | 运行内置测试套件 |
-
-如果把多模态投影器文件放在模型文件同目录并使用可识别命名(例如 `gemma-4-mmproj-F16.gguf`),系统会自动检测。
-
-**JSONL 输入格式:**
-
-每行是一个 JSON 对象,包含 `messages`、可选 `prompt` 和可选采样参数:
-
-```json
-{"id": "q1", "messages": [{"role": "user", "content": "What is 2+3?"}], "max_tokens": 50}
-{"id": "q2", "messages": [{"role": "user", "content": "Write a haiku."}], "max_tokens": 100, "temperature": 0.8}
-```
-
-### Web 应用
-
-```bash
-cd InferenceWeb/bin
-
-# 设置环境变量并运行
-MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb
-
-# Linux + NVIDIA GPU
-MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
-```
-
-在浏览器中打开 `http://localhost:5000`。Web 界面支持:
-
-- 多轮聊天
-- 从 `MODEL_DIR` 中可用 GGUF 文件列表选择模型
-- 上传图像、视频和音频进行多模态推理(最大 500 MB)
-- 思维链/推理模式切换
-- 带函数定义的工具调用
-- 通过 Server-Sent Events 进行流式 token 生成
-- 带实时排队位置反馈的请求队列
-- 消息编辑和删除,支持从对话中任意位置重新生成
-
-**环境变量:**
-
-| 变量 | 说明 |
-|---|---|
-| `MODEL_DIR` | GGUF 模型文件所在目录 |
-| `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) |
-| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) |
-| `PORT` | HTTP 端口(默认:`5000`) |
-
-### HTTP API
-
-InferenceWeb 暴露三种 API 风格。完整文档及 curl/Python 示例见 [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md)。
-
-**兼容 Ollama 的 API:**
-
-```bash
-# 列出模型
-curl http://localhost:5000/api/tags
-
-# 文本生成
-curl -X POST http://localhost:5000/api/generate \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "prompt": "Hello!", "stream": false}'
-
-# 聊天
-curl -X POST http://localhost:5000/api/chat/ollama \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "stream": false}'
-
-# 启用思维链模式的聊天
-curl -X POST http://localhost:5000/api/chat/ollama \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "计算 17*23"}], "think": true, "stream": false}'
-
-# 带工具调用的聊天
-curl -X POST http://localhost:5000/api/chat/ollama \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "天气怎么样?"}], "tools": [{"function": {"name": "get_weather", "description": "获取当前天气", "parameters": {"properties": {"city": {"type": "string"}}, "required": ["city"]}}}], "stream": false}'
-```
-
-**兼容 OpenAI 的 API:**
-
-```bash
-# Chat completions
-curl -X POST http://localhost:5000/v1/chat/completions \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 50}'
-```
-
-**OpenAI Python SDK:**
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:5000/v1", api_key="not-needed")
-response = client.chat.completions.create(
- model="Qwen3-4B-Q8_0.gguf",
- messages=[{"role": "user", "content": "What is 2+3?"}],
- max_tokens=50
-)
-print(response.choices[0].message.content)
-```
-
-**队列状态:**
-
-```bash
-curl http://localhost:5000/api/queue/status
-# {"busy":false,"pending_requests":0,"total_processed":42}
-```
-
-## 思维链 / 推理模式
-
-支持思维链模式的模型(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)可以在生成最终答案之前产出结构化的思维链推理内容。思维内容与主要回复分开,客户端可选择显示或隐藏。
-
-- **Qwen 3 / Qwen 3.5 / Nemotron-H:** 使用 `<think>...</think>` 标签
-- **Gemma 4:** 使用 `<|channel|>thought\n...` 标签
-- **GPT OSS:** 使用 Harmony 格式,以 `<|channel|>analysis` 标记思维过程,以 `<|channel|>final` 标记最终回复
-
-通过 `--think`(控制台)、`"think": true`(Ollama API)或 Web 界面中的思维链开关启用。
-
-## 工具调用 / 函数调用
-
-模型可以调用用户定义的工具并参与多轮工具调用对话。将工具定义为 JSON 格式,通过 `--tools`(控制台)或 API 中的 `tools` 参数传入。
-
-各架构使用各自的工具调用格式:
-
-- **Qwen 3 / Qwen 3.5 / Nemotron-H:** `{"name": "...", "arguments": {...}}`
-- **Gemma 4:** `<|tool_call>call:function_name{args}`
-
-输出解析器(`OutputParser.cs`)会自动从模型原始输出中提取工具调用,与架构无关。
-
-## 多模态支持
-
-### Gemma 4
-
-Gemma 4 模型支持图像、视频和音频输入。将多模态投影器(`gemma-4-mmproj-F16.gguf`)放在与模型文件相同目录即可自动加载。
-
-- **图像:** PNG、JPEG
-- **视频:** MP4(使用 OpenCV 以 1 fps 抽取最多 8 帧)
-- **音频:** WAV(16kHz 单声道)、MP3、OGG Vorbis
-
-### Gemma 3 / Qwen 3.5
-
-这两类模型支持图像输入,并需要对应的多模态投影器文件。
-
-## 架构说明
-
-TensorSharp 采用分层系统结构:
-
-1. **TensorSharp** 提供核心 `Tensor` 类型、存储抽象和可扩展的操作注册表(`Ops`)。CPU 实现使用 `System.Numerics.Vectors` 进行 SIMD 加速。
-
-2. **TensorSharp.GGML** 通过原生 C++ 桥接库(`libGgmlOps`)注册同名操作的加速实现,并链接 [ggml](https://github.com/ggml-org/ggml)。在 macOS 上可提供 Metal GPU 计算,在 Linux 上可启用面向 NVIDIA GPU 的 GGML CUDA。操作包括原生量化 matmul(Q4_K_M、Q8_0 等),无需反量化到 FP32。
-
-3. **InferenceEngine** 实现模型相关逻辑:GGUF 解析、分词(SentencePiece BPE)、聊天模板渲染(来自 GGUF 元数据的 Jinja2 + 硬编码回退)、可配置 token 采样、输出解析(思维链提取、工具调用提取),以及各架构前向计算(包括 Nemotron-H 等混合 SSM-Transformer 模型的 Mamba2 层)。模型通过 `ModelBase.Create()` 加载,并依据 GGUF 元数据自动识别架构。
-
-4. **InferenceConsole** 与 **InferenceWeb** 是应用层,负责 I/O 和用户交互。InferenceWeb 同时提供兼容 Ollama 与 OpenAI 的 REST API 以及浏览器聊天 UI,并使用 FIFO 推理队列来串行化并发请求。
-
-### 性能优化
-
-- **融合 GPU decode**(Gemma 4):在 Metal 上将所有 Transformer 层合并为单次 GGML 计算图调度,将每个 token 的 CPU-GPU 往返从数百次降低到一次。相较逐算子调度约提升 2.6 倍。
-- **融合权重投影**:Q/K/V 投影融合为单次 QKV matmul;gate 与 up 投影融合为单次 gate_up matmul。
-- **原生量化计算**:量化权重(Q4_K_M、Q6_K、Q8_0 等)直接参与 matmul,无需展开为 FP32,节省内存与带宽。
-- **优化后的纯 C# CPU 路径**:托管 GEMM 快速路径和连续 Float32 内核加速了 decode、softmax、RMSNorm、RoPE、融合激活等热点路径,同时在 CPU 加载时保持量化 GGUF 权重压缩状态。
-- **环形 KV 缓存**:滑动窗口注意力层使用固定大小环形缓冲区,使内存占用不随序列长度增长。
-- **高内存效率模型加载**:大张量直接流式加载到原生内存,避免中间托管内存分配。
-
-## 测试
-
-InferenceWeb 的集成测试位于 `InferenceWeb/testdata/`。测试覆盖所有三种 API 风格(Web UI SSE、Ollama、OpenAI)、多轮对话、思维链模式、工具调用、队列行为、并发请求和中断支持。
-
-```bash
-# 先启动 InferenceWeb,然后运行:
-python3 InferenceWeb/testdata/test_multiturn.py
-# 或
-bash InferenceWeb/testdata/test_multiturn.sh
-```
-
-完整测试矩阵见 [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md)。
-
-## 作者
-
-Zhongkai Fu
-
-## 许可证
-
-详见 [LICENSE](LICENSE)。
+# TensorSharp
+
+
+
+
+
+[English](README.md) | [中文](README_zh-cn.md)
+
+一个用于在本地运行大型语言模型(LLM)的 C# 推理引擎,使用 GGUF 模型文件。TensorSharp 提供控制台应用、基于 Web 的聊天界面,以及兼容 Ollama/OpenAI 的 HTTP API 以便程序化调用。
+
+## 功能特性
+
+- **多架构支持** —— Gemma 4、Gemma 3、Qwen 3、Qwen 3.5、GPT OSS、Nemotron-H、Mistral 3
+- **多模态推理** —— 图像、视频和音频输入(Gemma 4);图像输入(Gemma 3 / Qwen 3.5 / Mistral 3)
+- **思维链 / 推理模式** —— 通过 `<think>` / `<|channel>thought` / `<|channel>analysis` 标签输出结构化的思维链推理(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)
+- **工具调用 / 函数调用** —— 模型可调用用户定义的工具;所有三种 API 风格均支持多轮工具调用对话
+- **量化模型支持** —— 加载 Q4_K_M、Q8_0、F16、MXFP4 等量化格式的 GGUF 文件;执行原生量化矩阵乘法(matmul),无需反量化到 FP32,并且纯 C# CPU 后端在加载大型 GGUF 时也会保持量化权重压缩状态
+- **GPU 加速** —— 通过 GGML 支持 Apple Metal(macOS)和 GGML CUDA(Linux/NVIDIA);Gemma 4 在 Metal 上支持整模型融合 GPU decode(相对逐算子调度约提升 2.6 倍)
+- **优化后的纯 C# CPU 后端** —— 为 GEMM、RMSNorm、RoPE、softmax、融合激活等推理热点路径提供托管快速路径和 SIMD 内核
+- **兼容 Ollama 与 OpenAI API** —— 可作为现有工具链的即插即用替代端点
+- **可配置采样** —— temperature、top-k、top-p、min-p、重复/存在/频率惩罚、seed、停止序列
+- **聊天模板** —— 从 GGUF 元数据自动加载(Jinja2),并为不同架构提供硬编码回退模板
+- **请求队列** —— FIFO 推理队列确保单请求执行以保障 KV 缓存稳定性,并为客户端提供实时排队位置反馈
+- **批处理** —— 控制台应用支持 JSONL 输入
+- **流式输出** —— 按 token 输出(Web 通过 SSE,控制台通过 stdout)
+- **混合 SSM-Transformer** —— Nemotron-H 在单个模型中混合 Mamba2 SSM 层、纯注意力层和 MoE FFN 层
+- **专家混合(MoE)** —— 支持 Gemma 4 MoE 变体(例如 gemma-4-26B-A4B)、GPT OSS MoE(例如 gpt-oss-20b)、Nemotron-H MoE FFN 层
+- **消息编辑** —— 在 Web 聊天界面中编辑或删除历史消息,并从该位置重新生成回复
+- **大文件上传** —— Web 界面支持最大 500 MB 的视频/音频上传
+
+## 支持的模型架构
+
+| 架构 | 示例模型 | 多模态 | 思维链 | 工具调用 |
+|---|---|---|---|---|
+| Gemma 4 | gemma-4-E4B、gemma-4-31B、gemma-4-26B-A4B(MoE) | 图像、视频、音频 | 支持 | 支持 |
+| Gemma 3 | gemma-3-4b | 图像 | 不支持 | 不支持 |
+| Qwen 3 | Qwen3-4B | 仅文本 | 支持 | 支持 |
+| Qwen 3.5 | Qwen3.5-9B、Qwen3.5-35B-A3B | 图像 | 支持 | 支持 |
+| GPT OSS | gpt-oss-20b(MoE) | 仅文本 | 支持 | 不支持 |
+| Nemotron-H | Nemotron-H-8B、Nemotron-H-47B(混合 SSM-Transformer,MoE) | 仅文本 | 支持 | 支持 |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | 图像 | 不支持 | 不支持 |
+
+各架构的详细文档见[模型架构卡片](docs/model_cards_cn.md)。
+
+## 模型下载(GGUF)
+
+TensorSharp 使用 GGUF 格式模型文件。以下是各架构对应的 Hugging Face 下载链接。请根据硬件条件选择合适的量化版本(Q4_K_M 适合低内存,Q8_0 适合更高质量等)。
+
+| 架构 | 模型 | GGUF 下载 |
+|---|---|---|
+| Gemma 4 | gemma-4-E4B-it | [ggml-org/gemma-4-E4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF) |
+| Gemma 4 | gemma-4-31B-it | [ggml-org/gemma-4-31B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-31B-it-GGUF) |
+| Gemma 4 | gemma-4-26B-A4B-it(MoE) | [ggml-org/gemma-4-26B-A4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-26B-A4B-it-GGUF) |
+| Gemma 4 | gemma-4-mmproj(多模态投影器) | 包含在上述 GGUF 仓库中 |
+| Gemma 3 | gemma-3-4b-it | [google/gemma-3-4b-it-qat-q4_0-gguf](https://huggingface.co/google/gemma-3-4b-it-qat-q4_0-gguf) |
+| Qwen 3 | Qwen3-4B | [Qwen/Qwen3-4B-GGUF](https://huggingface.co/Qwen/Qwen3-4B-GGUF) |
+| Qwen 3.5 | Qwen3.5-9B | [unsloth/Qwen3.5-9B-GGUF](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) |
+| Qwen 3.5 | Qwen3.5-35B-A3B | [ggml-org/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/ggml-org/Qwen3.5-35B-A3B-GGUF) |
+| GPT OSS | gpt-oss-20b(MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
+| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
+| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+| Mistral 3 | mistral3-mmproj(Pixtral 视觉投影器) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+
+## 计算后端
+
+| 后端 | 参数 | 说明 |
+|---|---|---|
+| GGML Metal | `--backend ggml_metal` | 通过 Apple Metal(macOS)进行 GPU 加速。推荐用于 Apple Silicon。 |
+| GGML CUDA | `--backend ggml_cuda` | 通过 GGML CUDA 在 Linux + NVIDIA GPU 上进行加速。 |
+| GGML CPU | `--backend ggml_cpu` | 使用原生 GGML 与优化内核进行 CPU 推理。 |
+| 纯 C# CPU | `--backend cpu` | 无原生依赖的可移植 CPU 推理。 |
+
+## 项目结构
+
+```text
+TensorSharp/
+├── TensorSharp.Core/ # 核心张量库(Tensor、Ops、内存、设备抽象)
+├── TensorSharp.Runtime/ # GGUF、分词器、模板、采样、协议解析
+├── TensorSharp.Models/ # 模型架构实现与多模态编码/注入
+├── TensorSharp.Backends.GGML/ # GGML 后端绑定(通过原生库支持 Metal/CUDA/CPU)
+├── TensorSharp.GGML.Native/ # 到 ggml 的原生 C++ 桥接(构建 libGgmlOps)
+├── TensorSharp.Server/ # Web 聊天 + API 服务(ASP.NET Core)
+│ ├── ModelService.cs # 模型生命周期管理
+│ ├── InferenceQueue.cs # 带排队位置跟踪的 FIFO 请求队列
+│ ├── wwwroot/index.html # 聊天界面
+│ ├── testdata/ # 集成测试套件(bash + Python)
+│ └── API_EXAMPLES.md # 详细 API 文档
+├── TensorSharp.Cli/ # CLI 应用
+├── AdvUtils/ # 工具库
+└── ExternalProjects/ # 第三方依赖(ggml)
+```
+
+## NuGet 包分层
+
+现在仓库按包边界拆成独立层,使用者可以只引用真正需要的部分。
+
+| 项目 | NuGet 包 | 对外 namespace | 职责 |
+|---|---|---|---|
+| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor 原语、Ops、分配器、存储与设备抽象 |
+| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF 解析、分词器、Prompt 渲染、采样与输出协议解析 |
+| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`、各模型架构、多模态编码器与模型侧执行辅助 |
+| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML 执行后端与原生互操作 |
+| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core 服务、OpenAI/Ollama 适配层、队列与 Web UI |
+| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | 控制台宿主、调试工具与 JSONL 批处理 |
+
+这样的拆分让引擎使用者不必带上 Web 依赖,也能把 API 层改动和核心运行时隔离开,并让后续 benchmark / eval harness 更容易独立发布。
+
+## 前置要求
+
+- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
+- **macOS(Metal 后端):** 用于构建原生 GGML 库的 CMake 3.20+ 与 Xcode 命令行工具
+- **Linux(GGML CPU / CUDA 后端):** CMake 3.20+;若使用 `ggml_cuda`,还需要 NVIDIA 驱动和 CUDA Toolkit 12.x 或其他兼容版本
+- GGUF 模型文件(例如来自 [Hugging Face](https://huggingface.co))
+
+## 构建
+
+### 构建整个解决方案
+
+```bash
+dotnet build TensorSharp.slnx
+```
+
+### 构建单独应用
+
+```bash
+# 控制台应用
+dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj
+
+# Web 应用
+dotnet build TensorSharp.Server/TensorSharp.Server.csproj
+```
+
+### 构建原生 GGML 库
+
+如果原生库不存在,首次执行 `dotnet build` 时会自动构建。也可以手动构建:
+
+```bash
+cd TensorSharp.GGML.Native
+```
+
+macOS:
+
+```bash
+bash build-macos.sh
+```
+
+Linux(仅 CPU):
+
+```bash
+bash build-linux.sh
+```
+
+Linux(启用 GGML_CUDA):
+
+```bash
+bash build-linux.sh --cuda
+```
+
+也可以在 `dotnet build` 时通过环境变量请求 CUDA 版本的原生库:
+
+```bash
+TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release
+```
+
+在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。
+
+## 使用方法
+
+### 控制台应用
+
+```bash
+cd TensorSharp.Cli/bin
+
+# 文本推理
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
+ --max-tokens 200 --backend ggml_metal
+
+# Linux + NVIDIA GPU 文本推理
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
+ --max-tokens 200 --backend ggml_cuda
+
+# 图像推理(Gemma 3/4,Qwen 3.5)
+./TensorSharp.Cli --model <model.gguf> --image photo.png --backend ggml_metal
+
+# 视频推理(Gemma 4)
+./TensorSharp.Cli --model <model.gguf> --video clip.mp4 --backend ggml_metal
+
+# 音频推理(Gemma 4)
+./TensorSharp.Cli --model <model.gguf> --audio speech.wav --backend ggml_metal
+
+# 思维链 / 推理模式
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal --think
+
+# 工具调用
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
+ --tools tools.json
+
+# 使用采样参数
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
+ --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
+
+# 批处理(JSONL)
+./TensorSharp.Cli --model <model.gguf> --input-jsonl requests.jsonl \
+ --output results.txt --backend ggml_metal
+```
+
+**命令行参数:**
+
+| 参数 | 说明 |
+|---|---|
+| `--model <path>` | GGUF 模型文件路径(必填) |
+| `--input <file>` | 包含用户提示词的文本文件 |
+| `--input-jsonl <file>` | JSONL 批量请求文件(每行一个 JSON) |
+| `--multi-turn-jsonl <file>` | 用于多轮对话模拟(含 KV 缓存复用)的 JSONL 文件 |
+| `--output <file>` | 将生成文本写入该文件 |
+| `--image <file>` | 用于视觉推理的图像文件 |
+| `--video <file>` | 用于视频推理的视频文件 |
+| `--audio <file>` | 音频文件(WAV、MP3、OGG)用于音频推理 |
+| `--mmproj <file>` | 多模态投影器 GGUF 文件路径 |
+| `--max-tokens <n>` | 最大生成 token 数(默认:100) |
+| `--backend <name>` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda` |
+| `--think` | 启用思维链/推理模式 |
+| `--tools <file>` | 包含工具/函数定义的 JSON 文件 |
+| `--temperature <t>` | 采样温度(0 = 贪心) |
+| `--top-k <n>` | Top-K 过滤(0 = 关闭) |
+| `--top-p <p>` | Nucleus 采样阈值(1.0 = 关闭) |
+| `--min-p <p>` | 最小概率过滤(0 = 关闭) |
+| `--repeat-penalty <r>` | 重复惩罚(1.0 = 无) |
+| `--presence-penalty <p>` | 存在惩罚(0 = 关闭) |
+| `--frequency-penalty <p>` | 频率惩罚(0 = 关闭) |
+| `--seed <n>` | 随机种子(-1 = 非确定性) |
+| `--stop <text>` | 停止序列(可重复指定) |
+| `--test` | 运行内置测试套件 |
+
+如果把多模态投影器文件放在模型文件同目录并使用可识别命名(例如 `gemma-4-mmproj-F16.gguf`),系统会自动检测。
+
+**JSONL 输入格式:**
+
+每行是一个 JSON 对象,包含 `messages`、可选 `prompt` 和可选采样参数:
+
+```json
+{"id": "q1", "messages": [{"role": "user", "content": "What is 2+3?"}], "max_tokens": 50}
+{"id": "q2", "messages": [{"role": "user", "content": "Write a haiku."}], "max_tokens": 100, "temperature": 0.8}
+```
+
+### Web 应用
+
+```bash
+cd TensorSharp.Server/bin
+
+# 设置环境变量并运行
+MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server
+
+# Linux + NVIDIA GPU
+MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server
+```
+
+在浏览器中打开 `http://localhost:5000`。Web 界面支持:
+
+- 多轮聊天
+- 从 `MODEL_DIR` 中可用 GGUF 文件列表选择模型
+- 上传图像、视频和音频进行多模态推理(最大 500 MB)
+- 思维链/推理模式切换
+- 带函数定义的工具调用
+- 通过 Server-Sent Events 进行流式 token 生成
+- 带实时排队位置反馈的请求队列
+- 消息编辑和删除,支持从对话中任意位置重新生成
+
+**环境变量:**
+
+| 变量 | 说明 |
+|---|---|
+| `MODEL_DIR` | GGUF 模型文件所在目录 |
+| `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) |
+| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) |
+| `PORT` | HTTP 端口(默认:`5000`) |
+
+### HTTP API
+
+TensorSharp.Server 暴露三种 API 风格。完整文档及 curl/Python 示例见 [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md)。
+
+**兼容 Ollama 的 API:**
+
+```bash
+# 列出模型
+curl http://localhost:5000/api/tags
+
+# 文本生成
+curl -X POST http://localhost:5000/api/generate \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "prompt": "Hello!", "stream": false}'
+
+# 聊天
+curl -X POST http://localhost:5000/api/chat/ollama \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "stream": false}'
+
+# 启用思维链模式的聊天
+curl -X POST http://localhost:5000/api/chat/ollama \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "计算 17*23"}], "think": true, "stream": false}'
+
+# 带工具调用的聊天
+curl -X POST http://localhost:5000/api/chat/ollama \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "天气怎么样?"}], "tools": [{"function": {"name": "get_weather", "description": "获取当前天气", "parameters": {"properties": {"city": {"type": "string"}}, "required": ["city"]}}}], "stream": false}'
+```
+
+**兼容 OpenAI 的 API:**
+
+```bash
+# Chat completions
+curl -X POST http://localhost:5000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 50}'
+```
+
+**OpenAI Python SDK:**
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:5000/v1", api_key="not-needed")
+response = client.chat.completions.create(
+ model="Qwen3-4B-Q8_0.gguf",
+ messages=[{"role": "user", "content": "What is 2+3?"}],
+ max_tokens=50
+)
+print(response.choices[0].message.content)
+```
+
+**队列状态:**
+
+```bash
+curl http://localhost:5000/api/queue/status
+# {"busy":false,"pending_requests":0,"total_processed":42}
+```
+
+## 思维链 / 推理模式
+
+支持思维链模式的模型(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)可以在生成最终答案之前产出结构化的思维链推理内容。思维内容与主要回复分开,客户端可选择显示或隐藏。
+
+- **Qwen 3 / Qwen 3.5 / Nemotron-H:** 使用 `<think>...</think>` 标签
+- **Gemma 4:** 使用 `<|channel>thought\n...` 标签
+- **GPT OSS:** 使用 Harmony 格式,以 `<|channel|>analysis` 标记思维过程,以 `<|channel|>final` 标记最终回复
+
+通过 `--think`(控制台)、`"think": true`(Ollama API)或 Web 界面中的思维链开关启用。
+
+## 工具调用 / 函数调用
+
+模型可以调用用户定义的工具并参与多轮工具调用对话。将工具定义为 JSON 格式,通过 `--tools`(控制台)或 API 中的 `tools` 参数传入。
+
+各架构使用各自的工具调用格式:
+
+- **Qwen 3 / Qwen 3.5 / Nemotron-H:** `{"name": "...", "arguments": {...}}`
+- **Gemma 4:** `<|tool_call>call:function_name{args}`
+
+输出解析器(`OutputParser.cs`)会自动从模型原始输出中提取工具调用,与架构无关。
+
+## 多模态支持
+
+### Gemma 4
+
+Gemma 4 模型支持图像、视频和音频输入。将多模态投影器(`gemma-4-mmproj-F16.gguf`)放在与模型文件相同目录即可自动加载。
+
+- **图像:** PNG、JPEG
+- **视频:** MP4(使用 OpenCV 以 1 fps 抽取最多 8 帧)
+- **音频:** WAV(16kHz 单声道)、MP3、OGG Vorbis
+
+### Gemma 3 / Qwen 3.5
+
+这两类模型支持图像输入,并需要对应的多模态投影器文件。
+
+### Mistral 3
+
+Mistral 3 通过 Pixtral 视觉编码器支持图像输入。将多模态投影器(`mistral3-mmproj.gguf`)放在与模型文件相同目录即可自动加载。
+
+- **图像:** PNG、JPEG
+
+## 架构说明
+
+TensorSharp 采用分层系统结构:
+
+1. **TensorSharp.Core** 提供核心 `Tensor` 类型、存储抽象和可扩展的操作注册表(`Ops`)。CPU 实现使用 `System.Numerics.Vectors` 进行 SIMD 加速。
+
+2. **TensorSharp.Runtime** 负责运行时契约与通用服务:GGUF 解析、分词(SentencePiece / BPE)、聊天模板渲染、可配置 token 采样、输出解析,以及 `IModelArchitecture`、`IPromptRenderer`、`IOutputProtocolParser`、`IMultimodalInjector`、`IKVCachePolicy`、`IBackendExecutionPlan` 等抽象。
+
+3. **TensorSharp.Models** 实现 `ModelBase` 以及各具体模型架构和多模态辅助组件(Gemma 3/4、Qwen 3/3.5、GPT OSS、Nemotron-H、Mistral 3)。模型通过 `ModelBase.Create()` 加载,并依据 GGUF 元数据自动识别架构。
+
+4. **TensorSharp.Backends.GGML** 通过原生 C++ 桥接库(`libGgmlOps`)注册同名操作的加速实现,并链接 [ggml](https://github.com/ggml-org/ggml)。在 macOS 上可提供 Metal GPU 计算,在 Linux 上可启用面向 NVIDIA GPU 的 GGML CUDA。操作包括原生量化 matmul(Q4_K_M、Q8_0 等),无需反量化到 FP32。
+
+5. **TensorSharp.Server** 是 HTTP / 应用层,提供兼容 Ollama 与 OpenAI 的 REST API、浏览器聊天 UI、上传处理和 FIFO 推理队列。
+
+6. **TensorSharp.Cli** 是控制台 / 应用层,用于本地 prompt 运行、多模态实验、prompt 检查和 JSONL 批处理。
+
+### 性能优化
+
+- **融合 GPU decode**(Gemma 4):在 Metal 上将所有 Transformer 层合并为单次 GGML 计算图调度,将每个 token 的 CPU-GPU 往返从数百次降低到一次。相较逐算子调度约提升 2.6 倍。
+- **融合权重投影**:Q/K/V 投影融合为单次 QKV matmul;gate 与 up 投影融合为单次 gate_up matmul。
+- **原生量化计算**:量化权重(Q4_K_M、Q6_K、Q8_0 等)直接参与 matmul,无需展开为 FP32,节省内存与带宽。
+- **优化后的纯 C# CPU 路径**:托管 GEMM 快速路径和连续 Float32 内核加速了 decode、softmax、RMSNorm、RoPE、融合激活等热点路径,同时在 CPU 加载时保持量化 GGUF 权重压缩状态。
+- **环形 KV 缓存**:滑动窗口注意力层使用固定大小环形缓冲区,使内存占用不随序列长度增长。
+- **高内存效率模型加载**:大张量直接流式加载到原生内存,避免中间托管内存分配。
+
+## 测试
+
+TensorSharp.Server 的集成测试位于 `TensorSharp.Server/testdata/`。测试覆盖所有三种 API 风格(Web UI SSE、Ollama、OpenAI)、多轮对话、思维链模式、工具调用、队列行为、并发请求和中断支持。
+
+```bash
+# 先启动 TensorSharp.Server,然后运行:
+python3 TensorSharp.Server/testdata/test_multiturn.py
+# 或
+bash TensorSharp.Server/testdata/test_multiturn.sh
+```
+
+完整测试矩阵见 [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md)。
+
+## 作者
+
+Zhongkai Fu
+
+## 许可证
+
+详见 [LICENSE](LICENSE)。
+
diff --git a/TensorSharp.GGML/GgmlAllocator.cs b/TensorSharp.Backends.GGML/GgmlAllocator.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlAllocator.cs
rename to TensorSharp.Backends.GGML/GgmlAllocator.cs
diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.Backends.GGML/GgmlBasicOps.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlBasicOps.cs
rename to TensorSharp.Backends.GGML/GgmlBasicOps.cs
diff --git a/TensorSharp.GGML/GgmlContext.cs b/TensorSharp.Backends.GGML/GgmlContext.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlContext.cs
rename to TensorSharp.Backends.GGML/GgmlContext.cs
diff --git a/TensorSharp.GGML/GgmlGgufTensorDequant.cs b/TensorSharp.Backends.GGML/GgmlGgufTensorDequant.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlGgufTensorDequant.cs
rename to TensorSharp.Backends.GGML/GgmlGgufTensorDequant.cs
diff --git a/TensorSharp.GGML/GgmlLossOps.cs b/TensorSharp.Backends.GGML/GgmlLossOps.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlLossOps.cs
rename to TensorSharp.Backends.GGML/GgmlLossOps.cs
diff --git a/TensorSharp.GGML/GgmlMemoryPool.cs b/TensorSharp.Backends.GGML/GgmlMemoryPool.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlMemoryPool.cs
rename to TensorSharp.Backends.GGML/GgmlMemoryPool.cs
diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.Backends.GGML/GgmlNative.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlNative.cs
rename to TensorSharp.Backends.GGML/GgmlNative.cs
diff --git a/TensorSharp.GGML/GgmlStorage.cs b/TensorSharp.Backends.GGML/GgmlStorage.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlStorage.cs
rename to TensorSharp.Backends.GGML/GgmlStorage.cs
diff --git a/TensorSharp.GGML/TensorSharp.GGML.csproj b/TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj
similarity index 92%
rename from TensorSharp.GGML/TensorSharp.GGML.csproj
rename to TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj
index 2d93ad3..c272991 100644
--- a/TensorSharp.GGML/TensorSharp.GGML.csproj
+++ b/TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj
@@ -5,6 +5,8 @@
false
false
bin\
+ <Description>GGML backend integration for TensorSharp model execution.</Description>
+ <PackageTags>tensor;backend;ggml;native</PackageTags>
true
@@ -27,12 +29,11 @@
true
-
-
+
diff --git a/TensorSharp.Cli/GlobalUsings.cs b/TensorSharp.Cli/GlobalUsings.cs
new file mode 100644
index 0000000..df7cdd3
--- /dev/null
+++ b/TensorSharp.Cli/GlobalUsings.cs
@@ -0,0 +1,2 @@
+global using TensorSharp.Models;
+global using TensorSharp.Runtime;
diff --git a/InferenceConsole/Program.cs b/TensorSharp.Cli/Program.cs
similarity index 92%
rename from InferenceConsole/Program.cs
rename to TensorSharp.Cli/Program.cs
index 5737f51..35254b7 100644
--- a/InferenceConsole/Program.cs
+++ b/TensorSharp.Cli/Program.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -14,14 +14,15 @@
using System.Linq;
using System.Text;
using System.Text.Json;
-using InferenceEngine;
using TensorSharp;
using TensorSharp.Cpu;
-namespace InferenceConsole
+namespace TensorSharp.Cli
{
class Program
{
+ private static readonly IPromptRenderer PromptRenderer = new GgufPromptRenderer();
+
static void Main(string[] args)
{
Console.OutputEncoding = Encoding.UTF8;
@@ -112,7 +113,7 @@ static void Main(string[] args)
if (modelPath == null || !File.Exists(modelPath))
{
Console.Error.WriteLine($"Model file not found: {modelPath ?? "(none)"}");
- Console.Error.WriteLine("Usage: InferenceConsole --model [--input ] " +
+ Console.Error.WriteLine("Usage: TensorSharp.Cli --model <path> [--input <file>] " +
"[--input-jsonl <file>] [--image <file>] [--output <file>] " +
"[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal|ggml_cuda]");
return;
@@ -129,49 +130,48 @@ static void Main(string[] args)
using var model = ModelBase.Create(modelPath, backend);
- if (mmProjPath != null && model is Gemma3Model gemma3WithVision)
- {
- gemma3WithVision.LoadVisionEncoder(mmProjPath);
- }
- else if (mmProjPath != null && model is Gemma4Model gemma4WithVision)
+ if (mmProjPath != null)
{
- gemma4WithVision.LoadVisionEncoder(mmProjPath);
- if (audioPath != null)
- gemma4WithVision.LoadAudioEncoder(mmProjPath);
- }
- else if (mmProjPath != null && model is Qwen35Model qwen35WithVision)
- {
- qwen35WithVision.LoadVisionEncoder(mmProjPath);
+ model.MultimodalInjector.LoadProjectors(mmProjPath);
}
else if (imagePath != null && model.Config.Architecture == "gemma3")
{
string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "mmproj-gemma3-4b-f16.gguf");
- if (File.Exists(autoMmproj) && model is Gemma3Model g3auto)
+ if (File.Exists(autoMmproj))
{
Console.WriteLine($"Auto-loading vision encoder: {autoMmproj}");
- g3auto.LoadVisionEncoder(autoMmproj);
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
+ }
+ }
+ else if (imagePath != null && model.Config.Architecture == "mistral3")
+ {
+ string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "mistral3-mmproj.gguf");
+ if (File.Exists(autoMmproj))
+ {
+ Console.WriteLine($"Auto-loading Mistral3 vision encoder: {autoMmproj}");
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
}
}
else if ((imagePath != null || audioPath != null || videoPath != null)
&& model.Config.Architecture == "gemma4")
{
string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "gemma-4-mmproj-F16.gguf");
- if (File.Exists(autoMmproj) && model is Gemma4Model g4auto)
+ if (File.Exists(autoMmproj))
{
Console.WriteLine($"Auto-loading multimodal encoder: {autoMmproj}");
- if (imagePath != null || videoPath != null)
- g4auto.LoadVisionEncoder(autoMmproj);
- if (audioPath != null)
- g4auto.LoadAudioEncoder(autoMmproj);
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
}
}
- else if (imagePath != null && model is Qwen35Model q35auto)
+ else if (imagePath != null &&
+ (model.Config.Architecture == "qwen35" ||
+ model.Config.Architecture == "qwen35moe" ||
+ model.Config.Architecture == "qwen3next"))
{
string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "Qwen3.5-mmproj-F16.gguf");
if (File.Exists(autoMmproj))
{
Console.WriteLine($"Auto-loading vision encoder: {autoMmproj}");
- q35auto.LoadVisionEncoder(autoMmproj);
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
}
}
@@ -249,7 +249,7 @@ static void Main(string[] args)
{
new ChatMessage { Role = "user", Content = rawText }
};
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, dumpMessages, addGenerationPrompt: true,
architecture: model.Config.Architecture, tools: tools, enableThinking: enableThinking);
Console.WriteLine("=== Rendered Prompt ===");
@@ -325,7 +325,7 @@ static void RunMultiTurnTest(ModelBase model, string jsonlPath, int maxTokens,
history.Add(new ChatMessage { Role = "user", Content = userMsg });
Console.WriteLine($"\n[Turn {turn + 1}/{lines.Length}] User: {userMsg}");
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, history, addGenerationPrompt: true,
architecture: arch, enableThinking: enableThinking);
@@ -485,7 +485,7 @@ static void RunJsonlBatch(ModelBase model, string inputJsonlPath, string outputF
bool reqThinking = enableThinking ||
(root.TryGetProperty("enable_thinking", out var etProp) && etProp.GetBoolean());
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, messages, addGenerationPrompt: true,
architecture: model.Config.Architecture, enableThinking: reqThinking);
@@ -655,7 +655,7 @@ static string RunInference(ModelBase model, string rawText, List imagePa
new ChatMessage { Role = "user", Content = rawText, ImagePaths = imagePaths, AudioPaths = audioPaths, IsVideo = isVideo }
};
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, messages, addGenerationPrompt: true,
architecture: model.Config.Architecture,
tools: tools, enableThinking: enableThinking);
@@ -790,6 +790,69 @@ static string RunInference(ModelBase model, string rawText, List imagePa
Console.WriteLine("Note: No vision encoder loaded. Use --mmproj to specify the vision encoder GGUF.");
}
}
+ else if (arch == "mistral3")
+ {
+ if (model is Mistral3Model m3 && m3.VisionEncoder != null)
+ {
+ var proc = new Mistral3ImageProcessor(
+ m3.VisionEncoder.ImageSize,
+ m3.VisionEncoder.PatchSize);
+
+ int imgTokenId = Mistral3ImageProcessor.ImgTokenId;
+ int imgBreakId = Mistral3ImageProcessor.ImgBreakTokenId;
+ int imgEndId = Mistral3ImageProcessor.ImgEndTokenId;
+
+ foreach (var imgP in imagePaths)
+ {
+ var (pixels, imgW, imgH) = proc.ProcessImage(imgP);
+ var visionEmb = m3.VisionEncoder.Encode(pixels, imgW, imgH);
+ int numRows = imgH / m3.VisionEncoder.PatchSize / m3.VisionEncoder.SpatialMergeSize;
+ int numCols = imgW / m3.VisionEncoder.PatchSize / m3.VisionEncoder.SpatialMergeSize;
+
+ int tokenPosition = -1;
+ for (int i = 0; i < inputTokens.Count; i++)
+ {
+ if (inputTokens[i] == imgTokenId)
+ {
+ tokenPosition = i;
+ break;
+ }
+ }
+
+ if (tokenPosition >= 0)
+ {
+ var expanded = new List<int>();
+ for (int i = 0; i < tokenPosition; i++)
+ expanded.Add(inputTokens[i]);
+
+ for (int row = 0; row < numRows; row++)
+ {
+ for (int col = 0; col < numCols; col++)
+ expanded.Add(imgTokenId);
+ expanded.Add(row == numRows - 1 ? imgEndId : imgBreakId);
+ }
+
+ for (int i = tokenPosition + 1; i < inputTokens.Count; i++)
+ expanded.Add(inputTokens[i]);
+
+ m3.SetVisionEmbeddings(visionEmb, tokenPosition);
+ inputTokens = expanded;
+ Console.WriteLine($"Mistral3 vision: {numRows}x{numCols} merged patches, " +
+ $"{numRows * numCols + numRows} total tokens at pos {tokenPosition}");
+ }
+ else
+ {
+ visionEmb.Dispose();
+ Console.WriteLine("Warning: No [IMG] token found in prompt");
+ }
+ }
+ Console.WriteLine($"Total tokens after image expansion: {inputTokens.Count}");
+ }
+ else
+ {
+ Console.WriteLine("Note: No vision encoder loaded. Use --mmproj to specify the vision encoder GGUF.");
+ }
+ }
else
{
int imagePadId = model.Tokenizer.LookupToken("<|image_pad|>");
@@ -1382,3 +1445,6 @@ static string Escape(string s)
}
}
}
+
+
+
diff --git a/InferenceConsole/InferenceConsole.csproj b/TensorSharp.Cli/TensorSharp.Cli.csproj
similarity index 69%
rename from InferenceConsole/InferenceConsole.csproj
rename to TensorSharp.Cli/TensorSharp.Cli.csproj
index 6884313..c876cf9 100644
--- a/InferenceConsole/InferenceConsole.csproj
+++ b/TensorSharp.Cli/TensorSharp.Cli.csproj
@@ -5,6 +5,7 @@
true
false
bin\
+ <Description>Command-line host for TensorSharp model inference and diagnostics.</Description>
$(MSBuildProjectDirectory)/../TensorSharp.GGML.Native/build
@@ -12,7 +13,10 @@
libGgmlOps.so
-
+
+
+
+
diff --git a/InferenceConsole/test_requests.jsonl b/TensorSharp.Cli/test_requests.jsonl
similarity index 100%
rename from InferenceConsole/test_requests.jsonl
rename to TensorSharp.Cli/test_requests.jsonl
diff --git a/InferenceConsole/testdata/batch_thinking.jsonl b/TensorSharp.Cli/testdata/batch_thinking.jsonl
similarity index 100%
rename from InferenceConsole/testdata/batch_thinking.jsonl
rename to TensorSharp.Cli/testdata/batch_thinking.jsonl
diff --git a/InferenceConsole/testdata/example_api_thinking_tools.md b/TensorSharp.Cli/testdata/example_api_thinking_tools.md
similarity index 96%
rename from InferenceConsole/testdata/example_api_thinking_tools.md
rename to TensorSharp.Cli/testdata/example_api_thinking_tools.md
index 057a787..6f42bde 100644
--- a/InferenceConsole/testdata/example_api_thinking_tools.md
+++ b/TensorSharp.Cli/testdata/example_api_thinking_tools.md
@@ -1,4 +1,4 @@
-# Thinking Mode and Tool Call Examples
+# Thinking Mode and Tool Call Examples
## Console Application
@@ -8,11 +8,11 @@ Enable thinking mode with `--think`. The model will show its reasoning process b
```bash
# Basic thinking mode
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_thinking.txt --think --max-tokens 500
# Thinking mode with sampling
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_thinking.txt --think --max-tokens 500 \
--temperature 0.6 --top-p 0.95
```
@@ -23,17 +23,17 @@ Provide tool definitions via `--tools `. The model will output struct
```bash
# Weather tool call
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_tool_call.txt \
--tools testdata/tools_weather.json --max-tokens 300
# Calculator tool call
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_tool_calc.txt \
--tools testdata/tools_calculator.json --max-tokens 300
# Combined: thinking + tools
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_tool_call.txt \
--tools testdata/tools_weather.json --think --max-tokens 500
```
@@ -347,3 +347,4 @@ When `tools` are provided:
1. **Gemma4**: Tool declarations use `<|tool>declaration:NAME{...}` format in the system turn. The model outputs calls as `<|tool_call>call:NAME{key:<|"|>value<|"|>}`.
2. **Qwen3**: Tool definitions are injected as JSON in the system message. The model outputs calls as `{"name":"...","arguments":{...}}`.
3. **Qwen3.5**: Tool definitions use `...` format. The model outputs calls as `\nvalue\n`.
+
diff --git a/InferenceConsole/testdata/input_thinking.txt b/TensorSharp.Cli/testdata/input_thinking.txt
similarity index 100%
rename from InferenceConsole/testdata/input_thinking.txt
rename to TensorSharp.Cli/testdata/input_thinking.txt
diff --git a/InferenceConsole/testdata/input_tool_calc.txt b/TensorSharp.Cli/testdata/input_tool_calc.txt
similarity index 100%
rename from InferenceConsole/testdata/input_tool_calc.txt
rename to TensorSharp.Cli/testdata/input_tool_calc.txt
diff --git a/InferenceConsole/testdata/input_tool_call.txt b/TensorSharp.Cli/testdata/input_tool_call.txt
similarity index 100%
rename from InferenceConsole/testdata/input_tool_call.txt
rename to TensorSharp.Cli/testdata/input_tool_call.txt
diff --git a/InferenceConsole/testdata/tools_calculator.json b/TensorSharp.Cli/testdata/tools_calculator.json
similarity index 100%
rename from InferenceConsole/testdata/tools_calculator.json
rename to TensorSharp.Cli/testdata/tools_calculator.json
diff --git a/InferenceConsole/testdata/tools_weather.json b/TensorSharp.Cli/testdata/tools_weather.json
similarity index 100%
rename from InferenceConsole/testdata/tools_weather.json
rename to TensorSharp.Cli/testdata/tools_weather.json
diff --git a/TensorSharp/Core/DelegateDisposable.cs b/TensorSharp.Core/Core/DelegateDisposable.cs
similarity index 100%
rename from TensorSharp/Core/DelegateDisposable.cs
rename to TensorSharp.Core/Core/DelegateDisposable.cs
diff --git a/TensorSharp/Core/TensorConcatenation.cs b/TensorSharp.Core/Core/TensorConcatenation.cs
similarity index 100%
rename from TensorSharp/Core/TensorConcatenation.cs
rename to TensorSharp.Core/Core/TensorConcatenation.cs
diff --git a/TensorSharp/Core/TensorResultBuilder.cs b/TensorSharp.Core/Core/TensorResultBuilder.cs
similarity index 100%
rename from TensorSharp/Core/TensorResultBuilder.cs
rename to TensorSharp.Core/Core/TensorResultBuilder.cs
diff --git a/TensorSharp/Cpu/CpuAllocator.cs b/TensorSharp.Core/Cpu/CpuAllocator.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuAllocator.cs
rename to TensorSharp.Core/Cpu/CpuAllocator.cs
diff --git a/TensorSharp/Cpu/CpuBasicOps.cs b/TensorSharp.Core/Cpu/CpuBasicOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuBasicOps.cs
rename to TensorSharp.Core/Cpu/CpuBasicOps.cs
diff --git a/TensorSharp/Cpu/CpuFillCopyOps.cs b/TensorSharp.Core/Cpu/CpuFillCopyOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuFillCopyOps.cs
rename to TensorSharp.Core/Cpu/CpuFillCopyOps.cs
diff --git a/TensorSharp/Cpu/CpuIndexingOps.cs b/TensorSharp.Core/Cpu/CpuIndexingOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuIndexingOps.cs
rename to TensorSharp.Core/Cpu/CpuIndexingOps.cs
diff --git a/TensorSharp/Cpu/CpuMaxPoolingOps.cs b/TensorSharp.Core/Cpu/CpuMaxPoolingOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuMaxPoolingOps.cs
rename to TensorSharp.Core/Cpu/CpuMaxPoolingOps.cs
diff --git a/TensorSharp/Cpu/CpuNativeHelpers.cs b/TensorSharp.Core/Cpu/CpuNativeHelpers.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuNativeHelpers.cs
rename to TensorSharp.Core/Cpu/CpuNativeHelpers.cs
diff --git a/TensorSharp/Cpu/CpuOpsNative.cs b/TensorSharp.Core/Cpu/CpuOpsNative.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuOpsNative.cs
rename to TensorSharp.Core/Cpu/CpuOpsNative.cs
diff --git a/TensorSharp/Cpu/CpuRandom.cs b/TensorSharp.Core/Cpu/CpuRandom.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuRandom.cs
rename to TensorSharp.Core/Cpu/CpuRandom.cs
diff --git a/TensorSharp/Cpu/CpuStorage.cs b/TensorSharp.Core/Cpu/CpuStorage.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuStorage.cs
rename to TensorSharp.Core/Cpu/CpuStorage.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/DGEMM.cs b/TensorSharp.Core/Cpu/LinearAlgebra/DGEMM.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/DGEMM.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/DGEMM.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/LSAME.cs b/TensorSharp.Core/Cpu/LinearAlgebra/LSAME.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/LSAME.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/LSAME.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/SGEMM.cs b/TensorSharp.Core/Cpu/LinearAlgebra/SGEMM.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/SGEMM.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/SGEMM.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/XERBLA.cs b/TensorSharp.Core/Cpu/LinearAlgebra/XERBLA.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/XERBLA.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/XERBLA.cs
diff --git a/TensorSharp/Cpu/MatrixMultiplication.cs b/TensorSharp.Core/Cpu/MatrixMultiplication.cs
similarity index 100%
rename from TensorSharp/Cpu/MatrixMultiplication.cs
rename to TensorSharp.Core/Cpu/MatrixMultiplication.cs
diff --git a/TensorSharp/Cpu/NativeWrapper.cs b/TensorSharp.Core/Cpu/NativeWrapper.cs
similarity index 100%
rename from TensorSharp/Cpu/NativeWrapper.cs
rename to TensorSharp.Core/Cpu/NativeWrapper.cs
diff --git a/TensorSharp/Cpu/OpenBlasNative.cs b/TensorSharp.Core/Cpu/OpenBlasNative.cs
similarity index 100%
rename from TensorSharp/Cpu/OpenBlasNative.cs
rename to TensorSharp.Core/Cpu/OpenBlasNative.cs
diff --git a/TensorSharp/Cpu/SpatialConvolutionMM.cs b/TensorSharp.Core/Cpu/SpatialConvolutionMM.cs
similarity index 100%
rename from TensorSharp/Cpu/SpatialConvolutionMM.cs
rename to TensorSharp.Core/Cpu/SpatialConvolutionMM.cs
diff --git a/TensorSharp/DType.cs b/TensorSharp.Core/DType.cs
similarity index 100%
rename from TensorSharp/DType.cs
rename to TensorSharp.Core/DType.cs
diff --git a/TensorSharp/Expression/SExpression.cs b/TensorSharp.Core/Expression/SExpression.cs
similarity index 100%
rename from TensorSharp/Expression/SExpression.cs
rename to TensorSharp.Core/Expression/SExpression.cs
diff --git a/TensorSharp/Expression/SVar.cs b/TensorSharp.Core/Expression/SVar.cs
similarity index 100%
rename from TensorSharp/Expression/SVar.cs
rename to TensorSharp.Core/Expression/SVar.cs
diff --git a/TensorSharp/Expression/TExpression.cs b/TensorSharp.Core/Expression/TExpression.cs
similarity index 100%
rename from TensorSharp/Expression/TExpression.cs
rename to TensorSharp.Core/Expression/TExpression.cs
diff --git a/TensorSharp/Expression/TVar.cs b/TensorSharp.Core/Expression/TVar.cs
similarity index 100%
rename from TensorSharp/Expression/TVar.cs
rename to TensorSharp.Core/Expression/TVar.cs
diff --git a/TensorSharp/Half.cs b/TensorSharp.Core/Half.cs
similarity index 100%
rename from TensorSharp/Half.cs
rename to TensorSharp.Core/Half.cs
diff --git a/TensorSharp/IAllocator.cs b/TensorSharp.Core/IAllocator.cs
similarity index 100%
rename from TensorSharp/IAllocator.cs
rename to TensorSharp.Core/IAllocator.cs
diff --git a/TensorSharp/IBasicOps.cs b/TensorSharp.Core/IBasicOps.cs
similarity index 100%
rename from TensorSharp/IBasicOps.cs
rename to TensorSharp.Core/IBasicOps.cs
diff --git a/TensorSharp/OpConstraint.cs b/TensorSharp.Core/OpConstraint.cs
similarity index 100%
rename from TensorSharp/OpConstraint.cs
rename to TensorSharp.Core/OpConstraint.cs
diff --git a/TensorSharp/OpRegistry.cs b/TensorSharp.Core/OpRegistry.cs
similarity index 100%
rename from TensorSharp/OpRegistry.cs
rename to TensorSharp.Core/OpRegistry.cs
diff --git a/TensorSharp/OpRegistryAttributes.cs b/TensorSharp.Core/OpRegistryAttributes.cs
similarity index 100%
rename from TensorSharp/OpRegistryAttributes.cs
rename to TensorSharp.Core/OpRegistryAttributes.cs
diff --git a/TensorSharp/Ops.cs b/TensorSharp.Core/Ops.cs
similarity index 100%
rename from TensorSharp/Ops.cs
rename to TensorSharp.Core/Ops.cs
diff --git a/TensorSharp/Properties/AssemblyInfo.cs b/TensorSharp.Core/Properties/AssemblyInfo.cs
similarity index 100%
rename from TensorSharp/Properties/AssemblyInfo.cs
rename to TensorSharp.Core/Properties/AssemblyInfo.cs
diff --git a/TensorSharp/Properties/PublishProfiles/FolderProfile.pubxml b/TensorSharp.Core/Properties/PublishProfiles/FolderProfile.pubxml
similarity index 100%
rename from TensorSharp/Properties/PublishProfiles/FolderProfile.pubxml
rename to TensorSharp.Core/Properties/PublishProfiles/FolderProfile.pubxml
diff --git a/TensorSharp/Properties/launchSettings.json b/TensorSharp.Core/Properties/launchSettings.json
similarity index 100%
rename from TensorSharp/Properties/launchSettings.json
rename to TensorSharp.Core/Properties/launchSettings.json
diff --git a/TensorSharp/RandomGenerator.cs b/TensorSharp.Core/RandomGenerator.cs
similarity index 100%
rename from TensorSharp/RandomGenerator.cs
rename to TensorSharp.Core/RandomGenerator.cs
diff --git a/TensorSharp/RefCounted.cs b/TensorSharp.Core/RefCounted.cs
similarity index 100%
rename from TensorSharp/RefCounted.cs
rename to TensorSharp.Core/RefCounted.cs
diff --git a/TensorSharp/ReflectionExtensions.cs b/TensorSharp.Core/ReflectionExtensions.cs
similarity index 100%
rename from TensorSharp/ReflectionExtensions.cs
rename to TensorSharp.Core/ReflectionExtensions.cs
diff --git a/TensorSharp/Storage.cs b/TensorSharp.Core/Storage.cs
similarity index 100%
rename from TensorSharp/Storage.cs
rename to TensorSharp.Core/Storage.cs
diff --git a/TensorSharp/Tensor.cs b/TensorSharp.Core/Tensor.cs
similarity index 100%
rename from TensorSharp/Tensor.cs
rename to TensorSharp.Core/Tensor.cs
diff --git a/TensorSharp/TensorApplyCPU.cs b/TensorSharp.Core/TensorApplyCPU.cs
similarity index 100%
rename from TensorSharp/TensorApplyCPU.cs
rename to TensorSharp.Core/TensorApplyCPU.cs
diff --git a/TensorSharp/TensorDimIterState.cs b/TensorSharp.Core/TensorDimIterState.cs
similarity index 100%
rename from TensorSharp/TensorDimIterState.cs
rename to TensorSharp.Core/TensorDimIterState.cs
diff --git a/TensorSharp/TensorDimensionHelpers.cs b/TensorSharp.Core/TensorDimensionHelpers.cs
similarity index 100%
rename from TensorSharp/TensorDimensionHelpers.cs
rename to TensorSharp.Core/TensorDimensionHelpers.cs
diff --git a/TensorSharp/TensorFormatting.cs b/TensorSharp.Core/TensorFormatting.cs
similarity index 100%
rename from TensorSharp/TensorFormatting.cs
rename to TensorSharp.Core/TensorFormatting.cs
diff --git a/TensorSharp/TensorIterState.cs b/TensorSharp.Core/TensorIterState.cs
similarity index 100%
rename from TensorSharp/TensorIterState.cs
rename to TensorSharp.Core/TensorIterState.cs
diff --git a/TensorSharp/TensorSerialization.cs b/TensorSharp.Core/TensorSerialization.cs
similarity index 100%
rename from TensorSharp/TensorSerialization.cs
rename to TensorSharp.Core/TensorSerialization.cs
diff --git a/TensorSharp/TensorSharp.csproj b/TensorSharp.Core/TensorSharp.Core.csproj
similarity index 93%
rename from TensorSharp/TensorSharp.csproj
rename to TensorSharp.Core/TensorSharp.Core.csproj
index 126e06e..e2bb2a3 100644
--- a/TensorSharp/TensorSharp.csproj
+++ b/TensorSharp.Core/TensorSharp.Core.csproj
@@ -5,6 +5,8 @@
false
false
bin\
+ TensorSharp core tensor primitives, ops, memory management, and device abstractions.
+ tensor;core;ops;memory;device
true
@@ -54,4 +56,4 @@
-
\ No newline at end of file
+
diff --git a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj
index 36d8be5..ed8a773 100644
--- a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj
+++ b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj
@@ -7,6 +7,7 @@
enable
-
+
+
diff --git a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs
index 1962d59..9cfea69 100644
--- a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs
+++ b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs
@@ -1,6 +1,7 @@
using System;
using System.IO;
-using InferenceEngine;
+using TensorSharp.Models;
+using TensorSharp.Runtime;
static BackendType ParseBackend(string backend) => backend.ToLowerInvariant() switch
{
diff --git a/TensorSharp.Models/BackendExecutionPlan.cs b/TensorSharp.Models/BackendExecutionPlan.cs
new file mode 100644
index 0000000..4fdb90d
--- /dev/null
+++ b/TensorSharp.Models/BackendExecutionPlan.cs
@@ -0,0 +1,32 @@
+// Copyright (c) Zhongkai Fu. All rights reserved.
+// https://github.com/zhongkaifu/TensorSharp
+//
+// This file is part of TensorSharp.
+//
+// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree.
+//
+// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
+namespace TensorSharp.Models
+{
+ internal sealed class BackendExecutionPlan : IBackendExecutionPlan
+ {
+ public BackendExecutionPlan(BackendType backendType)
+ {
+ BackendType = backendType;
+ }
+
+ public BackendType BackendType { get; }
+
+ public bool UsesGgmlBackend =>
+ BackendType == BackendType.GgmlCpu ||
+ BackendType == BackendType.GgmlMetal ||
+ BackendType == BackendType.GgmlCuda;
+
+ public bool ShouldStoreWeightQuantized(GgufTensorInfo info)
+ {
+ return ModelBase.ShouldStoreWeightQuantized(BackendType, info);
+ }
+ }
+}
+
diff --git a/TensorSharp.Models/GlobalUsings.cs b/TensorSharp.Models/GlobalUsings.cs
new file mode 100644
index 0000000..e0a9f20
--- /dev/null
+++ b/TensorSharp.Models/GlobalUsings.cs
@@ -0,0 +1 @@
+global using TensorSharp.Runtime;
diff --git a/InferenceEngine/Half.cs b/TensorSharp.Models/Half.cs
similarity index 96%
rename from InferenceEngine/Half.cs
rename to TensorSharp.Models/Half.cs
index 395e4b7..2ac2393 100644
--- a/InferenceEngine/Half.cs
+++ b/TensorSharp.Models/Half.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -10,7 +10,7 @@
using System;
using System.Runtime.InteropServices;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
[StructLayout(LayoutKind.Sequential)]
public struct half
@@ -71,3 +71,4 @@ private static float HalfToFloat(ushort value)
}
}
}
+
diff --git a/InferenceEngine/ManagedQuantizedOps.cs b/TensorSharp.Models/ManagedQuantizedOps.cs
similarity index 99%
rename from InferenceEngine/ManagedQuantizedOps.cs
rename to TensorSharp.Models/ManagedQuantizedOps.cs
index 751044f..70ca137 100644
--- a/InferenceEngine/ManagedQuantizedOps.cs
+++ b/TensorSharp.Models/ManagedQuantizedOps.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -12,7 +12,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
internal static class ManagedQuantizedOps
{
@@ -696,3 +696,4 @@ private static unsafe void GetScaleMinK4(int j, byte* q, out byte d, out byte m)
}
}
}
+
diff --git a/InferenceEngine/MediaHelper.cs b/TensorSharp.Models/MediaHelper.cs
similarity index 98%
rename from InferenceEngine/MediaHelper.cs
rename to TensorSharp.Models/MediaHelper.cs
index 34e899a..3b3b6e0 100644
--- a/InferenceEngine/MediaHelper.cs
+++ b/TensorSharp.Models/MediaHelper.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -14,7 +14,7 @@
using System.Runtime.InteropServices;
using OpenCvSharp;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
public static class MediaHelper
{
@@ -223,3 +223,4 @@ private static uint Crc32Png(byte[] type, byte[] data)
}
}
}
+
diff --git a/InferenceEngine/ModelBase.cs b/TensorSharp.Models/ModelBase.cs
similarity index 96%
rename from InferenceEngine/ModelBase.cs
rename to TensorSharp.Models/ModelBase.cs
index cb9ba9a..c0e62ff 100644
--- a/InferenceEngine/ModelBase.cs
+++ b/TensorSharp.Models/ModelBase.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -19,40 +19,8 @@
using TensorSharp.Cpu;
using TensorSharp.GGML;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
- public enum BackendType
- {
- Cpu,
- GgmlCpu,
- GgmlMetal,
- GgmlCuda,
- }
-
- public class ModelConfig
- {
- public string Architecture { get; set; }
- public int HiddenSize { get; set; }
- public int NumHeads { get; set; }
- public int NumKVHeads { get; set; }
- public int KeyLength { get; set; }
- public int ValueLength { get; set; }
- public float Eps { get; set; }
- public float RopeBase { get; set; }
- public float RopeScale { get; set; } = 1f;
- public int NumLayers { get; set; }
- public int VocabSize { get; set; }
- public int IntermediateSize { get; set; }
- public string ChatTemplate { get; set; }
-
- public int NumExperts { get; set; }
- public int NumExpertsUsed { get; set; }
- public int SlidingWindow { get; set; }
- public int OriginalContextLength { get; set; }
-
- public int HeadDim => KeyLength > 0 ? KeyLength : (ValueLength > 0 ? ValueLength : HiddenSize / NumHeads);
- }
-
public class QuantizedWeight : IDisposable
{
public IntPtr Data { get; }
@@ -109,10 +77,13 @@ public static unsafe void FreeBuffer(IntPtr ptr)
}
}
- public abstract class ModelBase : IDisposable
+ public abstract class ModelBase : IModelArchitecture
{
public ModelConfig Config { get; protected set; }
public ITokenizer Tokenizer { get; protected set; }
+ public IKVCachePolicy KVCachePolicy { get; } = DefaultKvCachePolicy.Shared;
+ public IMultimodalInjector MultimodalInjector { get; }
+ public IBackendExecutionPlan ExecutionPlan { get; }
protected readonly GgufFile _gguf;
private readonly GgmlContext _ggmlContext;
@@ -124,8 +95,12 @@ public abstract class ModelBase : IDisposable
private bool _quantBackendReady;
protected int _cacheSeqLen;
+ protected int _maxContextLength;
protected float[] _logitsBuffer;
+ public int MaxContextLength => _maxContextLength;
+ public int CacheSeqLen => _cacheSeqLen;
+
// Timing
protected long _linearTicks;
protected long _attnTicks;
@@ -137,6 +112,8 @@ public abstract class ModelBase : IDisposable
protected ModelBase(string ggufPath, BackendType backend)
{
_backend = backend;
+ ExecutionPlan = new BackendExecutionPlan(backend);
+ MultimodalInjector = new ModelMultimodalInjector(this);
switch (backend)
{
case BackendType.GgmlCpu:
@@ -162,9 +139,7 @@ protected ModelBase(string ggufPath, BackendType backend)
_gguf = new GgufFile(ggufPath);
}
- protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu ||
- _backend == BackendType.GgmlMetal ||
- _backend == BackendType.GgmlCuda;
+ protected bool IsGgmlBackend => ExecutionPlan.UsesGgmlBackend;
protected void EnsureQuantBackendAvailable()
{
@@ -240,7 +215,7 @@ protected void ParseTokenizer()
protected virtual bool IsQuantizedLinearWeight(GgufTensorInfo info)
{
- return ShouldStoreWeightQuantized(_backend, info);
+ return ExecutionPlan.ShouldStoreWeightQuantized(info);
}
internal static bool ShouldStoreWeightQuantized(BackendType backend, GgufTensorInfo info)
@@ -970,8 +945,10 @@ public static ModelBase Create(string ggufPath, BackendType backend)
"gemma4" => new Gemma4Model(ggufPath, backend),
"gptoss" or "gpt-oss" => new GptOssModel(ggufPath, backend),
"nemotron_h" or "nemotron_h_moe" => new NemotronModel(ggufPath, backend),
+ "mistral3" => new Mistral3Model(ggufPath, backend),
_ => throw new NotSupportedException($"Unsupported architecture: {arch}"),
};
}
}
}
+
diff --git a/TensorSharp.Models/ModelMultimodalInjector.cs b/TensorSharp.Models/ModelMultimodalInjector.cs
new file mode 100644
index 0000000..6f12f46
--- /dev/null
+++ b/TensorSharp.Models/ModelMultimodalInjector.cs
@@ -0,0 +1,345 @@
+// Copyright (c) Zhongkai Fu. All rights reserved.
+// https://github.com/zhongkaifu/TensorSharp
+//
+// This file is part of TensorSharp.
+//
+// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree.
+//
+// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
+using System;
+using System.Collections.Generic;
+using TensorSharp;
+
+namespace TensorSharp.Models
+{
+ internal sealed class ModelMultimodalInjector : IMultimodalInjector
+ {
+ private readonly ModelBase _model;
+
+ public ModelMultimodalInjector(ModelBase model)
+ {
+ _model = model;
+ }
+
+ public void LoadProjectors(string mmProjPath)
+ {
+ if (string.IsNullOrWhiteSpace(mmProjPath))
+ return;
+
+ switch (_model)
+ {
+ case Gemma4Model g4:
+ g4.LoadVisionEncoder(mmProjPath);
+ g4.LoadAudioEncoder(mmProjPath);
+ break;
+ case Gemma3Model g3:
+ g3.LoadVisionEncoder(mmProjPath);
+ break;
+ case Qwen35Model q35:
+ q35.LoadVisionEncoder(mmProjPath);
+ break;
+ case Mistral3Model m3:
+ m3.LoadVisionEncoder(mmProjPath);
+ break;
+ }
+ }
+
+ public List ProcessPromptTokens(List history, List inputTokens)
+ {
+ if (history == null || history.Count == 0 || inputTokens == null || inputTokens.Count == 0)
+ return inputTokens;
+
+ if (_model is Gemma4Model g4)
+ return ProcessGemma4History(g4, history, inputTokens);
+ if (_model is Gemma3Model g3)
+ return ProcessGemma3History(g3, history, inputTokens);
+ if (_model is Qwen35Model q35)
+ return ProcessQwen35History(q35, history, inputTokens);
+ if (_model is Mistral3Model m3)
+ return ProcessMistral3History(m3, history, inputTokens);
+
+ return inputTokens;
+ }
+
+ private List ProcessGemma4History(Gemma4Model model, List history, List inputTokens)
+ {
+ int imageStartId = _model.Tokenizer.LookupToken("<|image>");
+ int imageEndId = _model.Tokenizer.LookupToken("");
+ if (imageStartId < 0) imageStartId = 255999;
+ if (imageEndId < 0) imageEndId = 256000;
+
+ int audioStartId = _model.Tokenizer.LookupToken("<|audio>");
+ int audioEndId = _model.Tokenizer.LookupToken("