diff --git a/InferenceWeb.Tests/BackendCatalogTests.cs b/InferenceWeb.Tests/BackendCatalogTests.cs
index 401c8d0..d646838 100644
--- a/InferenceWeb.Tests/BackendCatalogTests.cs
+++ b/InferenceWeb.Tests/BackendCatalogTests.cs
@@ -1,6 +1,4 @@
-using InferenceEngine;
-using InferenceWeb;
-using TensorSharp.GGML;
+using TensorSharp.GGML;
namespace InferenceWeb.Tests;
@@ -121,3 +119,5 @@ public void ShouldStoreWeightQuantized_GgmlBackendsKeepQuantizedWeights()
Assert.True(shouldStoreQuantized);
}
}
+
+
diff --git a/InferenceWeb.Tests/GlobalUsings.cs b/InferenceWeb.Tests/GlobalUsings.cs
new file mode 100644
index 0000000..1f7dd10
--- /dev/null
+++ b/InferenceWeb.Tests/GlobalUsings.cs
@@ -0,0 +1,3 @@
+global using TensorSharp.Models;
+global using TensorSharp.Runtime;
+global using TensorSharp.Server;
diff --git a/InferenceWeb.Tests/ImageProcessorTests.cs b/InferenceWeb.Tests/ImageProcessorTests.cs
index 92dd8ca..d82af18 100644
--- a/InferenceWeb.Tests/ImageProcessorTests.cs
+++ b/InferenceWeb.Tests/ImageProcessorTests.cs
@@ -1,5 +1,4 @@
-using InferenceEngine;
-
+
namespace InferenceWeb.Tests;
public class ImageProcessorTests
@@ -95,3 +94,4 @@ private static string WriteEmbeddedJpeg()
return path;
}
}
+
diff --git a/InferenceWeb.Tests/InferenceWeb.Tests.csproj b/InferenceWeb.Tests/InferenceWeb.Tests.csproj
index ca583c8..90960bd 100644
--- a/InferenceWeb.Tests/InferenceWeb.Tests.csproj
+++ b/InferenceWeb.Tests/InferenceWeb.Tests.csproj
@@ -15,7 +15,8 @@
-
-
+
+
+
diff --git a/InferenceWeb.Tests/KVCacheTests.cs b/InferenceWeb.Tests/KVCacheTests.cs
index c33c2cd..7bb4eea 100644
--- a/InferenceWeb.Tests/KVCacheTests.cs
+++ b/InferenceWeb.Tests/KVCacheTests.cs
@@ -1,5 +1,4 @@
-using InferenceWeb;
-
+
namespace InferenceWeb.Tests;
public class KVCacheTests
@@ -129,3 +128,4 @@ public void FindTokenPrefixLength_ThinkingModelWithContentInContext()
Assert.Equal(8, common); // Full cached is prefix
}
}
+
diff --git a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
index ffc8a6a..64e34ab 100644
--- a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
+++ b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
@@ -1,5 +1,4 @@
-using System.Buffers.Binary;
-using InferenceEngine;
+using System.Buffers.Binary;
namespace InferenceWeb.Tests;
@@ -168,3 +167,4 @@ private static float Dot(float[] lhs, float[] rhs, int rhsOffset, int length)
return sum;
}
}
+
diff --git a/InferenceWeb.Tests/MediaHelperTests.cs b/InferenceWeb.Tests/MediaHelperTests.cs
index 4be19b3..c2263a0 100644
--- a/InferenceWeb.Tests/MediaHelperTests.cs
+++ b/InferenceWeb.Tests/MediaHelperTests.cs
@@ -1,5 +1,4 @@
-using InferenceEngine;
-
+
namespace InferenceWeb.Tests;
public class MediaHelperTests
@@ -71,3 +70,4 @@ public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride()
}
}
}
+
diff --git a/InferenceWeb.Tests/ModelServiceHistoryTests.cs b/InferenceWeb.Tests/ModelServiceHistoryTests.cs
index 4a01b1e..09d4bb8 100644
--- a/InferenceWeb.Tests/ModelServiceHistoryTests.cs
+++ b/InferenceWeb.Tests/ModelServiceHistoryTests.cs
@@ -1,6 +1,4 @@
-using InferenceEngine;
-using InferenceWeb;
-
+
namespace InferenceWeb.Tests;
public class ModelServiceHistoryTests
@@ -70,3 +68,5 @@ public void PrepareHistoryForInference_NormalizesEarlierVideoTurns()
}
}
}
+
+
diff --git a/InferenceWeb.Tests/StructuredOutputTests.cs b/InferenceWeb.Tests/StructuredOutputTests.cs
index 658b4ec..3af4bce 100644
--- a/InferenceWeb.Tests/StructuredOutputTests.cs
+++ b/InferenceWeb.Tests/StructuredOutputTests.cs
@@ -1,6 +1,4 @@
-using System.Text.Json;
-using InferenceEngine;
-using InferenceWeb;
+using System.Text.Json;
namespace InferenceWeb.Tests;
@@ -206,3 +204,5 @@ public void JsonSchemaNormalizationSupportsDefsAndAnyOf()
Assert.Equal("""{"item":{"name":"Ada","age":30}}""", normalized.NormalizedContent);
}
}
+
+
diff --git a/InferenceWeb.Tests/WebUiChatPolicyTests.cs b/InferenceWeb.Tests/WebUiChatPolicyTests.cs
index eaebe36..0ba2a5c 100644
--- a/InferenceWeb.Tests/WebUiChatPolicyTests.cs
+++ b/InferenceWeb.Tests/WebUiChatPolicyTests.cs
@@ -1,5 +1,4 @@
-using InferenceWeb;
-
+
namespace InferenceWeb.Tests;
public class WebUiChatPolicyTests
@@ -31,3 +30,4 @@ public void TryValidateChatRequest_RejectsPerTurnBackendSelection()
Assert.Equal(WebUiChatPolicy.ModelSelectionLockedMessage, error);
}
}
+
diff --git a/README.md b/README.md
index 8e05731..742ebf8 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# TensorSharp
+# TensorSharp
@@ -10,8 +10,8 @@ A C# inference engine for running large language models (LLMs) locally using GGU
## Features
-- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H
-- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5
+- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H, Mistral 3
+- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5 / Mistral 3
- **Thinking / reasoning mode** -- structured chain-of-thought output with `<think>` / `<|channel|>thought` / `<|channel|>analysis` tags (Qwen 3, Qwen 3.5, Gemma 4, GPT OSS, Nemotron-H)
- **Tool calling / function calling** -- models can invoke user-defined tools; multi-turn tool-call conversations supported across all three API styles
- **Quantized model support** -- loads GGUF files with Q4_K_M, Q8_0, F16, MXFP4, and other quantization formats; performs native quantized matmul without dequantizing to FP32, including memory-efficient pure C# CPU loading for large GGUFs
@@ -38,6 +38,7 @@ A C# inference engine for running large language models (LLMs) locally using GGU
| Qwen 3.5 | Qwen3.5-9B, Qwen3.5-35B-A3B | Image | Yes | Yes |
| GPT OSS | gpt-oss-20b (MoE) | Text only | Yes | No |
| Nemotron-H | Nemotron-H-8B, Nemotron-H-47B (Hybrid SSM-Transformer, MoE) | Text only | Yes | Yes |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | Image | No | No |
See [Model Architecture Cards](docs/model_cards.md) for detailed documentation of each architecture.
@@ -58,6 +59,8 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
| GPT OSS | gpt-oss-20b (MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+| Mistral 3 | mistral3-mmproj (Pixtral vision projector) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
## Compute Backends
@@ -72,36 +75,37 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
```
TensorSharp/
-├── TensorSharp/ # Core tensor library (CPU operations, SIMD)
-├── TensorSharp.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library)
+├── TensorSharp.Core/ # Core tensor library (Tensor, Ops, memory, device abstraction)
+├── TensorSharp.Runtime/ # GGUF, tokenizers, templates, sampling, protocol parsing
+├── TensorSharp.Models/ # Model architectures and multimodal encoders/injectors
+├── TensorSharp.Backends.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library)
├── TensorSharp.GGML.Native/ # Native C++ bridge to ggml (builds libGgmlOps)
-├── AdvUtils/ # Utility library
-├── InferenceEngine/ # Model loading, tokenization, and inference logic
-│ ├── Models/
-│ │ ├── Gemma3/
-│ │ ├── Gemma4/ # Vision encoder, audio encoder, MoE, fused GPU decode
-│ │ ├── GptOss/ # MoE, attention sinks, SiLUAlphaLimit, Yarn RoPE
-│ │ ├── Nemotron/ # Hybrid Mamba2 SSM + attention + MoE FFN
-│ │ ├── Qwen3/
-│ │ └── Qwen35/
-│ ├── GgufReader.cs # GGUF file parser
-│ ├── ModelBase.cs # Base class for all model architectures
-│ ├── ChatTemplate.cs # Chat template rendering (hardcoded + Jinja2 from GGUF)
-│ ├── Jinja2Template.cs # Jinja2 template renderer
-│ ├── OutputParser.cs # Extracts thinking, content, and tool calls from model output
-│ ├── SamplingConfig.cs # Sampling parameter configuration
-│ ├── TokenSampler.cs # Token sampling (greedy, top-k, top-p, min-p, penalties)
-│ └── MediaHelper.cs # Video frame extraction, audio decoding
-├── InferenceConsole/ # CLI application
-├── InferenceWeb/ # Web chatbot + API server (ASP.NET Core)
+├── TensorSharp.Server/ # Web chatbot + API server (ASP.NET Core)
│ ├── ModelService.cs # Model lifecycle management
│ ├── InferenceQueue.cs # FIFO request queue with position tracking
│ ├── wwwroot/index.html # Chat UI
│ ├── testdata/ # Integration test suites (bash + Python)
│ └── API_EXAMPLES.md # Detailed API documentation
+├── TensorSharp.Cli/ # CLI application
+├── AdvUtils/ # Utility library
└── ExternalProjects/ # Third-party dependencies (ggml)
```
+## NuGet Packages
+
+The repository is now split along package boundaries so consumers can depend on only the layers they actually need.
+
+| Project | NuGet package | Public namespace | Responsibility |
+|---|---|---|---|
+| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor primitives, ops, allocators, storage, and device abstraction |
+| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF parsing, tokenizers, prompt rendering, sampling, and output protocol parsing |
+| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`, architecture implementations, multimodal encoders, and model-side execution helpers |
+| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML-backed execution and native interop |
+| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core server, OpenAI/Ollama adapters, queueing, and web UI |
+| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | Console host and debugging / batch tooling |
+
+This split keeps engine users off the web stack, keeps API-layer changes from leaking into core/runtime packages, and makes future benchmark or eval-harness projects easier to publish independently.
+
## Prerequisites
- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
@@ -121,10 +125,10 @@ dotnet build TensorSharp.slnx
```bash
# Console application
-dotnet build InferenceConsole/InferenceConsole.csproj
+dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj
# Web application
-dotnet build InferenceWeb/InferenceWeb.csproj
+dotnet build TensorSharp.Server/TensorSharp.Server.csproj
```
### Build the native GGML library
@@ -166,7 +170,7 @@ TENSORSHARP_GGML_NATIVE_BUILD_PARALLEL_LEVEL=2 bash build-linux.sh --cuda
You can also request a CUDA-enabled native build from `dotnet build`:
```bash
-TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
+TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release
```
On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory.
@@ -176,38 +180,38 @@ On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `bui
### Console Application
```bash
-cd InferenceConsole/bin
+cd TensorSharp.Cli/bin
# Text inference
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
--max-tokens 200 --backend ggml_metal
# Text inference on Linux + NVIDIA GPU
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
--max-tokens 200 --backend ggml_cuda
# Image inference (Gemma 3/4, Qwen 3.5)
-./InferenceConsole --model <model.gguf> --image photo.png --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --image photo.png --backend ggml_metal
# Video inference (Gemma 4)
-./InferenceConsole --model <model.gguf> --video clip.mp4 --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --video clip.mp4 --backend ggml_metal
# Audio inference (Gemma 4)
-./InferenceConsole --model <model.gguf> --audio speech.wav --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --audio speech.wav --backend ggml_metal
# Thinking / reasoning mode
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal --think
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal --think
# Tool calling
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
--tools tools.json
# With sampling parameters
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
--temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
# Batch processing (JSONL)
-./InferenceConsole --model <model.gguf> --input-jsonl requests.jsonl \
+./TensorSharp.Cli --model <model.gguf> --input-jsonl requests.jsonl \
--output results.txt --backend ggml_metal
```
@@ -253,13 +257,13 @@ Each line is a JSON object with `messages`, optional `prompt`, and optional samp
### Web Application
```bash
-cd InferenceWeb/bin
+cd TensorSharp.Server/bin
# Set environment variables and run
-MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb
+MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server
# Linux + NVIDIA GPU
-MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
+MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server
```
Open `http://localhost:5000` in your browser. The web interface supports:
@@ -284,7 +288,7 @@ Open `http://localhost:5000` in your browser. The web interface supports:
### HTTP APIs
-InferenceWeb exposes three API styles. See [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md) for full documentation with curl and Python examples.
+TensorSharp.Server exposes three API styles. See [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md) for full documentation with curl and Python examples.
**Ollama-compatible API:**
@@ -403,17 +407,27 @@ Gemma 4 models support image, video, and audio inputs. Place the multimodal proj
These models support image inputs with their respective multimodal projector files.
+### Mistral 3
+
+Mistral 3 supports image inputs via the Pixtral vision encoder. Place the multimodal projector (`mistral3-mmproj.gguf`) in the same directory as the model file for automatic loading.
+
+- **Images:** PNG, JPEG
+
## Architecture
TensorSharp is structured as a layered system:
-1. **TensorSharp** provides the core `Tensor` type, storage abstraction, and an extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
+1. **TensorSharp.Core** provides the core `Tensor` type, storage abstraction, and the extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
-2. **TensorSharp.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
+2. **TensorSharp.Runtime** owns runtime-facing contracts and services: GGUF parsing, tokenization (SentencePiece / BPE), chat template rendering, configurable token sampling, output parsing, and reusable contracts such as `IModelArchitecture`, `IPromptRenderer`, `IOutputProtocolParser`, `IMultimodalInjector`, `IKVCachePolicy`, and `IBackendExecutionPlan`.
-3. **InferenceEngine** implements model-specific logic: GGUF parsing, tokenization (SentencePiece BPE), chat template rendering (Jinja2 from GGUF metadata with hardcoded fallbacks), configurable token sampling, output parsing (thinking extraction, tool-call extraction), and the forward pass for each architecture (including hybrid SSM-Transformer models like Nemotron-H with Mamba2 layers). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
+3. **TensorSharp.Models** implements `ModelBase` plus the concrete architectures and multimodal helpers (Gemma 3/4, Qwen 3/3.5, GPT OSS, Nemotron-H, Mistral 3). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
-4. **InferenceConsole** and **InferenceWeb** are application layers that handle I/O and user interaction. InferenceWeb provides Ollama-compatible and OpenAI-compatible REST APIs alongside a browser-based chat UI, with a FIFO inference queue to serialize concurrent requests.
+4. **TensorSharp.Backends.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
+
+5. **TensorSharp.Server** is the HTTP/application layer. It provides Ollama-compatible and OpenAI-compatible REST APIs, the browser-based chat UI, upload handling, and the FIFO inference queue.
+
+6. **TensorSharp.Cli** is the console/application layer for local prompts, multimodal experiments, prompt inspection, and JSONL batch workflows.
### Performance Optimizations
@@ -426,16 +440,16 @@ TensorSharp is structured as a layered system:
## Testing
-Integration tests for InferenceWeb are in `InferenceWeb/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
+Integration tests for TensorSharp.Server are in `TensorSharp.Server/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
```bash
-# Start InferenceWeb, then run:
-python3 InferenceWeb/testdata/test_multiturn.py
+# Start TensorSharp.Server, then run:
+python3 TensorSharp.Server/testdata/test_multiturn.py
# or
-bash InferenceWeb/testdata/test_multiturn.sh
+bash TensorSharp.Server/testdata/test_multiturn.sh
```
-See [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md) for the full test matrix.
+See [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md) for the full test matrix.
## Author
@@ -444,3 +458,4 @@ Zhongkai Fu
## License
See [LICENSE](LICENSE) for details.
+
diff --git a/README_zh-cn.md b/README_zh-cn.md
index fdbf14e..9fb6ce8 100644
--- a/README_zh-cn.md
+++ b/README_zh-cn.md
@@ -1,411 +1,426 @@
-# TensorSharp
-
-
-
-
-
-[English](README.md) | [中文](README_zh-cn.md)
-
-一个用于在本地运行大型语言模型(LLM)的 C# 推理引擎,使用 GGUF 模型文件。TensorSharp 提供控制台应用、基于 Web 的聊天界面,以及兼容 Ollama/OpenAI 的 HTTP API 以便程序化调用。
-
-## 功能特性
-
-- **多架构支持** —— Gemma 4、Gemma 3、Qwen 3、Qwen 3.5、GPT OSS、Nemotron-H
-- **多模态推理** —— 图像、视频和音频输入(Gemma 4);图像输入(Gemma 3 / Qwen 3.5)
-- **思维链 / 推理模式** —— 通过 `<think>` / `<|channel|>thought` / `<|channel|>analysis` 标签输出结构化的思维链推理(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)
-- **工具调用 / 函数调用** —— 模型可调用用户定义的工具;所有三种 API 风格均支持多轮工具调用对话
-- **量化模型支持** —— 加载 Q4_K_M、Q8_0、F16、MXFP4 等量化格式的 GGUF 文件;执行原生量化矩阵乘法(matmul),无需反量化到 FP32,并且纯 C# CPU 后端在加载大型 GGUF 时也会保持量化权重压缩状态
-- **GPU 加速** —— 通过 GGML 支持 Apple Metal(macOS)和 GGML CUDA(Linux/NVIDIA);Gemma 4 在 Metal 上支持整模型融合 GPU decode(相对逐算子调度约提升 2.6 倍)
-- **优化后的纯 C# CPU 后端** —— 为 GEMM、RMSNorm、RoPE、softmax、融合激活等推理热点路径提供托管快速路径和 SIMD 内核
-- **兼容 Ollama 与 OpenAI API** —— 可作为现有工具链的即插即用替代端点
-- **可配置采样** —— temperature、top-k、top-p、min-p、重复/存在/频率惩罚、seed、停止序列
-- **聊天模板** —— 从 GGUF 元数据自动加载(Jinja2),并为不同架构提供硬编码回退模板
-- **请求队列** —— FIFO 推理队列确保单请求执行以保障 KV 缓存稳定性,并为客户端提供实时排队位置反馈
-- **批处理** —— 控制台应用支持 JSONL 输入
-- **流式输出** —— 按 token 输出(Web 通过 SSE,控制台通过 stdout)
-- **混合 SSM-Transformer** —— Nemotron-H 在单个模型中混合 Mamba2 SSM 层、纯注意力层和 MoE FFN 层
-- **专家混合(MoE)** —— 支持 Gemma 4 MoE 变体(例如 gemma-4-26B-A4B)、GPT OSS MoE(例如 gpt-oss-20b)、Nemotron-H MoE FFN 层
-- **消息编辑** —— 在 Web 聊天界面中编辑或删除历史消息,并从该位置重新生成回复
-- **大文件上传** —— Web 界面支持最大 500 MB 的视频/音频上传
-
-## 支持的模型架构
-
-| 架构 | 示例模型 | 多模态 | 思维链 | 工具调用 |
-|---|---|---|---|---|
-| Gemma 4 | gemma-4-E4B、gemma-4-31B、gemma-4-26B-A4B(MoE) | 图像、视频、音频 | 支持 | 支持 |
-| Gemma 3 | gemma-3-4b | 图像 | 不支持 | 不支持 |
-| Qwen 3 | Qwen3-4B | 仅文本 | 支持 | 支持 |
-| Qwen 3.5 | Qwen3.5-9B、Qwen3.5-35B-A3B | 图像 | 支持 | 支持 |
-| GPT OSS | gpt-oss-20b(MoE) | 仅文本 | 支持 | 不支持 |
-| Nemotron-H | Nemotron-H-8B、Nemotron-H-47B(混合 SSM-Transformer,MoE) | 仅文本 | 支持 | 支持 |
-
-各架构的详细文档见[模型架构卡片](docs/model_cards_cn.md)。
-
-## 模型下载(GGUF)
-
-TensorSharp 使用 GGUF 格式模型文件。以下是各架构对应的 Hugging Face 下载链接。请根据硬件条件选择合适的量化版本(Q4_K_M 适合低内存,Q8_0 适合更高质量等)。
-
-| 架构 | 模型 | GGUF 下载 |
-|---|---|---|
-| Gemma 4 | gemma-4-E4B-it | [ggml-org/gemma-4-E4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF) |
-| Gemma 4 | gemma-4-31B-it | [ggml-org/gemma-4-31B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-31B-it-GGUF) |
-| Gemma 4 | gemma-4-26B-A4B-it(MoE) | [ggml-org/gemma-4-26B-A4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-26B-A4B-it-GGUF) |
-| Gemma 4 | gemma-4-mmproj(多模态投影器) | 包含在上述 GGUF 仓库中 |
-| Gemma 3 | gemma-3-4b-it | [google/gemma-3-4b-it-qat-q4_0-gguf](https://huggingface.co/google/gemma-3-4b-it-qat-q4_0-gguf) |
-| Qwen 3 | Qwen3-4B | [Qwen/Qwen3-4B-GGUF](https://huggingface.co/Qwen/Qwen3-4B-GGUF) |
-| Qwen 3.5 | Qwen3.5-9B | [unsloth/Qwen3.5-9B-GGUF](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) |
-| Qwen 3.5 | Qwen3.5-35B-A3B | [ggml-org/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/ggml-org/Qwen3.5-35B-A3B-GGUF) |
-| GPT OSS | gpt-oss-20b(MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
-| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
-| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
-
-## 计算后端
-
-| 后端 | 参数 | 说明 |
-|---|---|---|
-| GGML Metal | `--backend ggml_metal` | 通过 Apple Metal(macOS)进行 GPU 加速。推荐用于 Apple Silicon。 |
-| GGML CUDA | `--backend ggml_cuda` | 通过 GGML CUDA 在 Linux + NVIDIA GPU 上进行加速。 |
-| GGML CPU | `--backend ggml_cpu` | 使用原生 GGML 与优化内核进行 CPU 推理。 |
-| 纯 C# CPU | `--backend cpu` | 无原生依赖的可移植 CPU 推理。 |
-
-## 项目结构
-
-```text
-TensorSharp/
-├── TensorSharp/ # 核心张量库(CPU 运算、SIMD)
-├── TensorSharp.GGML/ # GGML 后端绑定(通过原生库支持 Metal/CUDA/CPU)
-├── TensorSharp.GGML.Native/ # 到 ggml 的原生 C++ 桥接(构建 libGgmlOps)
-├── AdvUtils/ # 工具库
-├── InferenceEngine/ # 模型加载、分词和推理逻辑
-│ ├── Models/
-│ │ ├── Gemma3/
-│ │ ├── Gemma4/ # 视觉编码器、音频编码器、MoE、融合 GPU decode
-│ │ ├── GptOss/ # MoE、注意力沉降、SiLUAlphaLimit、Yarn RoPE
-│ │ ├── Nemotron/ # 混合 Mamba2 SSM + 注意力 + MoE FFN
-│ │ ├── Qwen3/
-│ │ └── Qwen35/
-│ ├── GgufReader.cs # GGUF 文件解析器
-│ ├── ModelBase.cs # 各模型架构基类
-│ ├── ChatTemplate.cs # 聊天模板渲染(硬编码 + 来自 GGUF 的 Jinja2)
-│ ├── Jinja2Template.cs # Jinja2 模板渲染器
-│ ├── OutputParser.cs # 从模型输出中提取思维链、内容和工具调用
-│ ├── SamplingConfig.cs # 采样参数配置
-│ ├── TokenSampler.cs # Token 采样(greedy、top-k、top-p、min-p、惩罚项)
-│ └── MediaHelper.cs # 视频抽帧、音频解码
-├── InferenceConsole/ # CLI 应用
-├── InferenceWeb/ # Web 聊天 + API 服务(ASP.NET Core)
-│ ├── ModelService.cs # 模型生命周期管理
-│ ├── InferenceQueue.cs # 带排队位置追踪的 FIFO 请求队列
-│ ├── wwwroot/index.html # 聊天界面
-│ ├── testdata/ # 集成测试套件(bash + Python)
-│ └── API_EXAMPLES.md # 详细 API 文档
-└── ExternalProjects/ # 第三方依赖(ggml)
-```
-
-## 前置要求
-
-- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
-- **macOS(Metal 后端):** 用于构建原生 GGML 库的 CMake 3.20+ 与 Xcode 命令行工具
-- **Linux(GGML CPU / CUDA 后端):** CMake 3.20+;若使用 `ggml_cuda`,还需要 NVIDIA 驱动和 CUDA Toolkit 12.x 或其他兼容版本
-- GGUF 模型文件(例如来自 [Hugging Face](https://huggingface.co))
-
-## 构建
-
-### 构建整个解决方案
-
-```bash
-dotnet build TensorSharp.slnx
-```
-
-### 构建单独应用
-
-```bash
-# 控制台应用
-dotnet build InferenceConsole/InferenceConsole.csproj
-
-# Web 应用
-dotnet build InferenceWeb/InferenceWeb.csproj
-```
-
-### 构建原生 GGML 库
-
-如果原生库不存在,首次执行 `dotnet build` 时会自动构建。也可以手动构建:
-
-```bash
-cd TensorSharp.GGML.Native
-```
-
-macOS:
-
-```bash
-bash build-macos.sh
-```
-
-Linux(仅 CPU):
-
-```bash
-bash build-linux.sh
-```
-
-Linux(启用 GGML_CUDA):
-
-```bash
-bash build-linux.sh --cuda
-```
-
-也可以在 `dotnet build` 时通过环境变量请求 CUDA 版本的原生库:
-
-```bash
-TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
-```
-
-在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。
-
-## 使用方法
-
-### 控制台应用
-
-```bash
-cd InferenceConsole/bin
-
-# 文本推理
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
- --max-tokens 200 --backend ggml_metal
-
-# Linux + NVIDIA GPU 文本推理
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
- --max-tokens 200 --backend ggml_cuda
-
-# 图像推理(Gemma 3/4,Qwen 3.5)
-./InferenceConsole --model <model.gguf> --image photo.png --backend ggml_metal
-
-# 视频推理(Gemma 4)
-./InferenceConsole --model <model.gguf> --video clip.mp4 --backend ggml_metal
-
-# 音频推理(Gemma 4)
-./InferenceConsole --model <model.gguf> --audio speech.wav --backend ggml_metal
-
-# 思维链 / 推理模式
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal --think
-
-# 工具调用
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
- --tools tools.json
-
-# 使用采样参数
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
- --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
-
-# 批处理(JSONL)
-./InferenceConsole --model <model.gguf> --input-jsonl requests.jsonl \
- --output results.txt --backend ggml_metal
-```
-
-**命令行参数:**
-
-| 参数 | 说明 |
-|---|---|
-| `--model <path>` | GGUF 模型文件路径(必填) |
-| `--input <file>` | 包含用户提示词的文本文件 |
-| `--input-jsonl <file>` | JSONL 批量请求文件(每行一个 JSON) |
-| `--multi-turn-jsonl <file>` | 用于多轮对话模拟(含 KV 缓存复用)的 JSONL 文件 |
-| `--output <file>` | 将生成文本写入该文件 |
-| `--image <file>` | 用于视觉推理的图像文件 |
-| `--video <file>` | 用于视频推理的视频文件 |
-| `--audio <file>` | 音频文件(WAV、MP3、OGG)用于音频推理 |
-| `--mmproj <file>` | 多模态投影器 GGUF 文件路径 |
-| `--max-tokens <n>` | 最大生成 token 数(默认:100) |
-| `--backend <name>` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda` |
-| `--think` | 启用思维链/推理模式 |
-| `--tools <file>` | 包含工具/函数定义的 JSON 文件 |
-| `--temperature <t>` | 采样温度(0 = 贪心) |
-| `--top-k <n>` | Top-K 过滤(0 = 关闭) |
-| `--top-p <p>` | Nucleus 采样阈值(1.0 = 关闭) |
-| `--min-p <p>` | 最小概率过滤(0 = 关闭) |
-| `--repeat-penalty <p>` | 重复惩罚(1.0 = 无) |
-| `--presence-penalty <p>` | 存在惩罚(0 = 关闭) |
-| `--frequency-penalty <p>` | 频率惩罚(0 = 关闭) |
-| `--seed <n>` | 随机种子(-1 = 非确定性) |
-| `--stop <seq>` | 停止序列(可重复指定) |
-| `--test` | 运行内置测试套件 |
-
-如果把多模态投影器文件放在模型文件同目录并使用可识别命名(例如 `gemma-4-mmproj-F16.gguf`),系统会自动检测。
-
-**JSONL 输入格式:**
-
-每行是一个 JSON 对象,包含 `messages`、可选 `prompt` 和可选采样参数:
-
-```json
-{"id": "q1", "messages": [{"role": "user", "content": "What is 2+3?"}], "max_tokens": 50}
-{"id": "q2", "messages": [{"role": "user", "content": "Write a haiku."}], "max_tokens": 100, "temperature": 0.8}
-```
-
-### Web 应用
-
-```bash
-cd InferenceWeb/bin
-
-# 设置环境变量并运行
-MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb
-
-# Linux + NVIDIA GPU
-MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
-```
-
-在浏览器中打开 `http://localhost:5000`。Web 界面支持:
-
-- 多轮聊天
-- 从 `MODEL_DIR` 中可用 GGUF 文件列表选择模型
-- 上传图像、视频和音频进行多模态推理(最大 500 MB)
-- 思维链/推理模式切换
-- 带函数定义的工具调用
-- 通过 Server-Sent Events 进行流式 token 生成
-- 带实时排队位置反馈的请求队列
-- 消息编辑和删除,支持从对话中任意位置重新生成
-
-**环境变量:**
-
-| 变量 | 说明 |
-|---|---|
-| `MODEL_DIR` | GGUF 模型文件所在目录 |
-| `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) |
-| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) |
-| `PORT` | HTTP 端口(默认:`5000`) |
-
-### HTTP API
-
-InferenceWeb 暴露三种 API 风格。完整文档及 curl/Python 示例见 [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md)。
-
-**兼容 Ollama 的 API:**
-
-```bash
-# 列出模型
-curl http://localhost:5000/api/tags
-
-# 文本生成
-curl -X POST http://localhost:5000/api/generate \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "prompt": "Hello!", "stream": false}'
-
-# 聊天
-curl -X POST http://localhost:5000/api/chat/ollama \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "stream": false}'
-
-# 启用思维链模式的聊天
-curl -X POST http://localhost:5000/api/chat/ollama \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "计算 17*23"}], "think": true, "stream": false}'
-
-# 带工具调用的聊天
-curl -X POST http://localhost:5000/api/chat/ollama \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "天气怎么样?"}], "tools": [{"function": {"name": "get_weather", "description": "获取当前天气", "parameters": {"properties": {"city": {"type": "string"}}, "required": ["city"]}}}], "stream": false}'
-```
-
-**兼容 OpenAI 的 API:**
-
-```bash
-# Chat completions
-curl -X POST http://localhost:5000/v1/chat/completions \
- -H "Content-Type: application/json" \
- -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 50}'
-```
-
-**OpenAI Python SDK:**
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:5000/v1", api_key="not-needed")
-response = client.chat.completions.create(
- model="Qwen3-4B-Q8_0.gguf",
- messages=[{"role": "user", "content": "What is 2+3?"}],
- max_tokens=50
-)
-print(response.choices[0].message.content)
-```
-
-**队列状态:**
-
-```bash
-curl http://localhost:5000/api/queue/status
-# {"busy":false,"pending_requests":0,"total_processed":42}
-```
-
-## 思维链 / 推理模式
-
-支持思维链模式的模型(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)可以在生成最终答案之前产出结构化的思维链推理内容。思维内容与主要回复分开,客户端可选择显示或隐藏。
-
-- **Qwen 3 / Qwen 3.5 / Nemotron-H:** 使用 `<think>...</think>` 标签
-- **Gemma 4:** 使用 `<|channel|>thought\n...` 标签
-- **GPT OSS:** 使用 Harmony 格式,以 `<|channel|>analysis` 标记思维过程,以 `<|channel|>final` 标记最终回复
-
-通过 `--think`(控制台)、`"think": true`(Ollama API)或 Web 界面中的思维链开关启用。
-
-## 工具调用 / 函数调用
-
-模型可以调用用户定义的工具并参与多轮工具调用对话。将工具定义为 JSON 格式,通过 `--tools`(控制台)或 API 中的 `tools` 参数传入。
-
-各架构使用各自的工具调用格式:
-
-- **Qwen 3 / Qwen 3.5 / Nemotron-H:** `{"name": "...", "arguments": {...}}`
-- **Gemma 4:** `<|tool_call>call:function_name{args}`
-
-输出解析器(`OutputParser.cs`)会自动从模型原始输出中提取工具调用,与架构无关。
-
-## 多模态支持
-
-### Gemma 4
-
-Gemma 4 模型支持图像、视频和音频输入。将多模态投影器(`gemma-4-mmproj-F16.gguf`)放在与模型文件相同目录即可自动加载。
-
-- **图像:** PNG、JPEG
-- **视频:** MP4(使用 OpenCV 以 1 fps 抽取最多 8 帧)
-- **音频:** WAV(16kHz 单声道)、MP3、OGG Vorbis
-
-### Gemma 3 / Qwen 3.5
-
-这两类模型支持图像输入,并需要对应的多模态投影器文件。
-
-## 架构说明
-
-TensorSharp 采用分层系统结构:
-
-1. **TensorSharp** 提供核心 `Tensor` 类型、存储抽象和可扩展的操作注册表(`Ops`)。CPU 实现使用 `System.Numerics.Vectors` 进行 SIMD 加速。
-
-2. **TensorSharp.GGML** 通过原生 C++ 桥接库(`libGgmlOps`)注册同名操作的加速实现,并链接 [ggml](https://github.com/ggml-org/ggml)。在 macOS 上可提供 Metal GPU 计算,在 Linux 上可启用面向 NVIDIA GPU 的 GGML CUDA。操作包括原生量化 matmul(Q4_K_M、Q8_0 等),无需反量化到 FP32。
-
-3. **InferenceEngine** 实现模型相关逻辑:GGUF 解析、分词(SentencePiece BPE)、聊天模板渲染(来自 GGUF 元数据的 Jinja2 + 硬编码回退)、可配置 token 采样、输出解析(思维链提取、工具调用提取),以及各架构前向计算(包括 Nemotron-H 等混合 SSM-Transformer 模型的 Mamba2 层)。模型通过 `ModelBase.Create()` 加载,并依据 GGUF 元数据自动识别架构。
-
-4. **InferenceConsole** 与 **InferenceWeb** 是应用层,负责 I/O 和用户交互。InferenceWeb 同时提供兼容 Ollama 与 OpenAI 的 REST API 以及浏览器聊天 UI,并使用 FIFO 推理队列来串行化并发请求。
-
-### 性能优化
-
-- **融合 GPU decode**(Gemma 4):在 Metal 上将所有 Transformer 层合并为单次 GGML 计算图调度,将每个 token 的 CPU-GPU 往返从数百次降低到一次。相较逐算子调度约提升 2.6 倍。
-- **融合权重投影**:Q/K/V 投影融合为单次 QKV matmul;gate 与 up 投影融合为单次 gate_up matmul。
-- **原生量化计算**:量化权重(Q4_K_M、Q6_K、Q8_0 等)直接参与 matmul,无需展开为 FP32,节省内存与带宽。
-- **优化后的纯 C# CPU 路径**:托管 GEMM 快速路径和连续 Float32 内核加速了 decode、softmax、RMSNorm、RoPE、融合激活等热点路径,同时在 CPU 加载时保持量化 GGUF 权重压缩状态。
-- **环形 KV 缓存**:滑动窗口注意力层使用固定大小环形缓冲区,使内存占用不随序列长度增长。
-- **高内存效率模型加载**:大张量直接流式加载到原生内存,避免中间托管内存分配。
-
-## 测试
-
-InferenceWeb 的集成测试位于 `InferenceWeb/testdata/`。测试覆盖所有三种 API 风格(Web UI SSE、Ollama、OpenAI)、多轮对话、思维链模式、工具调用、队列行为、并发请求和中断支持。
-
-```bash
-# 先启动 InferenceWeb,然后运行:
-python3 InferenceWeb/testdata/test_multiturn.py
-# 或
-bash InferenceWeb/testdata/test_multiturn.sh
-```
-
-完整测试矩阵见 [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md)。
-
-## 作者
-
-Zhongkai Fu
-
-## 许可证
-
-详见 [LICENSE](LICENSE)。
+# TensorSharp
+
+
+
+
+
+[English](README.md) | [中文](README_zh-cn.md)
+
+一个用于在本地运行大型语言模型(LLM)的 C# 推理引擎,使用 GGUF 模型文件。TensorSharp 提供控制台应用、基于 Web 的聊天界面,以及兼容 Ollama/OpenAI 的 HTTP API 以便程序化调用。
+
+## 功能特性
+
+- **多架构支持** —— Gemma 4、Gemma 3、Qwen 3、Qwen 3.5、GPT OSS、Nemotron-H、Mistral 3
+- **多模态推理** —— 图像、视频和音频输入(Gemma 4);图像输入(Gemma 3 / Qwen 3.5 / Mistral 3)
+- **思维链 / 推理模式** —— 通过 `<think>` / `<|channel>thought` / `<|channel>analysis` 标签输出结构化的思维链推理(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)
+- **工具调用 / 函数调用** —— 模型可调用用户定义的工具;所有三种 API 风格均支持多轮工具调用对话
+- **量化模型支持** —— 加载 Q4_K_M、Q8_0、F16、MXFP4 等量化格式的 GGUF 文件;执行原生量化矩阵乘法(matmul),无需反量化到 FP32,并且纯 C# CPU 后端在加载大型 GGUF 时也会保持量化权重压缩状态
+- **GPU 加速** —— 通过 GGML 支持 Apple Metal(macOS)和 GGML CUDA(Linux/NVIDIA);Gemma 4 在 Metal 上支持整模型融合 GPU decode(相对逐算子调度约提升 2.6 倍)
+- **优化后的纯 C# CPU 后端** —— 为 GEMM、RMSNorm、RoPE、softmax、融合激活等推理热点路径提供托管快速路径和 SIMD 内核
+- **兼容 Ollama 与 OpenAI API** —— 可作为现有工具链的即插即用替代端点
+- **可配置采样** —— temperature、top-k、top-p、min-p、重复/存在/频率惩罚、seed、停止序列
+- **聊天模板** —— 从 GGUF 元数据自动加载(Jinja2),并为不同架构提供硬编码回退模板
+- **请求队列** —— FIFO 推理队列确保单请求执行以保障 KV 缓存稳定性,并为客户端提供实时排队位置反馈
+- **批处理** —— 控制台应用支持 JSONL 输入
+- **流式输出** —— 按 token 输出(Web 通过 SSE,控制台通过 stdout)
+- **混合 SSM-Transformer** —— Nemotron-H 在单个模型中混合 Mamba2 SSM 层、纯注意力层和 MoE FFN 层
+- **专家混合(MoE)** —— 支持 Gemma 4 MoE 变体(例如 gemma-4-26B-A4B)、GPT OSS MoE(例如 gpt-oss-20b)、Nemotron-H MoE FFN 层
+- **消息编辑** —— 在 Web 聊天界面中编辑或删除历史消息,并从该位置重新生成回复
+- **大文件上传** —— Web 界面支持最大 500 MB 的视频/音频上传
+
+## 支持的模型架构
+
+| 架构 | 示例模型 | 多模态 | 思维链 | 工具调用 |
+|---|---|---|---|---|
+| Gemma 4 | gemma-4-E4B、gemma-4-31B、gemma-4-26B-A4B(MoE) | 图像、视频、音频 | 支持 | 支持 |
+| Gemma 3 | gemma-3-4b | 图像 | 不支持 | 不支持 |
+| Qwen 3 | Qwen3-4B | 仅文本 | 支持 | 支持 |
+| Qwen 3.5 | Qwen3.5-9B、Qwen3.5-35B-A3B | 图像 | 支持 | 支持 |
+| GPT OSS | gpt-oss-20b(MoE) | 仅文本 | 支持 | 不支持 |
+| Nemotron-H | Nemotron-H-8B、Nemotron-H-47B(混合 SSM-Transformer,MoE) | 仅文本 | 支持 | 支持 |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | 图像 | 不支持 | 不支持 |
+
+各架构的详细文档见[模型架构卡片](docs/model_cards_cn.md)。
+
+## 模型下载(GGUF)
+
+TensorSharp 使用 GGUF 格式模型文件。以下是各架构对应的 Hugging Face 下载链接。请根据硬件条件选择合适的量化版本(Q4_K_M 适合低内存,Q8_0 适合更高质量等)。
+
+| 架构 | 模型 | GGUF 下载 |
+|---|---|---|
+| Gemma 4 | gemma-4-E4B-it | [ggml-org/gemma-4-E4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF) |
+| Gemma 4 | gemma-4-31B-it | [ggml-org/gemma-4-31B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-31B-it-GGUF) |
+| Gemma 4 | gemma-4-26B-A4B-it(MoE) | [ggml-org/gemma-4-26B-A4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-26B-A4B-it-GGUF) |
+| Gemma 4 | gemma-4-mmproj(多模态投影器) | 包含在上述 GGUF 仓库中 |
+| Gemma 3 | gemma-3-4b-it | [google/gemma-3-4b-it-qat-q4_0-gguf](https://huggingface.co/google/gemma-3-4b-it-qat-q4_0-gguf) |
+| Qwen 3 | Qwen3-4B | [Qwen/Qwen3-4B-GGUF](https://huggingface.co/Qwen/Qwen3-4B-GGUF) |
+| Qwen 3.5 | Qwen3.5-9B | [unsloth/Qwen3.5-9B-GGUF](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) |
+| Qwen 3.5 | Qwen3.5-35B-A3B | [ggml-org/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/ggml-org/Qwen3.5-35B-A3B-GGUF) |
+| GPT OSS | gpt-oss-20b(MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
+| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
+| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+| Mistral 3 | mistral3-mmproj(Pixtral 视觉投影器) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+
+## 计算后端
+
+| 后端 | 参数 | 说明 |
+|---|---|---|
+| GGML Metal | `--backend ggml_metal` | 通过 Apple Metal(macOS)进行 GPU 加速。推荐用于 Apple Silicon。 |
+| GGML CUDA | `--backend ggml_cuda` | 通过 GGML CUDA 在 Linux + NVIDIA GPU 上进行加速。 |
+| GGML CPU | `--backend ggml_cpu` | 使用原生 GGML 与优化内核进行 CPU 推理。 |
+| 纯 C# CPU | `--backend cpu` | 无原生依赖的可移植 CPU 推理。 |
+
+## 项目结构
+
+```text
+TensorSharp/
+├── TensorSharp.Core/ # 核心张量库(Tensor、Ops、内存、设备抽象)
+├── TensorSharp.Runtime/ # GGUF、分词器、模板、采样、协议解析
+├── TensorSharp.Models/ # 模型架构实现与多模态编码/注入
+├── TensorSharp.Backends.GGML/ # GGML 后端绑定(通过原生库支持 Metal/CUDA/CPU)
+├── TensorSharp.GGML.Native/ # 到 ggml 的原生 C++ 桥接(构建 libGgmlOps)
+├── TensorSharp.Server/ # Web 聊天 + API 服务(ASP.NET Core)
+│ ├── ModelService.cs # 模型生命周期管理
+│ ├── InferenceQueue.cs # 带排队位置跟踪的 FIFO 请求队列
+│ ├── wwwroot/index.html # 聊天界面
+│ ├── testdata/ # 集成测试套件(bash + Python)
+│ └── API_EXAMPLES.md # 详细 API 文档
+├── TensorSharp.Cli/ # CLI 应用
+├── AdvUtils/ # 工具库
+└── ExternalProjects/ # 第三方依赖(ggml)
+```
+
+## NuGet 包分层
+
+现在仓库按包边界拆成独立层,使用者可以只引用真正需要的部分。
+
+| 项目 | NuGet 包 | 对外 namespace | 职责 |
+|---|---|---|---|
+| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor 原语、Ops、分配器、存储与设备抽象 |
+| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF 解析、分词器、Prompt 渲染、采样与输出协议解析 |
+| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`、各模型架构、多模态编码器与模型侧执行辅助 |
+| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML 执行后端与原生互操作 |
+| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core 服务、OpenAI/Ollama 适配层、队列与 Web UI |
+| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | 控制台宿主、调试工具与 JSONL 批处理 |
+
+这样的拆分让引擎使用者不必带上 Web 依赖,也能把 API 层改动和核心运行时隔离开,并让后续 benchmark / eval harness 更容易独立发布。
+
+## 前置要求
+
+- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
+- **macOS(Metal 后端):** 用于构建原生 GGML 库的 CMake 3.20+ 与 Xcode 命令行工具
+- **Linux(GGML CPU / CUDA 后端):** CMake 3.20+;若使用 `ggml_cuda`,还需要 NVIDIA 驱动和 CUDA Toolkit 12.x 或其他兼容版本
+- GGUF 模型文件(例如来自 [Hugging Face](https://huggingface.co))
+
+## 构建
+
+### 构建整个解决方案
+
+```bash
+dotnet build TensorSharp.slnx
+```
+
+### 构建单独应用
+
+```bash
+# 控制台应用
+dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj
+
+# Web 应用
+dotnet build TensorSharp.Server/TensorSharp.Server.csproj
+```
+
+### 构建原生 GGML 库
+
+如果原生库不存在,首次执行 `dotnet build` 时会自动构建。也可以手动构建:
+
+```bash
+cd TensorSharp.GGML.Native
+```
+
+macOS:
+
+```bash
+bash build-macos.sh
+```
+
+Linux(仅 CPU):
+
+```bash
+bash build-linux.sh
+```
+
+Linux(启用 GGML_CUDA):
+
+```bash
+bash build-linux.sh --cuda
+```
+
+也可以在 `dotnet build` 时通过环境变量请求 CUDA 版本的原生库:
+
+```bash
+TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release
+```
+
+在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。
+
+## 使用方法
+
+### 控制台应用
+
+```bash
+cd TensorSharp.Cli/bin
+
+# 文本推理
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
+ --max-tokens 200 --backend ggml_metal
+
+# Linux + NVIDIA GPU 文本推理
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
+ --max-tokens 200 --backend ggml_cuda
+
+# 图像推理(Gemma 3/4,Qwen 3.5)
+./TensorSharp.Cli --model <model.gguf> --image photo.png --backend ggml_metal
+
+# 视频推理(Gemma 4)
+./TensorSharp.Cli --model <model.gguf> --video clip.mp4 --backend ggml_metal
+
+# 音频推理(Gemma 4)
+./TensorSharp.Cli --model <model.gguf> --audio speech.wav --backend ggml_metal
+
+# 思维链 / 推理模式
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal --think
+
+# 工具调用
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
+ --tools tools.json
+
+# 使用采样参数
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
+ --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
+
+# 批处理(JSONL)
+./TensorSharp.Cli --model <model.gguf> --input-jsonl requests.jsonl \
+ --output results.txt --backend ggml_metal
+```
+
+**命令行参数:**
+
+| 参数 | 说明 |
+|---|---|
+| `--model <path>` | GGUF 模型文件路径(必填) |
+| `--input <file>` | 包含用户提示词的文本文件 |
+| `--input-jsonl <file>` | JSONL 批量请求文件(每行一个 JSON) |
+| `--multi-turn-jsonl <file>` | 用于多轮对话模拟(含 KV 缓存复用)的 JSONL 文件 |
+| `--output <file>` | 将生成文本写入该文件 |
+| `--image <file>` | 用于视觉推理的图像文件 |
+| `--video <file>` | 用于视频推理的视频文件 |
+| `--audio <file>` | 音频文件(WAV、MP3、OGG)用于音频推理 |
+| `--mmproj <file>` | 多模态投影器 GGUF 文件路径 |
+| `--max-tokens <n>` | 最大生成 token 数(默认:100) |
+| `--backend <name>` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda` |
+| `--think` | 启用思维链/推理模式 |
+| `--tools <file>` | 包含工具/函数定义的 JSON 文件 |
+| `--temperature <t>` | 采样温度(0 = 贪心) |
+| `--top-k <n>` | Top-K 过滤(0 = 关闭) |
+| `--top-p <p>` | Nucleus 采样阈值(1.0 = 关闭) |
+| `--min-p <p>` | 最小概率过滤(0 = 关闭) |
+| `--repeat-penalty <r>` | 重复惩罚(1.0 = 无) |
+| `--presence-penalty <p>` | 存在惩罚(0 = 关闭) |
+| `--frequency-penalty <p>` | 频率惩罚(0 = 关闭) |
+| `--seed <n>` | 随机种子(-1 = 非确定性) |
+| `--stop <text>` | 停止序列(可重复指定) |
+| `--test` | 运行内置测试套件 |
+
+如果把多模态投影器文件放在模型文件同目录并使用可识别命名(例如 `gemma-4-mmproj-F16.gguf`),系统会自动检测。
+
+**JSONL 输入格式:**
+
+每行是一个 JSON 对象,包含 `messages`、可选 `prompt` 和可选采样参数:
+
+```json
+{"id": "q1", "messages": [{"role": "user", "content": "What is 2+3?"}], "max_tokens": 50}
+{"id": "q2", "messages": [{"role": "user", "content": "Write a haiku."}], "max_tokens": 100, "temperature": 0.8}
+```
+
+### Web 应用
+
+```bash
+cd TensorSharp.Server/bin
+
+# 设置环境变量并运行
+MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server
+
+# Linux + NVIDIA GPU
+MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server
+```
+
+在浏览器中打开 `http://localhost:5000`。Web 界面支持:
+
+- 多轮聊天
+- 从 `MODEL_DIR` 中可用 GGUF 文件列表选择模型
+- 上传图像、视频和音频进行多模态推理(最大 500 MB)
+- 思维链/推理模式切换
+- 带函数定义的工具调用
+- 通过 Server-Sent Events 进行流式 token 生成
+- 带实时排队位置反馈的请求队列
+- 消息编辑和删除,支持从对话中任意位置重新生成
+
+**环境变量:**
+
+| 变量 | 说明 |
+|---|---|
+| `MODEL_DIR` | GGUF 模型文件所在目录 |
+| `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) |
+| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) |
+| `PORT` | HTTP 端口(默认:`5000`) |
+
+### HTTP API
+
+TensorSharp.Server 暴露三种 API 风格。完整文档及 curl/Python 示例见 [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md)。
+
+**兼容 Ollama 的 API:**
+
+```bash
+# 列出模型
+curl http://localhost:5000/api/tags
+
+# 文本生成
+curl -X POST http://localhost:5000/api/generate \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "prompt": "Hello!", "stream": false}'
+
+# 聊天
+curl -X POST http://localhost:5000/api/chat/ollama \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "stream": false}'
+
+# 启用思维链模式的聊天
+curl -X POST http://localhost:5000/api/chat/ollama \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "计算 17*23"}], "think": true, "stream": false}'
+
+# 带工具调用的聊天
+curl -X POST http://localhost:5000/api/chat/ollama \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "天气怎么样?"}], "tools": [{"function": {"name": "get_weather", "description": "获取当前天气", "parameters": {"properties": {"city": {"type": "string"}}, "required": ["city"]}}}], "stream": false}'
+```
+
+**兼容 OpenAI 的 API:**
+
+```bash
+# Chat completions
+curl -X POST http://localhost:5000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 50}'
+```
+
+**OpenAI Python SDK:**
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:5000/v1", api_key="not-needed")
+response = client.chat.completions.create(
+ model="Qwen3-4B-Q8_0.gguf",
+ messages=[{"role": "user", "content": "What is 2+3?"}],
+ max_tokens=50
+)
+print(response.choices[0].message.content)
+```
+
+**队列状态:**
+
+```bash
+curl http://localhost:5000/api/queue/status
+# {"busy":false,"pending_requests":0,"total_processed":42}
+```
+
+## 思维链 / 推理模式
+
+支持思维链模式的模型(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)可以在生成最终答案之前产出结构化的思维链推理内容。思维内容与主要回复分开,客户端可选择显示或隐藏。
+
+- **Qwen 3 / Qwen 3.5 / Nemotron-H:** 使用 `<think>...</think>` 标签
+- **Gemma 4:** 使用 `<|channel>thought\n...` 标签
+- **GPT OSS:** 使用 Harmony 格式,以 `<|channel|>analysis` 标记思维过程,以 `<|channel|>final` 标记最终回复
+
+通过 `--think`(控制台)、`"think": true`(Ollama API)或 Web 界面中的思维链开关启用。
+
+## 工具调用 / 函数调用
+
+模型可以调用用户定义的工具并参与多轮工具调用对话。将工具定义为 JSON 格式,通过 `--tools`(控制台)或 API 中的 `tools` 参数传入。
+
+各架构使用各自的工具调用格式:
+
+- **Qwen 3 / Qwen 3.5 / Nemotron-H:** `{"name": "...", "arguments": {...}}`
+- **Gemma 4:** `<|tool_call>call:function_name{args}`
+
+输出解析器(`OutputParser.cs`)会自动从模型原始输出中提取工具调用,与架构无关。
+
+## 多模态支持
+
+### Gemma 4
+
+Gemma 4 模型支持图像、视频和音频输入。将多模态投影器(`gemma-4-mmproj-F16.gguf`)放在与模型文件相同目录即可自动加载。
+
+- **图像:** PNG、JPEG
+- **视频:** MP4(使用 OpenCV 以 1 fps 抽取最多 8 帧)
+- **音频:** WAV(16kHz 单声道)、MP3、OGG Vorbis
+
+### Gemma 3 / Qwen 3.5
+
+这两类模型支持图像输入,并需要对应的多模态投影器文件。
+
+### Mistral 3
+
+Mistral 3 通过 Pixtral 视觉编码器支持图像输入。将多模态投影器(`mistral3-mmproj.gguf`)放在与模型文件相同目录即可自动加载。
+
+- **图像:** PNG、JPEG
+
+## 架构说明
+
+TensorSharp 采用分层系统结构:
+
+1. **TensorSharp.Core** 提供核心 `Tensor` 类型、存储抽象和可扩展的操作注册表(`Ops`)。CPU 实现使用 `System.Numerics.Vectors` 进行 SIMD 加速。
+
+2. **TensorSharp.Runtime** 负责运行时契约与通用服务:GGUF 解析、分词(SentencePiece / BPE)、聊天模板渲染、可配置 token 采样、输出解析,以及 `IModelArchitecture`、`IPromptRenderer`、`IOutputProtocolParser`、`IMultimodalInjector`、`IKVCachePolicy`、`IBackendExecutionPlan` 等抽象。
+
+3. **TensorSharp.Models** 实现 `ModelBase` 以及各具体模型架构和多模态辅助组件(Gemma 3/4、Qwen 3/3.5、GPT OSS、Nemotron-H、Mistral 3)。模型通过 `ModelBase.Create()` 加载,并依据 GGUF 元数据自动识别架构。
+
+4. **TensorSharp.Backends.GGML** 通过原生 C++ 桥接库(`libGgmlOps`)注册同名操作的加速实现,并链接 [ggml](https://github.com/ggml-org/ggml)。在 macOS 上可提供 Metal GPU 计算,在 Linux 上可启用面向 NVIDIA GPU 的 GGML CUDA。操作包括原生量化 matmul(Q4_K_M、Q8_0 等),无需反量化到 FP32。
+
+5. **TensorSharp.Server** 是 HTTP / 应用层,提供兼容 Ollama 与 OpenAI 的 REST API、浏览器聊天 UI、上传处理和 FIFO 推理队列。
+
+6. **TensorSharp.Cli** 是控制台 / 应用层,用于本地 prompt 运行、多模态实验、prompt 检查和 JSONL 批处理。
+
+### 性能优化
+
+- **融合 GPU decode**(Gemma 4):在 Metal 上将所有 Transformer 层合并为单次 GGML 计算图调度,将每个 token 的 CPU-GPU 往返从数百次降低到一次。相较逐算子调度约提升 2.6 倍。
+- **融合权重投影**:Q/K/V 投影融合为单次 QKV matmul;gate 与 up 投影融合为单次 gate_up matmul。
+- **原生量化计算**:量化权重(Q4_K_M、Q6_K、Q8_0 等)直接参与 matmul,无需展开为 FP32,节省内存与带宽。
+- **优化后的纯 C# CPU 路径**:托管 GEMM 快速路径和连续 Float32 内核加速了 decode、softmax、RMSNorm、RoPE、融合激活等热点路径,同时在 CPU 加载时保持量化 GGUF 权重压缩状态。
+- **环形 KV 缓存**:滑动窗口注意力层使用固定大小环形缓冲区,使内存占用不随序列长度增长。
+- **高内存效率模型加载**:大张量直接流式加载到原生内存,避免中间托管内存分配。
+
+## 测试
+
+TensorSharp.Server 的集成测试位于 `TensorSharp.Server/testdata/`。测试覆盖所有三种 API 风格(Web UI SSE、Ollama、OpenAI)、多轮对话、思维链模式、工具调用、队列行为、并发请求和中断支持。
+
+```bash
+# 先启动 TensorSharp.Server,然后运行:
+python3 TensorSharp.Server/testdata/test_multiturn.py
+# 或
+bash TensorSharp.Server/testdata/test_multiturn.sh
+```
+
+完整测试矩阵见 [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md)。
+
+## 作者
+
+Zhongkai Fu
+
+## 许可证
+
+详见 [LICENSE](LICENSE)。
+
diff --git a/TensorSharp.GGML/GgmlAllocator.cs b/TensorSharp.Backends.GGML/GgmlAllocator.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlAllocator.cs
rename to TensorSharp.Backends.GGML/GgmlAllocator.cs
diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.Backends.GGML/GgmlBasicOps.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlBasicOps.cs
rename to TensorSharp.Backends.GGML/GgmlBasicOps.cs
diff --git a/TensorSharp.GGML/GgmlContext.cs b/TensorSharp.Backends.GGML/GgmlContext.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlContext.cs
rename to TensorSharp.Backends.GGML/GgmlContext.cs
diff --git a/TensorSharp.GGML/GgmlGgufTensorDequant.cs b/TensorSharp.Backends.GGML/GgmlGgufTensorDequant.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlGgufTensorDequant.cs
rename to TensorSharp.Backends.GGML/GgmlGgufTensorDequant.cs
diff --git a/TensorSharp.GGML/GgmlLossOps.cs b/TensorSharp.Backends.GGML/GgmlLossOps.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlLossOps.cs
rename to TensorSharp.Backends.GGML/GgmlLossOps.cs
diff --git a/TensorSharp.GGML/GgmlMemoryPool.cs b/TensorSharp.Backends.GGML/GgmlMemoryPool.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlMemoryPool.cs
rename to TensorSharp.Backends.GGML/GgmlMemoryPool.cs
diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.Backends.GGML/GgmlNative.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlNative.cs
rename to TensorSharp.Backends.GGML/GgmlNative.cs
diff --git a/TensorSharp.GGML/GgmlStorage.cs b/TensorSharp.Backends.GGML/GgmlStorage.cs
similarity index 100%
rename from TensorSharp.GGML/GgmlStorage.cs
rename to TensorSharp.Backends.GGML/GgmlStorage.cs
diff --git a/TensorSharp.GGML/TensorSharp.GGML.csproj b/TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj
similarity index 92%
rename from TensorSharp.GGML/TensorSharp.GGML.csproj
rename to TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj
index 2d93ad3..c272991 100644
--- a/TensorSharp.GGML/TensorSharp.GGML.csproj
+++ b/TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj
@@ -5,6 +5,8 @@
false
false
bin\
+ <Description>GGML backend integration for TensorSharp model execution.</Description>
+ <PackageTags>tensor;backend;ggml;native</PackageTags>
true
@@ -27,12 +29,11 @@
true
-
-
+
diff --git a/TensorSharp.Cli/GlobalUsings.cs b/TensorSharp.Cli/GlobalUsings.cs
new file mode 100644
index 0000000..df7cdd3
--- /dev/null
+++ b/TensorSharp.Cli/GlobalUsings.cs
@@ -0,0 +1,2 @@
+global using TensorSharp.Models;
+global using TensorSharp.Runtime;
diff --git a/InferenceConsole/Program.cs b/TensorSharp.Cli/Program.cs
similarity index 92%
rename from InferenceConsole/Program.cs
rename to TensorSharp.Cli/Program.cs
index 5737f51..35254b7 100644
--- a/InferenceConsole/Program.cs
+++ b/TensorSharp.Cli/Program.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -14,14 +14,15 @@
using System.Linq;
using System.Text;
using System.Text.Json;
-using InferenceEngine;
using TensorSharp;
using TensorSharp.Cpu;
-namespace InferenceConsole
+namespace TensorSharp.Cli
{
class Program
{
+ private static readonly IPromptRenderer PromptRenderer = new GgufPromptRenderer();
+
static void Main(string[] args)
{
Console.OutputEncoding = Encoding.UTF8;
@@ -112,7 +113,7 @@ static void Main(string[] args)
if (modelPath == null || !File.Exists(modelPath))
{
Console.Error.WriteLine($"Model file not found: {modelPath ?? "(none)"}");
- Console.Error.WriteLine("Usage: InferenceConsole --model [--input ] " +
+ Console.Error.WriteLine("Usage: TensorSharp.Cli --model <path> [--input <file>] " +
"[--input-jsonl <file>] [--image <file>] [--output <file>] " +
"[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal|ggml_cuda]");
return;
@@ -129,49 +130,48 @@ static void Main(string[] args)
using var model = ModelBase.Create(modelPath, backend);
- if (mmProjPath != null && model is Gemma3Model gemma3WithVision)
- {
- gemma3WithVision.LoadVisionEncoder(mmProjPath);
- }
- else if (mmProjPath != null && model is Gemma4Model gemma4WithVision)
+ if (mmProjPath != null)
{
- gemma4WithVision.LoadVisionEncoder(mmProjPath);
- if (audioPath != null)
- gemma4WithVision.LoadAudioEncoder(mmProjPath);
- }
- else if (mmProjPath != null && model is Qwen35Model qwen35WithVision)
- {
- qwen35WithVision.LoadVisionEncoder(mmProjPath);
+ model.MultimodalInjector.LoadProjectors(mmProjPath);
}
else if (imagePath != null && model.Config.Architecture == "gemma3")
{
string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "mmproj-gemma3-4b-f16.gguf");
- if (File.Exists(autoMmproj) && model is Gemma3Model g3auto)
+ if (File.Exists(autoMmproj))
{
Console.WriteLine($"Auto-loading vision encoder: {autoMmproj}");
- g3auto.LoadVisionEncoder(autoMmproj);
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
+ }
+ }
+ else if (imagePath != null && model.Config.Architecture == "mistral3")
+ {
+ string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "mistral3-mmproj.gguf");
+ if (File.Exists(autoMmproj))
+ {
+ Console.WriteLine($"Auto-loading Mistral3 vision encoder: {autoMmproj}");
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
}
}
else if ((imagePath != null || audioPath != null || videoPath != null)
&& model.Config.Architecture == "gemma4")
{
string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "gemma-4-mmproj-F16.gguf");
- if (File.Exists(autoMmproj) && model is Gemma4Model g4auto)
+ if (File.Exists(autoMmproj))
{
Console.WriteLine($"Auto-loading multimodal encoder: {autoMmproj}");
- if (imagePath != null || videoPath != null)
- g4auto.LoadVisionEncoder(autoMmproj);
- if (audioPath != null)
- g4auto.LoadAudioEncoder(autoMmproj);
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
}
}
- else if (imagePath != null && model is Qwen35Model q35auto)
+ else if (imagePath != null &&
+ (model.Config.Architecture == "qwen35" ||
+ model.Config.Architecture == "qwen35moe" ||
+ model.Config.Architecture == "qwen3next"))
{
string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "Qwen3.5-mmproj-F16.gguf");
if (File.Exists(autoMmproj))
{
Console.WriteLine($"Auto-loading vision encoder: {autoMmproj}");
- q35auto.LoadVisionEncoder(autoMmproj);
+ model.MultimodalInjector.LoadProjectors(autoMmproj);
}
}
@@ -249,7 +249,7 @@ static void Main(string[] args)
{
new ChatMessage { Role = "user", Content = rawText }
};
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, dumpMessages, addGenerationPrompt: true,
architecture: model.Config.Architecture, tools: tools, enableThinking: enableThinking);
Console.WriteLine("=== Rendered Prompt ===");
@@ -325,7 +325,7 @@ static void RunMultiTurnTest(ModelBase model, string jsonlPath, int maxTokens,
history.Add(new ChatMessage { Role = "user", Content = userMsg });
Console.WriteLine($"\n[Turn {turn + 1}/{lines.Length}] User: {userMsg}");
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, history, addGenerationPrompt: true,
architecture: arch, enableThinking: enableThinking);
@@ -485,7 +485,7 @@ static void RunJsonlBatch(ModelBase model, string inputJsonlPath, string outputF
bool reqThinking = enableThinking ||
(root.TryGetProperty("enable_thinking", out var etProp) && etProp.GetBoolean());
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, messages, addGenerationPrompt: true,
architecture: model.Config.Architecture, enableThinking: reqThinking);
@@ -655,7 +655,7 @@ static string RunInference(ModelBase model, string rawText, List imagePa
new ChatMessage { Role = "user", Content = rawText, ImagePaths = imagePaths, AudioPaths = audioPaths, IsVideo = isVideo }
};
- string rendered = ChatTemplate.RenderFromGgufTemplate(
+ string rendered = PromptRenderer.Render(
model.Config.ChatTemplate, messages, addGenerationPrompt: true,
architecture: model.Config.Architecture,
tools: tools, enableThinking: enableThinking);
@@ -790,6 +790,69 @@ static string RunInference(ModelBase model, string rawText, List imagePa
Console.WriteLine("Note: No vision encoder loaded. Use --mmproj to specify the vision encoder GGUF.");
}
}
+ else if (arch == "mistral3")
+ {
+ if (model is Mistral3Model m3 && m3.VisionEncoder != null)
+ {
+ var proc = new Mistral3ImageProcessor(
+ m3.VisionEncoder.ImageSize,
+ m3.VisionEncoder.PatchSize);
+
+ int imgTokenId = Mistral3ImageProcessor.ImgTokenId;
+ int imgBreakId = Mistral3ImageProcessor.ImgBreakTokenId;
+ int imgEndId = Mistral3ImageProcessor.ImgEndTokenId;
+
+ foreach (var imgP in imagePaths)
+ {
+ var (pixels, imgW, imgH) = proc.ProcessImage(imgP);
+ var visionEmb = m3.VisionEncoder.Encode(pixels, imgW, imgH);
+ int numRows = imgH / m3.VisionEncoder.PatchSize / m3.VisionEncoder.SpatialMergeSize;
+ int numCols = imgW / m3.VisionEncoder.PatchSize / m3.VisionEncoder.SpatialMergeSize;
+
+ int tokenPosition = -1;
+ for (int i = 0; i < inputTokens.Count; i++)
+ {
+ if (inputTokens[i] == imgTokenId)
+ {
+ tokenPosition = i;
+ break;
+ }
+ }
+
+ if (tokenPosition >= 0)
+ {
+ var expanded = new List<int>();
+ for (int i = 0; i < tokenPosition; i++)
+ expanded.Add(inputTokens[i]);
+
+ for (int row = 0; row < numRows; row++)
+ {
+ for (int col = 0; col < numCols; col++)
+ expanded.Add(imgTokenId);
+ expanded.Add(row == numRows - 1 ? imgEndId : imgBreakId);
+ }
+
+ for (int i = tokenPosition + 1; i < inputTokens.Count; i++)
+ expanded.Add(inputTokens[i]);
+
+ m3.SetVisionEmbeddings(visionEmb, tokenPosition);
+ inputTokens = expanded;
+ Console.WriteLine($"Mistral3 vision: {numRows}x{numCols} merged patches, " +
+ $"{numRows * numCols + numRows} total tokens at pos {tokenPosition}");
+ }
+ else
+ {
+ visionEmb.Dispose();
+ Console.WriteLine("Warning: No [IMG] token found in prompt");
+ }
+ }
+ Console.WriteLine($"Total tokens after image expansion: {inputTokens.Count}");
+ }
+ else
+ {
+ Console.WriteLine("Note: No vision encoder loaded. Use --mmproj to specify the vision encoder GGUF.");
+ }
+ }
else
{
int imagePadId = model.Tokenizer.LookupToken("<|image_pad|>");
@@ -1382,3 +1445,6 @@ static string Escape(string s)
}
}
}
+
+
+
diff --git a/InferenceConsole/InferenceConsole.csproj b/TensorSharp.Cli/TensorSharp.Cli.csproj
similarity index 69%
rename from InferenceConsole/InferenceConsole.csproj
rename to TensorSharp.Cli/TensorSharp.Cli.csproj
index 6884313..c876cf9 100644
--- a/InferenceConsole/InferenceConsole.csproj
+++ b/TensorSharp.Cli/TensorSharp.Cli.csproj
@@ -5,6 +5,7 @@
true
false
bin\
+ <Description>Command-line host for TensorSharp model inference and diagnostics.</Description>
$(MSBuildProjectDirectory)/../TensorSharp.GGML.Native/build
@@ -12,7 +13,10 @@
libGgmlOps.so
-
+
+
+
+
diff --git a/InferenceConsole/test_requests.jsonl b/TensorSharp.Cli/test_requests.jsonl
similarity index 100%
rename from InferenceConsole/test_requests.jsonl
rename to TensorSharp.Cli/test_requests.jsonl
diff --git a/InferenceConsole/testdata/batch_thinking.jsonl b/TensorSharp.Cli/testdata/batch_thinking.jsonl
similarity index 100%
rename from InferenceConsole/testdata/batch_thinking.jsonl
rename to TensorSharp.Cli/testdata/batch_thinking.jsonl
diff --git a/InferenceConsole/testdata/example_api_thinking_tools.md b/TensorSharp.Cli/testdata/example_api_thinking_tools.md
similarity index 96%
rename from InferenceConsole/testdata/example_api_thinking_tools.md
rename to TensorSharp.Cli/testdata/example_api_thinking_tools.md
index 057a787..6f42bde 100644
--- a/InferenceConsole/testdata/example_api_thinking_tools.md
+++ b/TensorSharp.Cli/testdata/example_api_thinking_tools.md
@@ -1,4 +1,4 @@
-# Thinking Mode and Tool Call Examples
+# Thinking Mode and Tool Call Examples
## Console Application
@@ -8,11 +8,11 @@ Enable thinking mode with `--think`. The model will show its reasoning process b
```bash
# Basic thinking mode
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_thinking.txt --think --max-tokens 500
# Thinking mode with sampling
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_thinking.txt --think --max-tokens 500 \
--temperature 0.6 --top-p 0.95
```
@@ -23,17 +23,17 @@ Provide tool definitions via `--tools `. The model will output struct
```bash
# Weather tool call
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_tool_call.txt \
--tools testdata/tools_weather.json --max-tokens 300
# Calculator tool call
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_tool_calc.txt \
--tools testdata/tools_calculator.json --max-tokens 300
# Combined: thinking + tools
-./InferenceConsole --model model.gguf --backend ggml_metal \
+./TensorSharp.Cli --model model.gguf --backend ggml_metal \
--input testdata/input_tool_call.txt \
--tools testdata/tools_weather.json --think --max-tokens 500
```
@@ -347,3 +347,4 @@ When `tools` are provided:
1. **Gemma4**: Tool declarations use `<|tool>declaration:NAME{...}` format in the system turn. The model outputs calls as `<|tool_call>call:NAME{key:<|"|>value<|"|>}`.
2. **Qwen3**: Tool definitions are injected as JSON in the system message. The model outputs calls as `{"name":"...","arguments":{...}}`.
3. **Qwen3.5**: Tool definitions use `...` format. The model outputs calls as `\nvalue\n`.
+
diff --git a/InferenceConsole/testdata/input_thinking.txt b/TensorSharp.Cli/testdata/input_thinking.txt
similarity index 100%
rename from InferenceConsole/testdata/input_thinking.txt
rename to TensorSharp.Cli/testdata/input_thinking.txt
diff --git a/InferenceConsole/testdata/input_tool_calc.txt b/TensorSharp.Cli/testdata/input_tool_calc.txt
similarity index 100%
rename from InferenceConsole/testdata/input_tool_calc.txt
rename to TensorSharp.Cli/testdata/input_tool_calc.txt
diff --git a/InferenceConsole/testdata/input_tool_call.txt b/TensorSharp.Cli/testdata/input_tool_call.txt
similarity index 100%
rename from InferenceConsole/testdata/input_tool_call.txt
rename to TensorSharp.Cli/testdata/input_tool_call.txt
diff --git a/InferenceConsole/testdata/tools_calculator.json b/TensorSharp.Cli/testdata/tools_calculator.json
similarity index 100%
rename from InferenceConsole/testdata/tools_calculator.json
rename to TensorSharp.Cli/testdata/tools_calculator.json
diff --git a/InferenceConsole/testdata/tools_weather.json b/TensorSharp.Cli/testdata/tools_weather.json
similarity index 100%
rename from InferenceConsole/testdata/tools_weather.json
rename to TensorSharp.Cli/testdata/tools_weather.json
diff --git a/TensorSharp/Core/DelegateDisposable.cs b/TensorSharp.Core/Core/DelegateDisposable.cs
similarity index 100%
rename from TensorSharp/Core/DelegateDisposable.cs
rename to TensorSharp.Core/Core/DelegateDisposable.cs
diff --git a/TensorSharp/Core/TensorConcatenation.cs b/TensorSharp.Core/Core/TensorConcatenation.cs
similarity index 100%
rename from TensorSharp/Core/TensorConcatenation.cs
rename to TensorSharp.Core/Core/TensorConcatenation.cs
diff --git a/TensorSharp/Core/TensorResultBuilder.cs b/TensorSharp.Core/Core/TensorResultBuilder.cs
similarity index 100%
rename from TensorSharp/Core/TensorResultBuilder.cs
rename to TensorSharp.Core/Core/TensorResultBuilder.cs
diff --git a/TensorSharp/Cpu/CpuAllocator.cs b/TensorSharp.Core/Cpu/CpuAllocator.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuAllocator.cs
rename to TensorSharp.Core/Cpu/CpuAllocator.cs
diff --git a/TensorSharp/Cpu/CpuBasicOps.cs b/TensorSharp.Core/Cpu/CpuBasicOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuBasicOps.cs
rename to TensorSharp.Core/Cpu/CpuBasicOps.cs
diff --git a/TensorSharp/Cpu/CpuFillCopyOps.cs b/TensorSharp.Core/Cpu/CpuFillCopyOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuFillCopyOps.cs
rename to TensorSharp.Core/Cpu/CpuFillCopyOps.cs
diff --git a/TensorSharp/Cpu/CpuIndexingOps.cs b/TensorSharp.Core/Cpu/CpuIndexingOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuIndexingOps.cs
rename to TensorSharp.Core/Cpu/CpuIndexingOps.cs
diff --git a/TensorSharp/Cpu/CpuMaxPoolingOps.cs b/TensorSharp.Core/Cpu/CpuMaxPoolingOps.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuMaxPoolingOps.cs
rename to TensorSharp.Core/Cpu/CpuMaxPoolingOps.cs
diff --git a/TensorSharp/Cpu/CpuNativeHelpers.cs b/TensorSharp.Core/Cpu/CpuNativeHelpers.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuNativeHelpers.cs
rename to TensorSharp.Core/Cpu/CpuNativeHelpers.cs
diff --git a/TensorSharp/Cpu/CpuOpsNative.cs b/TensorSharp.Core/Cpu/CpuOpsNative.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuOpsNative.cs
rename to TensorSharp.Core/Cpu/CpuOpsNative.cs
diff --git a/TensorSharp/Cpu/CpuRandom.cs b/TensorSharp.Core/Cpu/CpuRandom.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuRandom.cs
rename to TensorSharp.Core/Cpu/CpuRandom.cs
diff --git a/TensorSharp/Cpu/CpuStorage.cs b/TensorSharp.Core/Cpu/CpuStorage.cs
similarity index 100%
rename from TensorSharp/Cpu/CpuStorage.cs
rename to TensorSharp.Core/Cpu/CpuStorage.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/DGEMM.cs b/TensorSharp.Core/Cpu/LinearAlgebra/DGEMM.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/DGEMM.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/DGEMM.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/LSAME.cs b/TensorSharp.Core/Cpu/LinearAlgebra/LSAME.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/LSAME.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/LSAME.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/SGEMM.cs b/TensorSharp.Core/Cpu/LinearAlgebra/SGEMM.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/SGEMM.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/SGEMM.cs
diff --git a/TensorSharp/Cpu/LinearAlgebra/XERBLA.cs b/TensorSharp.Core/Cpu/LinearAlgebra/XERBLA.cs
similarity index 100%
rename from TensorSharp/Cpu/LinearAlgebra/XERBLA.cs
rename to TensorSharp.Core/Cpu/LinearAlgebra/XERBLA.cs
diff --git a/TensorSharp/Cpu/MatrixMultiplication.cs b/TensorSharp.Core/Cpu/MatrixMultiplication.cs
similarity index 100%
rename from TensorSharp/Cpu/MatrixMultiplication.cs
rename to TensorSharp.Core/Cpu/MatrixMultiplication.cs
diff --git a/TensorSharp/Cpu/NativeWrapper.cs b/TensorSharp.Core/Cpu/NativeWrapper.cs
similarity index 100%
rename from TensorSharp/Cpu/NativeWrapper.cs
rename to TensorSharp.Core/Cpu/NativeWrapper.cs
diff --git a/TensorSharp/Cpu/OpenBlasNative.cs b/TensorSharp.Core/Cpu/OpenBlasNative.cs
similarity index 100%
rename from TensorSharp/Cpu/OpenBlasNative.cs
rename to TensorSharp.Core/Cpu/OpenBlasNative.cs
diff --git a/TensorSharp/Cpu/SpatialConvolutionMM.cs b/TensorSharp.Core/Cpu/SpatialConvolutionMM.cs
similarity index 100%
rename from TensorSharp/Cpu/SpatialConvolutionMM.cs
rename to TensorSharp.Core/Cpu/SpatialConvolutionMM.cs
diff --git a/TensorSharp/DType.cs b/TensorSharp.Core/DType.cs
similarity index 100%
rename from TensorSharp/DType.cs
rename to TensorSharp.Core/DType.cs
diff --git a/TensorSharp/Expression/SExpression.cs b/TensorSharp.Core/Expression/SExpression.cs
similarity index 100%
rename from TensorSharp/Expression/SExpression.cs
rename to TensorSharp.Core/Expression/SExpression.cs
diff --git a/TensorSharp/Expression/SVar.cs b/TensorSharp.Core/Expression/SVar.cs
similarity index 100%
rename from TensorSharp/Expression/SVar.cs
rename to TensorSharp.Core/Expression/SVar.cs
diff --git a/TensorSharp/Expression/TExpression.cs b/TensorSharp.Core/Expression/TExpression.cs
similarity index 100%
rename from TensorSharp/Expression/TExpression.cs
rename to TensorSharp.Core/Expression/TExpression.cs
diff --git a/TensorSharp/Expression/TVar.cs b/TensorSharp.Core/Expression/TVar.cs
similarity index 100%
rename from TensorSharp/Expression/TVar.cs
rename to TensorSharp.Core/Expression/TVar.cs
diff --git a/TensorSharp/Half.cs b/TensorSharp.Core/Half.cs
similarity index 100%
rename from TensorSharp/Half.cs
rename to TensorSharp.Core/Half.cs
diff --git a/TensorSharp/IAllocator.cs b/TensorSharp.Core/IAllocator.cs
similarity index 100%
rename from TensorSharp/IAllocator.cs
rename to TensorSharp.Core/IAllocator.cs
diff --git a/TensorSharp/IBasicOps.cs b/TensorSharp.Core/IBasicOps.cs
similarity index 100%
rename from TensorSharp/IBasicOps.cs
rename to TensorSharp.Core/IBasicOps.cs
diff --git a/TensorSharp/OpConstraint.cs b/TensorSharp.Core/OpConstraint.cs
similarity index 100%
rename from TensorSharp/OpConstraint.cs
rename to TensorSharp.Core/OpConstraint.cs
diff --git a/TensorSharp/OpRegistry.cs b/TensorSharp.Core/OpRegistry.cs
similarity index 100%
rename from TensorSharp/OpRegistry.cs
rename to TensorSharp.Core/OpRegistry.cs
diff --git a/TensorSharp/OpRegistryAttributes.cs b/TensorSharp.Core/OpRegistryAttributes.cs
similarity index 100%
rename from TensorSharp/OpRegistryAttributes.cs
rename to TensorSharp.Core/OpRegistryAttributes.cs
diff --git a/TensorSharp/Ops.cs b/TensorSharp.Core/Ops.cs
similarity index 100%
rename from TensorSharp/Ops.cs
rename to TensorSharp.Core/Ops.cs
diff --git a/TensorSharp/Properties/AssemblyInfo.cs b/TensorSharp.Core/Properties/AssemblyInfo.cs
similarity index 100%
rename from TensorSharp/Properties/AssemblyInfo.cs
rename to TensorSharp.Core/Properties/AssemblyInfo.cs
diff --git a/TensorSharp/Properties/PublishProfiles/FolderProfile.pubxml b/TensorSharp.Core/Properties/PublishProfiles/FolderProfile.pubxml
similarity index 100%
rename from TensorSharp/Properties/PublishProfiles/FolderProfile.pubxml
rename to TensorSharp.Core/Properties/PublishProfiles/FolderProfile.pubxml
diff --git a/TensorSharp/Properties/launchSettings.json b/TensorSharp.Core/Properties/launchSettings.json
similarity index 100%
rename from TensorSharp/Properties/launchSettings.json
rename to TensorSharp.Core/Properties/launchSettings.json
diff --git a/TensorSharp/RandomGenerator.cs b/TensorSharp.Core/RandomGenerator.cs
similarity index 100%
rename from TensorSharp/RandomGenerator.cs
rename to TensorSharp.Core/RandomGenerator.cs
diff --git a/TensorSharp/RefCounted.cs b/TensorSharp.Core/RefCounted.cs
similarity index 100%
rename from TensorSharp/RefCounted.cs
rename to TensorSharp.Core/RefCounted.cs
diff --git a/TensorSharp/ReflectionExtensions.cs b/TensorSharp.Core/ReflectionExtensions.cs
similarity index 100%
rename from TensorSharp/ReflectionExtensions.cs
rename to TensorSharp.Core/ReflectionExtensions.cs
diff --git a/TensorSharp/Storage.cs b/TensorSharp.Core/Storage.cs
similarity index 100%
rename from TensorSharp/Storage.cs
rename to TensorSharp.Core/Storage.cs
diff --git a/TensorSharp/Tensor.cs b/TensorSharp.Core/Tensor.cs
similarity index 100%
rename from TensorSharp/Tensor.cs
rename to TensorSharp.Core/Tensor.cs
diff --git a/TensorSharp/TensorApplyCPU.cs b/TensorSharp.Core/TensorApplyCPU.cs
similarity index 100%
rename from TensorSharp/TensorApplyCPU.cs
rename to TensorSharp.Core/TensorApplyCPU.cs
diff --git a/TensorSharp/TensorDimIterState.cs b/TensorSharp.Core/TensorDimIterState.cs
similarity index 100%
rename from TensorSharp/TensorDimIterState.cs
rename to TensorSharp.Core/TensorDimIterState.cs
diff --git a/TensorSharp/TensorDimensionHelpers.cs b/TensorSharp.Core/TensorDimensionHelpers.cs
similarity index 100%
rename from TensorSharp/TensorDimensionHelpers.cs
rename to TensorSharp.Core/TensorDimensionHelpers.cs
diff --git a/TensorSharp/TensorFormatting.cs b/TensorSharp.Core/TensorFormatting.cs
similarity index 100%
rename from TensorSharp/TensorFormatting.cs
rename to TensorSharp.Core/TensorFormatting.cs
diff --git a/TensorSharp/TensorIterState.cs b/TensorSharp.Core/TensorIterState.cs
similarity index 100%
rename from TensorSharp/TensorIterState.cs
rename to TensorSharp.Core/TensorIterState.cs
diff --git a/TensorSharp/TensorSerialization.cs b/TensorSharp.Core/TensorSerialization.cs
similarity index 100%
rename from TensorSharp/TensorSerialization.cs
rename to TensorSharp.Core/TensorSerialization.cs
diff --git a/TensorSharp/TensorSharp.csproj b/TensorSharp.Core/TensorSharp.Core.csproj
similarity index 93%
rename from TensorSharp/TensorSharp.csproj
rename to TensorSharp.Core/TensorSharp.Core.csproj
index 126e06e..e2bb2a3 100644
--- a/TensorSharp/TensorSharp.csproj
+++ b/TensorSharp.Core/TensorSharp.Core.csproj
@@ -5,6 +5,8 @@
false
false
bin\
+ TensorSharp core tensor primitives, ops, memory management, and device abstractions.
+ tensor;core;ops;memory;device
true
@@ -54,4 +56,4 @@
-
\ No newline at end of file
+
diff --git a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj
index 36d8be5..ed8a773 100644
--- a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj
+++ b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj
@@ -7,6 +7,7 @@
enable
-
+
+
diff --git a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs
index 1962d59..9cfea69 100644
--- a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs
+++ b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs
@@ -1,6 +1,7 @@
using System;
using System.IO;
-using InferenceEngine;
+using TensorSharp.Models;
+using TensorSharp.Runtime;
static BackendType ParseBackend(string backend) => backend.ToLowerInvariant() switch
{
diff --git a/TensorSharp.Models/BackendExecutionPlan.cs b/TensorSharp.Models/BackendExecutionPlan.cs
new file mode 100644
index 0000000..4fdb90d
--- /dev/null
+++ b/TensorSharp.Models/BackendExecutionPlan.cs
@@ -0,0 +1,32 @@
+// Copyright (c) Zhongkai Fu. All rights reserved.
+// https://github.com/zhongkaifu/TensorSharp
+//
+// This file is part of TensorSharp.
+//
+// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree.
+//
+// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
+namespace TensorSharp.Models
+{
+ internal sealed class BackendExecutionPlan : IBackendExecutionPlan
+ {
+ public BackendExecutionPlan(BackendType backendType)
+ {
+ BackendType = backendType;
+ }
+
+ public BackendType BackendType { get; }
+
+ public bool UsesGgmlBackend =>
+ BackendType == BackendType.GgmlCpu ||
+ BackendType == BackendType.GgmlMetal ||
+ BackendType == BackendType.GgmlCuda;
+
+ public bool ShouldStoreWeightQuantized(GgufTensorInfo info)
+ {
+ return ModelBase.ShouldStoreWeightQuantized(BackendType, info);
+ }
+ }
+}
+
diff --git a/TensorSharp.Models/GlobalUsings.cs b/TensorSharp.Models/GlobalUsings.cs
new file mode 100644
index 0000000..e0a9f20
--- /dev/null
+++ b/TensorSharp.Models/GlobalUsings.cs
@@ -0,0 +1 @@
+global using TensorSharp.Runtime;
diff --git a/InferenceEngine/Half.cs b/TensorSharp.Models/Half.cs
similarity index 96%
rename from InferenceEngine/Half.cs
rename to TensorSharp.Models/Half.cs
index 395e4b7..2ac2393 100644
--- a/InferenceEngine/Half.cs
+++ b/TensorSharp.Models/Half.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -10,7 +10,7 @@
using System;
using System.Runtime.InteropServices;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
[StructLayout(LayoutKind.Sequential)]
public struct half
@@ -71,3 +71,4 @@ private static float HalfToFloat(ushort value)
}
}
}
+
diff --git a/InferenceEngine/ManagedQuantizedOps.cs b/TensorSharp.Models/ManagedQuantizedOps.cs
similarity index 99%
rename from InferenceEngine/ManagedQuantizedOps.cs
rename to TensorSharp.Models/ManagedQuantizedOps.cs
index 751044f..70ca137 100644
--- a/InferenceEngine/ManagedQuantizedOps.cs
+++ b/TensorSharp.Models/ManagedQuantizedOps.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -12,7 +12,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
internal static class ManagedQuantizedOps
{
@@ -696,3 +696,4 @@ private static unsafe void GetScaleMinK4(int j, byte* q, out byte d, out byte m)
}
}
}
+
diff --git a/InferenceEngine/MediaHelper.cs b/TensorSharp.Models/MediaHelper.cs
similarity index 98%
rename from InferenceEngine/MediaHelper.cs
rename to TensorSharp.Models/MediaHelper.cs
index 34e899a..3b3b6e0 100644
--- a/InferenceEngine/MediaHelper.cs
+++ b/TensorSharp.Models/MediaHelper.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -14,7 +14,7 @@
using System.Runtime.InteropServices;
using OpenCvSharp;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
public static class MediaHelper
{
@@ -223,3 +223,4 @@ private static uint Crc32Png(byte[] type, byte[] data)
}
}
}
+
diff --git a/InferenceEngine/ModelBase.cs b/TensorSharp.Models/ModelBase.cs
similarity index 96%
rename from InferenceEngine/ModelBase.cs
rename to TensorSharp.Models/ModelBase.cs
index cb9ba9a..c0e62ff 100644
--- a/InferenceEngine/ModelBase.cs
+++ b/TensorSharp.Models/ModelBase.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Zhongkai Fu. All rights reserved.
+// Copyright (c) Zhongkai Fu. All rights reserved.
// https://github.com/zhongkaifu/TensorSharp
//
// This file is part of TensorSharp.
@@ -19,40 +19,8 @@
using TensorSharp.Cpu;
using TensorSharp.GGML;
-namespace InferenceEngine
+namespace TensorSharp.Models
{
- public enum BackendType
- {
- Cpu,
- GgmlCpu,
- GgmlMetal,
- GgmlCuda,
- }
-
- public class ModelConfig
- {
- public string Architecture { get; set; }
- public int HiddenSize { get; set; }
- public int NumHeads { get; set; }
- public int NumKVHeads { get; set; }
- public int KeyLength { get; set; }
- public int ValueLength { get; set; }
- public float Eps { get; set; }
- public float RopeBase { get; set; }
- public float RopeScale { get; set; } = 1f;
- public int NumLayers { get; set; }
- public int VocabSize { get; set; }
- public int IntermediateSize { get; set; }
- public string ChatTemplate { get; set; }
-
- public int NumExperts { get; set; }
- public int NumExpertsUsed { get; set; }
- public int SlidingWindow { get; set; }
- public int OriginalContextLength { get; set; }
-
- public int HeadDim => KeyLength > 0 ? KeyLength : (ValueLength > 0 ? ValueLength : HiddenSize / NumHeads);
- }
-
public class QuantizedWeight : IDisposable
{
public IntPtr Data { get; }
@@ -109,10 +77,13 @@ public static unsafe void FreeBuffer(IntPtr ptr)
}
}
- public abstract class ModelBase : IDisposable
+ public abstract class ModelBase : IModelArchitecture
{
public ModelConfig Config { get; protected set; }
public ITokenizer Tokenizer { get; protected set; }
+ public IKVCachePolicy KVCachePolicy { get; } = DefaultKvCachePolicy.Shared;
+ public IMultimodalInjector MultimodalInjector { get; }
+ public IBackendExecutionPlan ExecutionPlan { get; }
protected readonly GgufFile _gguf;
private readonly GgmlContext _ggmlContext;
@@ -124,8 +95,12 @@ public abstract class ModelBase : IDisposable
private bool _quantBackendReady;
protected int _cacheSeqLen;
+ protected int _maxContextLength;
protected float[] _logitsBuffer;
+ public int MaxContextLength => _maxContextLength;
+ public int CacheSeqLen => _cacheSeqLen;
+
// Timing
protected long _linearTicks;
protected long _attnTicks;
@@ -137,6 +112,8 @@ public abstract class ModelBase : IDisposable
protected ModelBase(string ggufPath, BackendType backend)
{
_backend = backend;
+ ExecutionPlan = new BackendExecutionPlan(backend);
+ MultimodalInjector = new ModelMultimodalInjector(this);
switch (backend)
{
case BackendType.GgmlCpu:
@@ -162,9 +139,7 @@ protected ModelBase(string ggufPath, BackendType backend)
_gguf = new GgufFile(ggufPath);
}
- protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu ||
- _backend == BackendType.GgmlMetal ||
- _backend == BackendType.GgmlCuda;
+ protected bool IsGgmlBackend => ExecutionPlan.UsesGgmlBackend;
protected void EnsureQuantBackendAvailable()
{
@@ -240,7 +215,7 @@ protected void ParseTokenizer()
protected virtual bool IsQuantizedLinearWeight(GgufTensorInfo info)
{
- return ShouldStoreWeightQuantized(_backend, info);
+ return ExecutionPlan.ShouldStoreWeightQuantized(info);
}
internal static bool ShouldStoreWeightQuantized(BackendType backend, GgufTensorInfo info)
@@ -970,8 +945,10 @@ public static ModelBase Create(string ggufPath, BackendType backend)
"gemma4" => new Gemma4Model(ggufPath, backend),
"gptoss" or "gpt-oss" => new GptOssModel(ggufPath, backend),
"nemotron_h" or "nemotron_h_moe" => new NemotronModel(ggufPath, backend),
+ "mistral3" => new Mistral3Model(ggufPath, backend),
_ => throw new NotSupportedException($"Unsupported architecture: {arch}"),
};
}
}
}
+
diff --git a/TensorSharp.Models/ModelMultimodalInjector.cs b/TensorSharp.Models/ModelMultimodalInjector.cs
new file mode 100644
index 0000000..6f12f46
--- /dev/null
+++ b/TensorSharp.Models/ModelMultimodalInjector.cs
@@ -0,0 +1,345 @@
+// Copyright (c) Zhongkai Fu. All rights reserved.
+// https://github.com/zhongkaifu/TensorSharp
+//
+// This file is part of TensorSharp.
+//
+// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree.
+//
+// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details.
+using System;
+using System.Collections.Generic;
+using TensorSharp;
+
+namespace TensorSharp.Models
+{
+ internal sealed class ModelMultimodalInjector : IMultimodalInjector
+ {
+ private readonly ModelBase _model;
+
+ public ModelMultimodalInjector(ModelBase model)
+ {
+ _model = model;
+ }
+
+ public void LoadProjectors(string mmProjPath)
+ {
+ if (string.IsNullOrWhiteSpace(mmProjPath))
+ return;
+
+ switch (_model)
+ {
+ case Gemma4Model g4:
+ g4.LoadVisionEncoder(mmProjPath);
+ g4.LoadAudioEncoder(mmProjPath);
+ break;
+ case Gemma3Model g3:
+ g3.LoadVisionEncoder(mmProjPath);
+ break;
+ case Qwen35Model q35:
+ q35.LoadVisionEncoder(mmProjPath);
+ break;
+ case Mistral3Model m3:
+ m3.LoadVisionEncoder(mmProjPath);
+ break;
+ }
+ }
+
+ public List ProcessPromptTokens(List history, List inputTokens)
+ {
+ if (history == null || history.Count == 0 || inputTokens == null || inputTokens.Count == 0)
+ return inputTokens;
+
+ if (_model is Gemma4Model g4)
+ return ProcessGemma4History(g4, history, inputTokens);
+ if (_model is Gemma3Model g3)
+ return ProcessGemma3History(g3, history, inputTokens);
+ if (_model is Qwen35Model q35)
+ return ProcessQwen35History(q35, history, inputTokens);
+ if (_model is Mistral3Model m3)
+ return ProcessMistral3History(m3, history, inputTokens);
+
+ return inputTokens;
+ }
+
+ private List ProcessGemma4History(Gemma4Model model, List history, List inputTokens)
+ {
+ int imageStartId = _model.Tokenizer.LookupToken("<|image>");
+ int imageEndId = _model.Tokenizer.LookupToken("");
+ if (imageStartId < 0) imageStartId = 255999;
+ if (imageEndId < 0) imageEndId = 256000;
+
+ int audioStartId = _model.Tokenizer.LookupToken("<|audio>");
+ int audioEndId = _model.Tokenizer.LookupToken("