diff --git a/InferenceWeb.Tests/BackendCatalogTests.cs b/InferenceWeb.Tests/BackendCatalogTests.cs index 401c8d0..d646838 100644 --- a/InferenceWeb.Tests/BackendCatalogTests.cs +++ b/InferenceWeb.Tests/BackendCatalogTests.cs @@ -1,6 +1,4 @@ -using InferenceEngine; -using InferenceWeb; -using TensorSharp.GGML; +using TensorSharp.GGML; namespace InferenceWeb.Tests; @@ -121,3 +119,5 @@ public void ShouldStoreWeightQuantized_GgmlBackendsKeepQuantizedWeights() Assert.True(shouldStoreQuantized); } } + + diff --git a/InferenceWeb.Tests/GlobalUsings.cs b/InferenceWeb.Tests/GlobalUsings.cs new file mode 100644 index 0000000..1f7dd10 --- /dev/null +++ b/InferenceWeb.Tests/GlobalUsings.cs @@ -0,0 +1,3 @@ +global using TensorSharp.Models; +global using TensorSharp.Runtime; +global using TensorSharp.Server; diff --git a/InferenceWeb.Tests/ImageProcessorTests.cs b/InferenceWeb.Tests/ImageProcessorTests.cs index 92dd8ca..d82af18 100644 --- a/InferenceWeb.Tests/ImageProcessorTests.cs +++ b/InferenceWeb.Tests/ImageProcessorTests.cs @@ -1,5 +1,4 @@ -using InferenceEngine; - + namespace InferenceWeb.Tests; public class ImageProcessorTests @@ -95,3 +94,4 @@ private static string WriteEmbeddedJpeg() return path; } } + diff --git a/InferenceWeb.Tests/InferenceWeb.Tests.csproj b/InferenceWeb.Tests/InferenceWeb.Tests.csproj index ca583c8..90960bd 100644 --- a/InferenceWeb.Tests/InferenceWeb.Tests.csproj +++ b/InferenceWeb.Tests/InferenceWeb.Tests.csproj @@ -15,7 +15,8 @@ - - + + + diff --git a/InferenceWeb.Tests/KVCacheTests.cs b/InferenceWeb.Tests/KVCacheTests.cs index c33c2cd..7bb4eea 100644 --- a/InferenceWeb.Tests/KVCacheTests.cs +++ b/InferenceWeb.Tests/KVCacheTests.cs @@ -1,5 +1,4 @@ -using InferenceWeb; - + namespace InferenceWeb.Tests; public class KVCacheTests @@ -129,3 +128,4 @@ public void FindTokenPrefixLength_ThinkingModelWithContentInContext() Assert.Equal(8, common); // Full cached is prefix } } + diff --git a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs 
b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs index ffc8a6a..64e34ab 100644 --- a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs +++ b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs @@ -1,5 +1,4 @@ -using System.Buffers.Binary; -using InferenceEngine; +using System.Buffers.Binary; namespace InferenceWeb.Tests; @@ -168,3 +167,4 @@ private static float Dot(float[] lhs, float[] rhs, int rhsOffset, int length) return sum; } } + diff --git a/InferenceWeb.Tests/MediaHelperTests.cs b/InferenceWeb.Tests/MediaHelperTests.cs index 4be19b3..c2263a0 100644 --- a/InferenceWeb.Tests/MediaHelperTests.cs +++ b/InferenceWeb.Tests/MediaHelperTests.cs @@ -1,5 +1,4 @@ -using InferenceEngine; - + namespace InferenceWeb.Tests; public class MediaHelperTests @@ -71,3 +70,4 @@ public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride() } } } + diff --git a/InferenceWeb.Tests/ModelServiceHistoryTests.cs b/InferenceWeb.Tests/ModelServiceHistoryTests.cs index 4a01b1e..09d4bb8 100644 --- a/InferenceWeb.Tests/ModelServiceHistoryTests.cs +++ b/InferenceWeb.Tests/ModelServiceHistoryTests.cs @@ -1,6 +1,4 @@ -using InferenceEngine; -using InferenceWeb; - + namespace InferenceWeb.Tests; public class ModelServiceHistoryTests @@ -70,3 +68,5 @@ public void PrepareHistoryForInference_NormalizesEarlierVideoTurns() } } } + + diff --git a/InferenceWeb.Tests/StructuredOutputTests.cs b/InferenceWeb.Tests/StructuredOutputTests.cs index 658b4ec..3af4bce 100644 --- a/InferenceWeb.Tests/StructuredOutputTests.cs +++ b/InferenceWeb.Tests/StructuredOutputTests.cs @@ -1,6 +1,4 @@ -using System.Text.Json; -using InferenceEngine; -using InferenceWeb; +using System.Text.Json; namespace InferenceWeb.Tests; @@ -206,3 +204,5 @@ public void JsonSchemaNormalizationSupportsDefsAndAnyOf() Assert.Equal("""{"item":{"name":"Ada","age":30}}""", normalized.NormalizedContent); } } + + diff --git a/InferenceWeb.Tests/WebUiChatPolicyTests.cs b/InferenceWeb.Tests/WebUiChatPolicyTests.cs index eaebe36..0ba2a5c 100644 
--- a/InferenceWeb.Tests/WebUiChatPolicyTests.cs +++ b/InferenceWeb.Tests/WebUiChatPolicyTests.cs @@ -1,5 +1,4 @@ -using InferenceWeb; - + namespace InferenceWeb.Tests; public class WebUiChatPolicyTests @@ -31,3 +30,4 @@ public void TryValidateChatRequest_RejectsPerTurnBackendSelection() Assert.Equal(WebUiChatPolicy.ModelSelectionLockedMessage, error); } } + diff --git a/README.md b/README.md index 8e05731..742ebf8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# TensorSharp +# TensorSharp

TensorSharp logo @@ -10,8 +10,8 @@ A C# inference engine for running large language models (LLMs) locally using GGU ## Features -- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H -- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5 +- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H, Mistral 3 +- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5 / Mistral 3 - **Thinking / reasoning mode** -- structured chain-of-thought output with `` / `<|channel>thought` / `<|channel>analysis` tags (Qwen 3, Qwen 3.5, Gemma 4, GPT OSS, Nemotron-H) - **Tool calling / function calling** -- models can invoke user-defined tools; multi-turn tool-call conversations supported across all three API styles - **Quantized model support** -- loads GGUF files with Q4_K_M, Q8_0, F16, MXFP4, and other quantization formats; performs native quantized matmul without dequantizing to FP32, including memory-efficient pure C# CPU loading for large GGUFs @@ -38,6 +38,7 @@ A C# inference engine for running large language models (LLMs) locally using GGU | Qwen 3.5 | Qwen3.5-9B, Qwen3.5-35B-A3B | Image | Yes | Yes | | GPT OSS | gpt-oss-20b (MoE) | Text only | Yes | No | | Nemotron-H | Nemotron-H-8B, Nemotron-H-47B (Hybrid SSM-Transformer, MoE) | Text only | Yes | Yes | +| Mistral 3 | Mistral-Small-3.1-24B-Instruct | Image | No | No | See [Model Architecture Cards](docs/model_cards.md) for detailed documentation of each architecture. @@ -58,6 +59,8 @@ TensorSharp loads models in GGUF format. 
Below are Hugging Face links where you | GPT OSS | gpt-oss-20b (MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) | | Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) | | Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) | +| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) | +| Mistral 3 | mistral3-mmproj (Pixtral vision projector) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) | ## Compute Backends @@ -72,36 +75,37 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you ``` TensorSharp/ -├── TensorSharp/ # Core tensor library (CPU operations, SIMD) -├── TensorSharp.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library) +├── TensorSharp.Core/ # Core tensor library (Tensor, Ops, memory, device abstraction) +├── TensorSharp.Runtime/ # GGUF, tokenizers, templates, sampling, protocol parsing +├── TensorSharp.Models/ # Model architectures and multimodal encoders/injectors +├── TensorSharp.Backends.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library) ├── TensorSharp.GGML.Native/ # Native C++ bridge to ggml (builds libGgmlOps) -├── AdvUtils/ # Utility library -├── InferenceEngine/ # Model loading, tokenization, and inference logic -│ ├── Models/ -│ │ ├── Gemma3/ -│ │ ├── Gemma4/ # Vision encoder, audio encoder, MoE, fused GPU decode -│ │ ├── GptOss/ # MoE, attention sinks, SiLUAlphaLimit, Yarn RoPE -│ │ ├── Nemotron/ # Hybrid Mamba2 SSM + attention + MoE FFN -│ │ ├── Qwen3/ -│ │ └── Qwen35/ -│ ├── GgufReader.cs # GGUF file parser -│ ├── 
ModelBase.cs # Base class for all model architectures -│ ├── ChatTemplate.cs # Chat template rendering (hardcoded + Jinja2 from GGUF) -│ ├── Jinja2Template.cs # Jinja2 template renderer -│ ├── OutputParser.cs # Extracts thinking, content, and tool calls from model output -│ ├── SamplingConfig.cs # Sampling parameter configuration -│ ├── TokenSampler.cs # Token sampling (greedy, top-k, top-p, min-p, penalties) -│ └── MediaHelper.cs # Video frame extraction, audio decoding -├── InferenceConsole/ # CLI application -├── InferenceWeb/ # Web chatbot + API server (ASP.NET Core) +├── TensorSharp.Server/ # Web chatbot + API server (ASP.NET Core) │ ├── ModelService.cs # Model lifecycle management │ ├── InferenceQueue.cs # FIFO request queue with position tracking │ ├── wwwroot/index.html # Chat UI │ ├── testdata/ # Integration test suites (bash + Python) │ └── API_EXAMPLES.md # Detailed API documentation +├── TensorSharp.Cli/ # CLI application +├── AdvUtils/ # Utility library └── ExternalProjects/ # Third-party dependencies (ggml) ``` +## NuGet Packages + +The repository is now split along package boundaries so consumers can depend on only the layers they actually need. 
+ +| Project | NuGet package | Public namespace | Responsibility | +|---|---|---|---| +| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor primitives, ops, allocators, storage, and device abstraction | +| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF parsing, tokenizers, prompt rendering, sampling, and output protocol parsing | +| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`, architecture implementations, multimodal encoders, and model-side execution helpers | +| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML-backed execution and native interop | +| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core server, OpenAI/Ollama adapters, queueing, and web UI | +| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | Console host and debugging / batch tooling | + +This split keeps engine users off the web stack, keeps API-layer changes from leaking into core/runtime packages, and makes future benchmark or eval-harness projects easier to publish independently. 
+ ## Prerequisites - [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0) @@ -121,10 +125,10 @@ dotnet build TensorSharp.slnx ```bash # Console application -dotnet build InferenceConsole/InferenceConsole.csproj +dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj # Web application -dotnet build InferenceWeb/InferenceWeb.csproj +dotnet build TensorSharp.Server/TensorSharp.Server.csproj ``` ### Build the native GGML library @@ -166,7 +170,7 @@ TENSORSHARP_GGML_NATIVE_BUILD_PARALLEL_LEVEL=2 bash build-linux.sh --cuda You can also request a CUDA-enabled native build from `dotnet build`: ```bash -TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release +TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release ``` On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory. @@ -176,38 +180,38 @@ On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. 
On Linux, `bui ### Console Application ```bash -cd InferenceConsole/bin +cd TensorSharp.Cli/bin # Text inference -./InferenceConsole --model --input prompt.txt --output result.txt \ +./TensorSharp.Cli --model --input prompt.txt --output result.txt \ --max-tokens 200 --backend ggml_metal # Text inference on Linux + NVIDIA GPU -./InferenceConsole --model --input prompt.txt --output result.txt \ +./TensorSharp.Cli --model --input prompt.txt --output result.txt \ --max-tokens 200 --backend ggml_cuda # Image inference (Gemma 3/4, Qwen 3.5) -./InferenceConsole --model --image photo.png --backend ggml_metal +./TensorSharp.Cli --model --image photo.png --backend ggml_metal # Video inference (Gemma 4) -./InferenceConsole --model --video clip.mp4 --backend ggml_metal +./TensorSharp.Cli --model --video clip.mp4 --backend ggml_metal # Audio inference (Gemma 4) -./InferenceConsole --model --audio speech.wav --backend ggml_metal +./TensorSharp.Cli --model --audio speech.wav --backend ggml_metal # Thinking / reasoning mode -./InferenceConsole --model --input prompt.txt --backend ggml_metal --think +./TensorSharp.Cli --model --input prompt.txt --backend ggml_metal --think # Tool calling -./InferenceConsole --model --input prompt.txt --backend ggml_metal \ +./TensorSharp.Cli --model --input prompt.txt --backend ggml_metal \ --tools tools.json # With sampling parameters -./InferenceConsole --model --input prompt.txt --backend ggml_metal \ +./TensorSharp.Cli --model --input prompt.txt --backend ggml_metal \ --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42 # Batch processing (JSONL) -./InferenceConsole --model --input-jsonl requests.jsonl \ +./TensorSharp.Cli --model --input-jsonl requests.jsonl \ --output results.txt --backend ggml_metal ``` @@ -253,13 +257,13 @@ Each line is a JSON object with `messages`, optional `prompt`, and optional samp ### Web Application ```bash -cd InferenceWeb/bin +cd TensorSharp.Server/bin # Set environment variables and run 
-MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb +MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server # Linux + NVIDIA GPU -MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb +MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server ``` Open `http://localhost:5000` in your browser. The web interface supports: @@ -284,7 +288,7 @@ Open `http://localhost:5000` in your browser. The web interface supports: ### HTTP APIs -InferenceWeb exposes three API styles. See [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md) for full documentation with curl and Python examples. +TensorSharp.Server exposes three API styles. See [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md) for full documentation with curl and Python examples. **Ollama-compatible API:** @@ -403,17 +407,27 @@ Gemma 4 models support image, video, and audio inputs. Place the multimodal proj These models support image inputs with their respective multimodal projector files. +### Mistral 3 + +Mistral 3 supports image inputs via the Pixtral vision encoder. Place the multimodal projector (`mistral3-mmproj.gguf`) in the same directory as the model file for automatic loading. + +- **Images:** PNG, JPEG + ## Architecture TensorSharp is structured as a layered system: -1. **TensorSharp** provides the core `Tensor` type, storage abstraction, and an extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration. +1. **TensorSharp.Core** provides the core `Tensor` type, storage abstraction, and the extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration. -2. **TensorSharp.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) 
without dequantizing to FP32. +2. **TensorSharp.Runtime** owns runtime-facing contracts and services: GGUF parsing, tokenization (SentencePiece / BPE), chat template rendering, configurable token sampling, output parsing, and reusable contracts such as `IModelArchitecture`, `IPromptRenderer`, `IOutputProtocolParser`, `IMultimodalInjector`, `IKVCachePolicy`, and `IBackendExecutionPlan`. -3. **InferenceEngine** implements model-specific logic: GGUF parsing, tokenization (SentencePiece BPE), chat template rendering (Jinja2 from GGUF metadata with hardcoded fallbacks), configurable token sampling, output parsing (thinking extraction, tool-call extraction), and the forward pass for each architecture (including hybrid SSM-Transformer models like Nemotron-H with Mamba2 layers). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata. +3. **TensorSharp.Models** implements `ModelBase` plus the concrete architectures and multimodal helpers (Gemma 3/4, Qwen 3/3.5, GPT OSS, Nemotron-H, Mistral 3). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata. -4. **InferenceConsole** and **InferenceWeb** are application layers that handle I/O and user interaction. InferenceWeb provides Ollama-compatible and OpenAI-compatible REST APIs alongside a browser-based chat UI, with a FIFO inference queue to serialize concurrent requests. +4. **TensorSharp.Backends.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32. + +5. **TensorSharp.Server** is the HTTP/application layer. It provides Ollama-compatible and OpenAI-compatible REST APIs, the browser-based chat UI, upload handling, and the FIFO inference queue. 
+ +6. **TensorSharp.Cli** is the console/application layer for local prompts, multimodal experiments, prompt inspection, and JSONL batch workflows. ### Performance Optimizations @@ -426,16 +440,16 @@ TensorSharp is structured as a layered system: ## Testing -Integration tests for InferenceWeb are in `InferenceWeb/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support. +Integration tests for TensorSharp.Server are in `TensorSharp.Server/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support. ```bash -# Start InferenceWeb, then run: -python3 InferenceWeb/testdata/test_multiturn.py +# Start TensorSharp.Server, then run: +python3 TensorSharp.Server/testdata/test_multiturn.py # or -bash InferenceWeb/testdata/test_multiturn.sh +bash TensorSharp.Server/testdata/test_multiturn.sh ``` -See [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md) for the full test matrix. +See [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md) for the full test matrix. ## Author @@ -444,3 +458,4 @@ Zhongkai Fu ## License See [LICENSE](LICENSE) for details. + diff --git a/README_zh-cn.md b/README_zh-cn.md index fdbf14e..9fb6ce8 100644 --- a/README_zh-cn.md +++ b/README_zh-cn.md @@ -1,411 +1,426 @@ -# TensorSharp - -

- TensorSharp logo -

- -[English](README.md) | [中文](README_zh-cn.md) - -一个用于在本地运行大型语言模型(LLM)的 C# 推理引擎,使用 GGUF 模型文件。TensorSharp 提供控制台应用、基于 Web 的聊天界面,以及兼容 Ollama/OpenAI 的 HTTP API 以便程序化调用。 - -## 功能特性 - -- **多架构支持** —— Gemma 4、Gemma 3、Qwen 3、Qwen 3.5、GPT OSS、Nemotron-H -- **多模态推理** —— 图像、视频和音频输入(Gemma 4);图像输入(Gemma 3 / Qwen 3.5) -- **思维链 / 推理模式** —— 通过 `` / `<|channel>thought` / `<|channel>analysis` 标签输出结构化的思维链推理(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H) -- **工具调用 / 函数调用** —— 模型可调用用户定义的工具;所有三种 API 风格均支持多轮工具调用对话 -- **量化模型支持** —— 加载 Q4_K_M、Q8_0、F16、MXFP4 等量化格式的 GGUF 文件;执行原生量化矩阵乘法(matmul),无需反量化到 FP32,并且纯 C# CPU 后端在加载大型 GGUF 时也会保持量化权重压缩状态 -- **GPU 加速** —— 通过 GGML 支持 Apple Metal(macOS)和 GGML CUDA(Linux/NVIDIA);Gemma 4 在 Metal 上支持整模型融合 GPU decode(相对逐算子调度约提升 2.6 倍) -- **优化后的纯 C# CPU 后端** —— 为 GEMM、RMSNorm、RoPE、softmax、融合激活等推理热点路径提供托管快速路径和 SIMD 内核 -- **兼容 Ollama 与 OpenAI API** —— 可作为现有工具链的即插即用替代端点 -- **可配置采样** —— temperature、top-k、top-p、min-p、重复/存在/频率惩罚、seed、停止序列 -- **聊天模板** —— 从 GGUF 元数据自动加载(Jinja2),并为不同架构提供硬编码回退模板 -- **请求队列** —— FIFO 推理队列确保单请求执行以保障 KV 缓存稳定性,并为客户端提供实时排队位置反馈 -- **批处理** —— 控制台应用支持 JSONL 输入 -- **流式输出** —— 按 token 输出(Web 通过 SSE,控制台通过 stdout) -- **混合 SSM-Transformer** —— Nemotron-H 在单个模型中混合 Mamba2 SSM 层、纯注意力层和 MoE FFN 层 -- **专家混合(MoE)** —— 支持 Gemma 4 MoE 变体(例如 gemma-4-26B-A4B)、GPT OSS MoE(例如 gpt-oss-20b)、Nemotron-H MoE FFN 层 -- **消息编辑** —— 在 Web 聊天界面中编辑或删除历史消息,并从该位置重新生成回复 -- **大文件上传** —— Web 界面支持最大 500 MB 的视频/音频上传 - -## 支持的模型架构 - -| 架构 | 示例模型 | 多模态 | 思维链 | 工具调用 | -|---|---|---|---|---| -| Gemma 4 | gemma-4-E4B、gemma-4-31B、gemma-4-26B-A4B(MoE) | 图像、视频、音频 | 支持 | 支持 | -| Gemma 3 | gemma-3-4b | 图像 | 不支持 | 不支持 | -| Qwen 3 | Qwen3-4B | 仅文本 | 支持 | 支持 | -| Qwen 3.5 | Qwen3.5-9B、Qwen3.5-35B-A3B | 图像 | 支持 | 支持 | -| GPT OSS | gpt-oss-20b(MoE) | 仅文本 | 支持 | 不支持 | -| Nemotron-H | Nemotron-H-8B、Nemotron-H-47B(混合 SSM-Transformer,MoE) | 仅文本 | 支持 | 支持 | - -各架构的详细文档见[模型架构卡片](docs/model_cards_cn.md)。 - -## 模型下载(GGUF) - -TensorSharp 使用 GGUF 格式模型文件。以下是各架构对应的 Hugging Face 下载链接。请根据硬件条件选择合适的量化版本(Q4_K_M 
适合低内存,Q8_0 适合更高质量等)。 - -| 架构 | 模型 | GGUF 下载 | -|---|---|---| -| Gemma 4 | gemma-4-E4B-it | [ggml-org/gemma-4-E4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF) | -| Gemma 4 | gemma-4-31B-it | [ggml-org/gemma-4-31B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-31B-it-GGUF) | -| Gemma 4 | gemma-4-26B-A4B-it(MoE) | [ggml-org/gemma-4-26B-A4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-26B-A4B-it-GGUF) | -| Gemma 4 | gemma-4-mmproj(多模态投影器) | 包含在上述 GGUF 仓库中 | -| Gemma 3 | gemma-3-4b-it | [google/gemma-3-4b-it-qat-q4_0-gguf](https://huggingface.co/google/gemma-3-4b-it-qat-q4_0-gguf) | -| Qwen 3 | Qwen3-4B | [Qwen/Qwen3-4B-GGUF](https://huggingface.co/Qwen/Qwen3-4B-GGUF) | -| Qwen 3.5 | Qwen3.5-9B | [unsloth/Qwen3.5-9B-GGUF](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) | -| Qwen 3.5 | Qwen3.5-35B-A3B | [ggml-org/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/ggml-org/Qwen3.5-35B-A3B-GGUF) | -| GPT OSS | gpt-oss-20b(MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) | -| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) | -| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) | - -## 计算后端 - -| 后端 | 参数 | 说明 | -|---|---|---| -| GGML Metal | `--backend ggml_metal` | 通过 Apple Metal(macOS)进行 GPU 加速。推荐用于 Apple Silicon。 | -| GGML CUDA | `--backend ggml_cuda` | 通过 GGML CUDA 在 Linux + NVIDIA GPU 上进行加速。 | -| GGML CPU | `--backend ggml_cpu` | 使用原生 GGML 与优化内核进行 CPU 推理。 | -| 纯 C# CPU | `--backend cpu` | 无原生依赖的可移植 CPU 推理。 | - -## 项目结构 - -```text -TensorSharp/ -├── TensorSharp/ # 核心张量库(CPU 运算、SIMD) -├── TensorSharp.GGML/ # GGML 后端绑定(通过原生库支持 Metal/CUDA/CPU) -├── TensorSharp.GGML.Native/ # 到 ggml 的原生 C++ 桥接(构建 libGgmlOps) -├── AdvUtils/ # 工具库 -├── InferenceEngine/ # 模型加载、分词和推理逻辑 -│ ├── Models/ 
-│ │ ├── Gemma3/ -│ │ ├── Gemma4/ # 视觉编码器、音频编码器、MoE、融合 GPU decode -│ │ ├── GptOss/ # MoE、注意力沉降、SiLUAlphaLimit、Yarn RoPE -│ │ ├── Nemotron/ # 混合 Mamba2 SSM + 注意力 + MoE FFN -│ │ ├── Qwen3/ -│ │ └── Qwen35/ -│ ├── GgufReader.cs # GGUF 文件解析器 -│ ├── ModelBase.cs # 各模型架构基类 -│ ├── ChatTemplate.cs # 聊天模板渲染(硬编码 + 来自 GGUF 的 Jinja2) -│ ├── Jinja2Template.cs # Jinja2 模板渲染器 -│ ├── OutputParser.cs # 从模型输出中提取思维链、内容和工具调用 -│ ├── SamplingConfig.cs # 采样参数配置 -│ ├── TokenSampler.cs # Token 采样(greedy、top-k、top-p、min-p、惩罚项) -│ └── MediaHelper.cs # 视频抽帧、音频解码 -├── InferenceConsole/ # CLI 应用 -├── InferenceWeb/ # Web 聊天 + API 服务(ASP.NET Core) -│ ├── ModelService.cs # 模型生命周期管理 -│ ├── InferenceQueue.cs # 带排队位置追踪的 FIFO 请求队列 -│ ├── wwwroot/index.html # 聊天界面 -│ ├── testdata/ # 集成测试套件(bash + Python) -│ └── API_EXAMPLES.md # 详细 API 文档 -└── ExternalProjects/ # 第三方依赖(ggml) -``` - -## 前置要求 - -- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0) -- **macOS(Metal 后端):** 用于构建原生 GGML 库的 CMake 3.20+ 与 Xcode 命令行工具 -- **Linux(GGML CPU / CUDA 后端):** CMake 3.20+;若使用 `ggml_cuda`,还需要 NVIDIA 驱动和 CUDA Toolkit 12.x 或其他兼容版本 -- GGUF 模型文件(例如来自 [Hugging Face](https://huggingface.co)) - -## 构建 - -### 构建整个解决方案 - -```bash -dotnet build TensorSharp.slnx -``` - -### 构建单独应用 - -```bash -# 控制台应用 -dotnet build InferenceConsole/InferenceConsole.csproj - -# Web 应用 -dotnet build InferenceWeb/InferenceWeb.csproj -``` - -### 构建原生 GGML 库 - -如果原生库不存在,首次执行 `dotnet build` 时会自动构建。也可以手动构建: - -```bash -cd TensorSharp.GGML.Native -``` - -macOS: - -```bash -bash build-macos.sh -``` - -Linux(仅 CPU): - -```bash -bash build-linux.sh -``` - -Linux(启用 GGML_CUDA): - -```bash -bash build-linux.sh --cuda -``` - -也可以在 `dotnet build` 时通过环境变量请求 CUDA 版本的原生库: - -```bash -TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release -``` - -在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 
`TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。 - -## 使用方法 - -### 控制台应用 - -```bash -cd InferenceConsole/bin - -# 文本推理 -./InferenceConsole --model --input prompt.txt --output result.txt \ - --max-tokens 200 --backend ggml_metal - -# Linux + NVIDIA GPU 文本推理 -./InferenceConsole --model --input prompt.txt --output result.txt \ - --max-tokens 200 --backend ggml_cuda - -# 图像推理(Gemma 3/4,Qwen 3.5) -./InferenceConsole --model --image photo.png --backend ggml_metal - -# 视频推理(Gemma 4) -./InferenceConsole --model --video clip.mp4 --backend ggml_metal - -# 音频推理(Gemma 4) -./InferenceConsole --model --audio speech.wav --backend ggml_metal - -# 思维链 / 推理模式 -./InferenceConsole --model --input prompt.txt --backend ggml_metal --think - -# 工具调用 -./InferenceConsole --model --input prompt.txt --backend ggml_metal \ - --tools tools.json - -# 使用采样参数 -./InferenceConsole --model --input prompt.txt --backend ggml_metal \ - --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42 - -# 批处理(JSONL) -./InferenceConsole --model --input-jsonl requests.jsonl \ - --output results.txt --backend ggml_metal -``` - -**命令行参数:** - -| 参数 | 说明 | -|---|---| -| `--model ` | GGUF 模型文件路径(必填) | -| `--input ` | 包含用户提示词的文本文件 | -| `--input-jsonl ` | JSONL 批量请求文件(每行一个 JSON) | -| `--multi-turn-jsonl ` | 用于多轮对话模拟(含 KV 缓存复用)的 JSONL 文件 | -| `--output ` | 将生成文本写入该文件 | -| `--image ` | 用于视觉推理的图像文件 | -| `--video ` | 用于视频推理的视频文件 | -| `--audio ` | 音频文件(WAV、MP3、OGG)用于音频推理 | -| `--mmproj ` | 多模态投影器 GGUF 文件路径 | -| `--max-tokens ` | 最大生成 token 数(默认:100) | -| `--backend ` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda` | -| `--think` | 启用思维链/推理模式 | -| `--tools ` | 包含工具/函数定义的 JSON 文件 | -| `--temperature ` | 采样温度(0 = 贪心) | -| `--top-k ` | Top-K 过滤(0 = 关闭) | -| `--top-p ` | Nucleus 采样阈值(1.0 = 关闭) | -| `--min-p ` | 最小概率过滤(0 = 关闭) | -| `--repeat-penalty ` | 重复惩罚(1.0 = 无) | -| `--presence-penalty ` | 存在惩罚(0 = 关闭) | -| `--frequency-penalty ` | 频率惩罚(0 = 关闭) | -| `--seed ` | 随机种子(-1 = 非确定性) | -| `--stop ` | 
停止序列(可重复指定) | -| `--test` | 运行内置测试套件 | - -如果把多模态投影器文件放在模型文件同目录并使用可识别命名(例如 `gemma-4-mmproj-F16.gguf`),系统会自动检测。 - -**JSONL 输入格式:** - -每行是一个 JSON 对象,包含 `messages`、可选 `prompt` 和可选采样参数: - -```json -{"id": "q1", "messages": [{"role": "user", "content": "What is 2+3?"}], "max_tokens": 50} -{"id": "q2", "messages": [{"role": "user", "content": "Write a haiku."}], "max_tokens": 100, "temperature": 0.8} -``` - -### Web 应用 - -```bash -cd InferenceWeb/bin - -# 设置环境变量并运行 -MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb - -# Linux + NVIDIA GPU -MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb -``` - -在浏览器中打开 `http://localhost:5000`。Web 界面支持: - -- 多轮聊天 -- 从 `MODEL_DIR` 中可用 GGUF 文件列表选择模型 -- 上传图像、视频和音频进行多模态推理(最大 500 MB) -- 思维链/推理模式切换 -- 带函数定义的工具调用 -- 通过 Server-Sent Events 进行流式 token 生成 -- 带实时排队位置反馈的请求队列 -- 消息编辑和删除,支持从对话中任意位置重新生成 - -**环境变量:** - -| 变量 | 说明 | -|---|---| -| `MODEL_DIR` | GGUF 模型文件所在目录 | -| `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) | -| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) | -| `PORT` | HTTP 端口(默认:`5000`) | - -### HTTP API - -InferenceWeb 暴露三种 API 风格。完整文档及 curl/Python 示例见 [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md)。 - -**兼容 Ollama 的 API:** - -```bash -# 列出模型 -curl http://localhost:5000/api/tags - -# 文本生成 -curl -X POST http://localhost:5000/api/generate \ - -H "Content-Type: application/json" \ - -d '{"model": "Qwen3-4B-Q8_0.gguf", "prompt": "Hello!", "stream": false}' - -# 聊天 -curl -X POST http://localhost:5000/api/chat/ollama \ - -H "Content-Type: application/json" \ - -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "stream": false}' - -# 启用思维链模式的聊天 -curl -X POST http://localhost:5000/api/chat/ollama \ - -H "Content-Type: application/json" \ - -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "计算 17*23"}], "think": true, "stream": false}' - -# 带工具调用的聊天 -curl -X POST http://localhost:5000/api/chat/ollama \ - -H "Content-Type: 
application/json" \ - -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "天气怎么样?"}], "tools": [{"function": {"name": "get_weather", "description": "获取当前天气", "parameters": {"properties": {"city": {"type": "string"}}, "required": ["city"]}}}], "stream": false}' -``` - -**兼容 OpenAI 的 API:** - -```bash -# Chat completions -curl -X POST http://localhost:5000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 50}' -``` - -**OpenAI Python SDK:** - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:5000/v1", api_key="not-needed") -response = client.chat.completions.create( - model="Qwen3-4B-Q8_0.gguf", - messages=[{"role": "user", "content": "What is 2+3?"}], - max_tokens=50 -) -print(response.choices[0].message.content) -``` - -**队列状态:** - -```bash -curl http://localhost:5000/api/queue/status -# {"busy":false,"pending_requests":0,"total_processed":42} -``` - -## 思维链 / 推理模式 - -支持思维链模式的模型(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)可以在生成最终答案之前产出结构化的思维链推理内容。思维内容与主要回复分开,客户端可选择显示或隐藏。 - -- **Qwen 3 / Qwen 3.5 / Nemotron-H:** 使用 `...` 标签 -- **Gemma 4:** 使用 `<|channel>thought\n...` 标签 -- **GPT OSS:** 使用 Harmony 格式,以 `<|channel|>analysis` 标记思维过程,以 `<|channel|>final` 标记最终回复 - -通过 `--think`(控制台)、`"think": true`(Ollama API)或 Web 界面中的思维链开关启用。 - -## 工具调用 / 函数调用 - -模型可以调用用户定义的工具并参与多轮工具调用对话。将工具定义为 JSON 格式,通过 `--tools`(控制台)或 API 中的 `tools` 参数传入。 - -各架构使用各自的工具调用格式: - -- **Qwen 3 / Qwen 3.5 / Nemotron-H:** `{"name": "...", "arguments": {...}}` -- **Gemma 4:** `<|tool_call>call:function_name{args}` - -输出解析器(`OutputParser.cs`)会自动从模型原始输出中提取工具调用,与架构无关。 - -## 多模态支持 - -### Gemma 4 - -Gemma 4 模型支持图像、视频和音频输入。将多模态投影器(`gemma-4-mmproj-F16.gguf`)放在与模型文件相同目录即可自动加载。 - -- **图像:** PNG、JPEG -- **视频:** MP4(使用 OpenCV 以 1 fps 抽取最多 8 帧) -- **音频:** WAV(16kHz 单声道)、MP3、OGG Vorbis - -### Gemma 3 / Qwen 3.5 - -这两类模型支持图像输入,并需要对应的多模态投影器文件。 - -## 架构说明 
- -TensorSharp 采用分层系统结构: - -1. **TensorSharp** 提供核心 `Tensor` 类型、存储抽象和可扩展的操作注册表(`Ops`)。CPU 实现使用 `System.Numerics.Vectors` 进行 SIMD 加速。 - -2. **TensorSharp.GGML** 通过原生 C++ 桥接库(`libGgmlOps`)注册同名操作的加速实现,并链接 [ggml](https://github.com/ggml-org/ggml)。在 macOS 上可提供 Metal GPU 计算,在 Linux 上可启用面向 NVIDIA GPU 的 GGML CUDA。操作包括原生量化 matmul(Q4_K_M、Q8_0 等),无需反量化到 FP32。 - -3. **InferenceEngine** 实现模型相关逻辑:GGUF 解析、分词(SentencePiece BPE)、聊天模板渲染(来自 GGUF 元数据的 Jinja2 + 硬编码回退)、可配置 token 采样、输出解析(思维链提取、工具调用提取),以及各架构前向计算(包括 Nemotron-H 等混合 SSM-Transformer 模型的 Mamba2 层)。模型通过 `ModelBase.Create()` 加载,并依据 GGUF 元数据自动识别架构。 - -4. **InferenceConsole** 与 **InferenceWeb** 是应用层,负责 I/O 和用户交互。InferenceWeb 同时提供兼容 Ollama 与 OpenAI 的 REST API 以及浏览器聊天 UI,并使用 FIFO 推理队列来串行化并发请求。 - -### 性能优化 - -- **融合 GPU decode**(Gemma 4):在 Metal 上将所有 Transformer 层合并为单次 GGML 计算图调度,将每个 token 的 CPU-GPU 往返从数百次降低到一次。相较逐算子调度约提升 2.6 倍。 -- **融合权重投影**:Q/K/V 投影融合为单次 QKV matmul;gate 与 up 投影融合为单次 gate_up matmul。 -- **原生量化计算**:量化权重(Q4_K_M、Q6_K、Q8_0 等)直接参与 matmul,无需展开为 FP32,节省内存与带宽。 -- **优化后的纯 C# CPU 路径**:托管 GEMM 快速路径和连续 Float32 内核加速了 decode、softmax、RMSNorm、RoPE、融合激活等热点路径,同时在 CPU 加载时保持量化 GGUF 权重压缩状态。 -- **环形 KV 缓存**:滑动窗口注意力层使用固定大小环形缓冲区,使内存占用不随序列长度增长。 -- **高内存效率模型加载**:大张量直接流式加载到原生内存,避免中间托管内存分配。 - -## 测试 - -InferenceWeb 的集成测试位于 `InferenceWeb/testdata/`。测试覆盖所有三种 API 风格(Web UI SSE、Ollama、OpenAI)、多轮对话、思维链模式、工具调用、队列行为、并发请求和中断支持。 - -```bash -# 先启动 InferenceWeb,然后运行: -python3 InferenceWeb/testdata/test_multiturn.py -# 或 -bash InferenceWeb/testdata/test_multiturn.sh -``` - -完整测试矩阵见 [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md)。 - -## 作者 - -Zhongkai Fu - -## 许可证 - -详见 [LICENSE](LICENSE)。 +# TensorSharp + +

+ TensorSharp logo +

+ +[English](README.md) | [中文](README_zh-cn.md) + +一个用于在本地运行大型语言模型(LLM)的 C# 推理引擎,使用 GGUF 模型文件。TensorSharp 提供控制台应用、基于 Web 的聊天界面,以及兼容 Ollama/OpenAI 的 HTTP API 以便程序化调用。 + +## 功能特性 + +- **多架构支持** —— Gemma 4、Gemma 3、Qwen 3、Qwen 3.5、GPT OSS、Nemotron-H、Mistral 3 +- **多模态推理** —— 图像、视频和音频输入(Gemma 4);图像输入(Gemma 3 / Qwen 3.5 / Mistral 3) +- **思维链 / 推理模式** —— 通过 `` / `<|channel>thought` / `<|channel>analysis` 标签输出结构化的思维链推理(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H) +- **工具调用 / 函数调用** —— 模型可调用用户定义的工具;所有三种 API 风格均支持多轮工具调用对话 +- **量化模型支持** —— 加载 Q4_K_M、Q8_0、F16、MXFP4 等量化格式的 GGUF 文件;执行原生量化矩阵乘法(matmul),无需反量化到 FP32,并且纯 C# CPU 后端在加载大型 GGUF 时也会保持量化权重压缩状态 +- **GPU 加速** —— 通过 GGML 支持 Apple Metal(macOS)和 GGML CUDA(Linux/NVIDIA);Gemma 4 在 Metal 上支持整模型融合 GPU decode(相对逐算子调度约提升 2.6 倍) +- **优化后的纯 C# CPU 后端** —— 为 GEMM、RMSNorm、RoPE、softmax、融合激活等推理热点路径提供托管快速路径和 SIMD 内核 +- **兼容 Ollama 与 OpenAI API** —— 可作为现有工具链的即插即用替代端点 +- **可配置采样** —— temperature、top-k、top-p、min-p、重复/存在/频率惩罚、seed、停止序列 +- **聊天模板** —— 从 GGUF 元数据自动加载(Jinja2),并为不同架构提供硬编码回退模板 +- **请求队列** —— FIFO 推理队列确保单请求执行以保障 KV 缓存稳定性,并为客户端提供实时排队位置反馈 +- **批处理** —— 控制台应用支持 JSONL 输入 +- **流式输出** —— 按 token 输出(Web 通过 SSE,控制台通过 stdout) +- **混合 SSM-Transformer** —— Nemotron-H 在单个模型中混合 Mamba2 SSM 层、纯注意力层和 MoE FFN 层 +- **专家混合(MoE)** —— 支持 Gemma 4 MoE 变体(例如 gemma-4-26B-A4B)、GPT OSS MoE(例如 gpt-oss-20b)、Nemotron-H MoE FFN 层 +- **消息编辑** —— 在 Web 聊天界面中编辑或删除历史消息,并从该位置重新生成回复 +- **大文件上传** —— Web 界面支持最大 500 MB 的视频/音频上传 + +## 支持的模型架构 + +| 架构 | 示例模型 | 多模态 | 思维链 | 工具调用 | +|---|---|---|---|---| +| Gemma 4 | gemma-4-E4B、gemma-4-31B、gemma-4-26B-A4B(MoE) | 图像、视频、音频 | 支持 | 支持 | +| Gemma 3 | gemma-3-4b | 图像 | 不支持 | 不支持 | +| Qwen 3 | Qwen3-4B | 仅文本 | 支持 | 支持 | +| Qwen 3.5 | Qwen3.5-9B、Qwen3.5-35B-A3B | 图像 | 支持 | 支持 | +| GPT OSS | gpt-oss-20b(MoE) | 仅文本 | 支持 | 不支持 | +| Nemotron-H | Nemotron-H-8B、Nemotron-H-47B(混合 SSM-Transformer,MoE) | 仅文本 | 支持 | 支持 | +| Mistral 3 | Mistral-Small-3.1-24B-Instruct | 图像 | 不支持 | 不支持 | + +各架构的详细文档见[模型架构卡片](docs/model_cards_cn.md)。 + +## 
模型下载(GGUF) + +TensorSharp 使用 GGUF 格式模型文件。以下是各架构对应的 Hugging Face 下载链接。请根据硬件条件选择合适的量化版本(Q4_K_M 适合低内存,Q8_0 适合更高质量等)。 + +| 架构 | 模型 | GGUF 下载 | +|---|---|---| +| Gemma 4 | gemma-4-E4B-it | [ggml-org/gemma-4-E4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF) | +| Gemma 4 | gemma-4-31B-it | [ggml-org/gemma-4-31B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-31B-it-GGUF) | +| Gemma 4 | gemma-4-26B-A4B-it(MoE) | [ggml-org/gemma-4-26B-A4B-it-GGUF](https://huggingface.co/ggml-org/gemma-4-26B-A4B-it-GGUF) | +| Gemma 4 | gemma-4-mmproj(多模态投影器) | 包含在上述 GGUF 仓库中 | +| Gemma 3 | gemma-3-4b-it | [google/gemma-3-4b-it-qat-q4_0-gguf](https://huggingface.co/google/gemma-3-4b-it-qat-q4_0-gguf) | +| Qwen 3 | Qwen3-4B | [Qwen/Qwen3-4B-GGUF](https://huggingface.co/Qwen/Qwen3-4B-GGUF) | +| Qwen 3.5 | Qwen3.5-9B | [unsloth/Qwen3.5-9B-GGUF](https://huggingface.co/unsloth/Qwen3.5-9B-GGUF) | +| Qwen 3.5 | Qwen3.5-35B-A3B | [ggml-org/Qwen3.5-35B-A3B-GGUF](https://huggingface.co/ggml-org/Qwen3.5-35B-A3B-GGUF) | +| GPT OSS | gpt-oss-20b(MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) | +| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) | +| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) | +| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) | +| Mistral 3 | mistral3-mmproj(Pixtral 视觉投影器) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) | + +## 计算后端 + +| 后端 | 参数 | 说明 | +|---|---|---| +| GGML Metal | `--backend ggml_metal` | 通过 Apple Metal(macOS)进行 GPU 加速。推荐用于 Apple Silicon。 | +| GGML CUDA | 
`--backend ggml_cuda` | 通过 GGML CUDA 在 Linux + NVIDIA GPU 上进行加速。 | +| GGML CPU | `--backend ggml_cpu` | 使用原生 GGML 与优化内核进行 CPU 推理。 | +| 纯 C# CPU | `--backend cpu` | 无原生依赖的可移植 CPU 推理。 | + +## 项目结构 + +```text +TensorSharp/ +├── TensorSharp.Core/ # 核心张量库(Tensor、Ops、内存、设备抽象) +├── TensorSharp.Runtime/ # GGUF、分词器、模板、采样、协议解析 +├── TensorSharp.Models/ # 模型架构实现与多模态编码/注入 +├── TensorSharp.Backends.GGML/ # GGML 后端绑定(通过原生库支持 Metal/CUDA/CPU) +├── TensorSharp.GGML.Native/ # 到 ggml 的原生 C++ 桥接(构建 libGgmlOps) +├── TensorSharp.Server/ # Web 聊天 + API 服务(ASP.NET Core) +│ ├── ModelService.cs # 模型生命周期管理 +│ ├── InferenceQueue.cs # 带排队位置跟踪的 FIFO 请求队列 +│ ├── wwwroot/index.html # 聊天界面 +│ ├── testdata/ # 集成测试套件(bash + Python) +│ └── API_EXAMPLES.md # 详细 API 文档 +├── TensorSharp.Cli/ # CLI 应用 +├── AdvUtils/ # 工具库 +└── ExternalProjects/ # 第三方依赖(ggml) +``` + +## NuGet 包分层 + +现在仓库按包边界拆成独立层,使用者可以只引用真正需要的部分。 + +| 项目 | NuGet 包 | 对外 namespace | 职责 | +|---|---|---|---| +| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor 原语、Ops、分配器、存储与设备抽象 | +| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF 解析、分词器、Prompt 渲染、采样与输出协议解析 | +| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`、各模型架构、多模态编码器与模型侧执行辅助 | +| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML 执行后端与原生互操作 | +| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core 服务、OpenAI/Ollama 适配层、队列与 Web UI | +| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | 控制台宿主、调试工具与 JSONL 批处理 | + +这样的拆分让引擎使用者不必带上 Web 依赖,也能把 API 层改动和核心运行时隔离开,并让后续 benchmark / eval harness 更容易独立发布。 + +## 前置要求 + +- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0) +- **macOS(Metal 后端):** 用于构建原生 GGML 库的 CMake 3.20+ 与 Xcode 命令行工具 +- **Linux(GGML CPU / CUDA 后端):** CMake 3.20+;若使用 `ggml_cuda`,还需要 NVIDIA 驱动和 CUDA Toolkit 12.x 或其他兼容版本 +- GGUF 模型文件(例如来自 [Hugging Face](https://huggingface.co)) + +## 构建 + +### 构建整个解决方案 + +```bash +dotnet build 
TensorSharp.slnx +``` + +### 构建单独应用 + +```bash +# 控制台应用 +dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj + +# Web 应用 +dotnet build TensorSharp.Server/TensorSharp.Server.csproj +``` + +### 构建原生 GGML 库 + +如果原生库不存在,首次执行 `dotnet build` 时会自动构建。也可以手动构建: + +```bash +cd TensorSharp.GGML.Native +``` + +macOS: + +```bash +bash build-macos.sh +``` + +Linux(仅 CPU): + +```bash +bash build-linux.sh +``` + +Linux(启用 GGML_CUDA): + +```bash +bash build-linux.sh --cuda +``` + +也可以在 `dotnet build` 时通过环境变量请求 CUDA 版本的原生库: + +```bash +TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release +``` + +在 macOS 上会生成带 Metal GPU 支持的 `libGgmlOps.dylib`。在 Linux 上,`build-linux.sh` 会保留已有的 CUDA 构建,并在检测到 CUDA 工具链时自动启用 GGML_CUDA;也可以通过 `build-linux.sh --cuda` 或 `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` 显式启用。构建产物会自动复制到应用输出目录。 + +## 使用方法 + +### 控制台应用 + +```bash +cd TensorSharp.Cli/bin + +# 文本推理 +./TensorSharp.Cli --model --input prompt.txt --output result.txt \ + --max-tokens 200 --backend ggml_metal + +# Linux + NVIDIA GPU 文本推理 +./TensorSharp.Cli --model --input prompt.txt --output result.txt \ + --max-tokens 200 --backend ggml_cuda + +# 图像推理(Gemma 3/4,Qwen 3.5) +./TensorSharp.Cli --model --image photo.png --backend ggml_metal + +# 视频推理(Gemma 4) +./TensorSharp.Cli --model --video clip.mp4 --backend ggml_metal + +# 音频推理(Gemma 4) +./TensorSharp.Cli --model --audio speech.wav --backend ggml_metal + +# 思维链 / 推理模式 +./TensorSharp.Cli --model --input prompt.txt --backend ggml_metal --think + +# 工具调用 +./TensorSharp.Cli --model --input prompt.txt --backend ggml_metal \ + --tools tools.json + +# 使用采样参数 +./TensorSharp.Cli --model --input prompt.txt --backend ggml_metal \ + --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42 + +# 批处理(JSONL) +./TensorSharp.Cli --model --input-jsonl requests.jsonl \ + --output results.txt --backend ggml_metal +``` + +**命令行参数:** + +| 参数 | 说明 | +|---|---| +| `--model ` | GGUF 模型文件路径(必填) | +| `--input ` | 包含用户提示词的文本文件 
| +| `--input-jsonl ` | JSONL 批量请求文件(每行一个 JSON) | +| `--multi-turn-jsonl ` | 用于多轮对话模拟(含 KV 缓存复用)的 JSONL 文件 | +| `--output ` | 将生成文本写入该文件 | +| `--image ` | 用于视觉推理的图像文件 | +| `--video ` | 用于视频推理的视频文件 | +| `--audio ` | 音频文件(WAV、MP3、OGG)用于音频推理 | +| `--mmproj ` | 多模态投影器 GGUF 文件路径 | +| `--max-tokens ` | 最大生成 token 数(默认:100) | +| `--backend ` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda` | +| `--think` | 启用思维链/推理模式 | +| `--tools ` | 包含工具/函数定义的 JSON 文件 | +| `--temperature ` | 采样温度(0 = 贪心) | +| `--top-k ` | Top-K 过滤(0 = 关闭) | +| `--top-p ` | Nucleus 采样阈值(1.0 = 关闭) | +| `--min-p ` | 最小概率过滤(0 = 关闭) | +| `--repeat-penalty ` | 重复惩罚(1.0 = 无) | +| `--presence-penalty ` | 存在惩罚(0 = 关闭) | +| `--frequency-penalty ` | 频率惩罚(0 = 关闭) | +| `--seed ` | 随机种子(-1 = 非确定性) | +| `--stop ` | 停止序列(可重复指定) | +| `--test` | 运行内置测试套件 | + +如果把多模态投影器文件放在模型文件同目录并使用可识别命名(例如 `gemma-4-mmproj-F16.gguf`),系统会自动检测。 + +**JSONL 输入格式:** + +每行是一个 JSON 对象,包含 `messages`、可选 `prompt` 和可选采样参数: + +```json +{"id": "q1", "messages": [{"role": "user", "content": "What is 2+3?"}], "max_tokens": 50} +{"id": "q2", "messages": [{"role": "user", "content": "Write a haiku."}], "max_tokens": 100, "temperature": 0.8} +``` + +### Web 应用 + +```bash +cd TensorSharp.Server/bin + +# 设置环境变量并运行 +MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server + +# Linux + NVIDIA GPU +MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server +``` + +在浏览器中打开 `http://localhost:5000`。Web 界面支持: + +- 多轮聊天 +- 从 `MODEL_DIR` 中可用 GGUF 文件列表选择模型 +- 上传图像、视频和音频进行多模态推理(最大 500 MB) +- 思维链/推理模式切换 +- 带函数定义的工具调用 +- 通过 Server-Sent Events 进行流式 token 生成 +- 带实时排队位置反馈的请求队列 +- 消息编辑和删除,支持从对话中任意位置重新生成 + +**环境变量:** + +| 变量 | 说明 | +|---|---| +| `MODEL_DIR` | GGUF 模型文件所在目录 | +| `BACKEND` | 计算后端:`cpu`、`ggml_cpu`、`ggml_metal` 或 `ggml_cuda`(默认:macOS 为 `ggml_metal`,其他平台为 `ggml_cpu`) | +| `VIDEO_MAX_FRAMES` | 视频提示词中均匀抽取的视频帧上限(默认:`4`) | +| `PORT` | HTTP 端口(默认:`5000`) | + +### HTTP API + +TensorSharp.Server 暴露三种 API 风格。完整文档及 curl/Python 示例见 
[API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md)。 + +**兼容 Ollama 的 API:** + +```bash +# 列出模型 +curl http://localhost:5000/api/tags + +# 文本生成 +curl -X POST http://localhost:5000/api/generate \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3-4B-Q8_0.gguf", "prompt": "Hello!", "stream": false}' + +# 聊天 +curl -X POST http://localhost:5000/api/chat/ollama \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "stream": false}' + +# 启用思维链模式的聊天 +curl -X POST http://localhost:5000/api/chat/ollama \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "计算 17*23"}], "think": true, "stream": false}' + +# 带工具调用的聊天 +curl -X POST http://localhost:5000/api/chat/ollama \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "天气怎么样?"}], "tools": [{"function": {"name": "get_weather", "description": "获取当前天气", "parameters": {"properties": {"city": {"type": "string"}}, "required": ["city"]}}}], "stream": false}' +``` + +**兼容 OpenAI 的 API:** + +```bash +# Chat completions +curl -X POST http://localhost:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3-4B-Q8_0.gguf", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 50}' +``` + +**OpenAI Python SDK:** + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:5000/v1", api_key="not-needed") +response = client.chat.completions.create( + model="Qwen3-4B-Q8_0.gguf", + messages=[{"role": "user", "content": "What is 2+3?"}], + max_tokens=50 +) +print(response.choices[0].message.content) +``` + +**队列状态:** + +```bash +curl http://localhost:5000/api/queue/status +# {"busy":false,"pending_requests":0,"total_processed":42} +``` + +## 思维链 / 推理模式 + +支持思维链模式的模型(Qwen 3、Qwen 3.5、Gemma 4、GPT OSS、Nemotron-H)可以在生成最终答案之前产出结构化的思维链推理内容。思维内容与主要回复分开,客户端可选择显示或隐藏。 
+ +- **Qwen 3 / Qwen 3.5 / Nemotron-H:** 使用 `...` 标签 +- **Gemma 4:** 使用 `<|channel>thought\n...` 标签 +- **GPT OSS:** 使用 Harmony 格式,以 `<|channel|>analysis` 标记思维过程,以 `<|channel|>final` 标记最终回复 + +通过 `--think`(控制台)、`"think": true`(Ollama API)或 Web 界面中的思维链开关启用。 + +## 工具调用 / 函数调用 + +模型可以调用用户定义的工具并参与多轮工具调用对话。将工具定义为 JSON 格式,通过 `--tools`(控制台)或 API 中的 `tools` 参数传入。 + +各架构使用各自的工具调用格式: + +- **Qwen 3 / Qwen 3.5 / Nemotron-H:** `{"name": "...", "arguments": {...}}` +- **Gemma 4:** `<|tool_call>call:function_name{args}` + +输出解析器(`OutputParser.cs`)会自动从模型原始输出中提取工具调用,与架构无关。 + +## 多模态支持 + +### Gemma 4 + +Gemma 4 模型支持图像、视频和音频输入。将多模态投影器(`gemma-4-mmproj-F16.gguf`)放在与模型文件相同目录即可自动加载。 + +- **图像:** PNG、JPEG +- **视频:** MP4(使用 OpenCV 以 1 fps 抽取最多 8 帧) +- **音频:** WAV(16kHz 单声道)、MP3、OGG Vorbis + +### Gemma 3 / Qwen 3.5 + +这两类模型支持图像输入,并需要对应的多模态投影器文件。 + +### Mistral 3 + +Mistral 3 通过 Pixtral 视觉编码器支持图像输入。将多模态投影器(`mistral3-mmproj.gguf`)放在与模型文件相同目录即可自动加载。 + +- **图像:** PNG、JPEG + +## 架构说明 + +TensorSharp 采用分层系统结构: + +1. **TensorSharp.Core** 提供核心 `Tensor` 类型、存储抽象和可扩展的操作注册表(`Ops`)。CPU 实现使用 `System.Numerics.Vectors` 进行 SIMD 加速。 + +2. **TensorSharp.Runtime** 负责运行时契约与通用服务:GGUF 解析、分词(SentencePiece / BPE)、聊天模板渲染、可配置 token 采样、输出解析,以及 `IModelArchitecture`、`IPromptRenderer`、`IOutputProtocolParser`、`IMultimodalInjector`、`IKVCachePolicy`、`IBackendExecutionPlan` 等抽象。 + +3. **TensorSharp.Models** 实现 `ModelBase` 以及各具体模型架构和多模态辅助组件(Gemma 3/4、Qwen 3/3.5、GPT OSS、Nemotron-H、Mistral 3)。模型通过 `ModelBase.Create()` 加载,并依据 GGUF 元数据自动识别架构。 + +4. **TensorSharp.Backends.GGML** 通过原生 C++ 桥接库(`libGgmlOps`)注册同名操作的加速实现,并链接 [ggml](https://github.com/ggml-org/ggml)。在 macOS 上可提供 Metal GPU 计算,在 Linux 上可启用面向 NVIDIA GPU 的 GGML CUDA。操作包括原生量化 matmul(Q4_K_M、Q8_0 等),无需反量化到 FP32。 + +5. **TensorSharp.Server** 是 HTTP / 应用层,提供兼容 Ollama 与 OpenAI 的 REST API、浏览器聊天 UI、上传处理和 FIFO 推理队列。 + +6. 
**TensorSharp.Cli** 是控制台 / 应用层,用于本地 prompt 运行、多模态实验、prompt 检查和 JSONL 批处理。 + +### 性能优化 + +- **融合 GPU decode**(Gemma 4):在 Metal 上将所有 Transformer 层合并为单次 GGML 计算图调度,将每个 token 的 CPU-GPU 往返从数百次降低到一次。相较逐算子调度约提升 2.6 倍。 +- **融合权重投影**:Q/K/V 投影融合为单次 QKV matmul;gate 与 up 投影融合为单次 gate_up matmul。 +- **原生量化计算**:量化权重(Q4_K_M、Q6_K、Q8_0 等)直接参与 matmul,无需展开为 FP32,节省内存与带宽。 +- **优化后的纯 C# CPU 路径**:托管 GEMM 快速路径和连续 Float32 内核加速了 decode、softmax、RMSNorm、RoPE、融合激活等热点路径,同时在 CPU 加载时保持量化 GGUF 权重压缩状态。 +- **环形 KV 缓存**:滑动窗口注意力层使用固定大小环形缓冲区,使内存占用不随序列长度增长。 +- **高内存效率模型加载**:大张量直接流式加载到原生内存,避免中间托管内存分配。 + +## 测试 + +TensorSharp.Server 的集成测试位于 `TensorSharp.Server/testdata/`。测试覆盖所有三种 API 风格(Web UI SSE、Ollama、OpenAI)、多轮对话、思维链模式、工具调用、队列行为、并发请求和中断支持。 + +```bash +# 先启动 TensorSharp.Server,然后运行: +python3 TensorSharp.Server/testdata/test_multiturn.py +# 或 +bash TensorSharp.Server/testdata/test_multiturn.sh +``` + +完整测试矩阵见 [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md)。 + +## 作者 + +Zhongkai Fu + +## 许可证 + +详见 [LICENSE](LICENSE)。 + diff --git a/TensorSharp.GGML/GgmlAllocator.cs b/TensorSharp.Backends.GGML/GgmlAllocator.cs similarity index 100% rename from TensorSharp.GGML/GgmlAllocator.cs rename to TensorSharp.Backends.GGML/GgmlAllocator.cs diff --git a/TensorSharp.GGML/GgmlBasicOps.cs b/TensorSharp.Backends.GGML/GgmlBasicOps.cs similarity index 100% rename from TensorSharp.GGML/GgmlBasicOps.cs rename to TensorSharp.Backends.GGML/GgmlBasicOps.cs diff --git a/TensorSharp.GGML/GgmlContext.cs b/TensorSharp.Backends.GGML/GgmlContext.cs similarity index 100% rename from TensorSharp.GGML/GgmlContext.cs rename to TensorSharp.Backends.GGML/GgmlContext.cs diff --git a/TensorSharp.GGML/GgmlGgufTensorDequant.cs b/TensorSharp.Backends.GGML/GgmlGgufTensorDequant.cs similarity index 100% rename from TensorSharp.GGML/GgmlGgufTensorDequant.cs rename to TensorSharp.Backends.GGML/GgmlGgufTensorDequant.cs diff --git a/TensorSharp.GGML/GgmlLossOps.cs b/TensorSharp.Backends.GGML/GgmlLossOps.cs similarity 
index 100% rename from TensorSharp.GGML/GgmlLossOps.cs rename to TensorSharp.Backends.GGML/GgmlLossOps.cs diff --git a/TensorSharp.GGML/GgmlMemoryPool.cs b/TensorSharp.Backends.GGML/GgmlMemoryPool.cs similarity index 100% rename from TensorSharp.GGML/GgmlMemoryPool.cs rename to TensorSharp.Backends.GGML/GgmlMemoryPool.cs diff --git a/TensorSharp.GGML/GgmlNative.cs b/TensorSharp.Backends.GGML/GgmlNative.cs similarity index 100% rename from TensorSharp.GGML/GgmlNative.cs rename to TensorSharp.Backends.GGML/GgmlNative.cs diff --git a/TensorSharp.GGML/GgmlStorage.cs b/TensorSharp.Backends.GGML/GgmlStorage.cs similarity index 100% rename from TensorSharp.GGML/GgmlStorage.cs rename to TensorSharp.Backends.GGML/GgmlStorage.cs diff --git a/TensorSharp.GGML/TensorSharp.GGML.csproj b/TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj similarity index 92% rename from TensorSharp.GGML/TensorSharp.GGML.csproj rename to TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj index 2d93ad3..c272991 100644 --- a/TensorSharp.GGML/TensorSharp.GGML.csproj +++ b/TensorSharp.Backends.GGML/TensorSharp.Backends.GGML.csproj @@ -5,6 +5,8 @@ false false bin\ + GGML backend integration for TensorSharp model execution. + tensor;backend;ggml;native true @@ -27,12 +29,11 @@ true - - + diff --git a/TensorSharp.Cli/GlobalUsings.cs b/TensorSharp.Cli/GlobalUsings.cs new file mode 100644 index 0000000..df7cdd3 --- /dev/null +++ b/TensorSharp.Cli/GlobalUsings.cs @@ -0,0 +1,2 @@ +global using TensorSharp.Models; +global using TensorSharp.Runtime; diff --git a/InferenceConsole/Program.cs b/TensorSharp.Cli/Program.cs similarity index 92% rename from InferenceConsole/Program.cs rename to TensorSharp.Cli/Program.cs index 5737f51..35254b7 100644 --- a/InferenceConsole/Program.cs +++ b/TensorSharp.Cli/Program.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. 
// https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -14,14 +14,15 @@ using System.Linq; using System.Text; using System.Text.Json; -using InferenceEngine; using TensorSharp; using TensorSharp.Cpu; -namespace InferenceConsole +namespace TensorSharp.Cli { class Program { + private static readonly IPromptRenderer PromptRenderer = new GgufPromptRenderer(); + static void Main(string[] args) { Console.OutputEncoding = Encoding.UTF8; @@ -112,7 +113,7 @@ static void Main(string[] args) if (modelPath == null || !File.Exists(modelPath)) { Console.Error.WriteLine($"Model file not found: {modelPath ?? "(none)"}"); - Console.Error.WriteLine("Usage: InferenceConsole --model [--input ] " + + Console.Error.WriteLine("Usage: TensorSharp.Cli --model [--input ] " + "[--input-jsonl ] [--image ] [--output ] " + "[--max-tokens N] [--test] [--backend cpu|ggml_cpu|ggml_metal|ggml_cuda]"); return; @@ -129,49 +130,48 @@ static void Main(string[] args) using var model = ModelBase.Create(modelPath, backend); - if (mmProjPath != null && model is Gemma3Model gemma3WithVision) - { - gemma3WithVision.LoadVisionEncoder(mmProjPath); - } - else if (mmProjPath != null && model is Gemma4Model gemma4WithVision) + if (mmProjPath != null) { - gemma4WithVision.LoadVisionEncoder(mmProjPath); - if (audioPath != null) - gemma4WithVision.LoadAudioEncoder(mmProjPath); - } - else if (mmProjPath != null && model is Qwen35Model qwen35WithVision) - { - qwen35WithVision.LoadVisionEncoder(mmProjPath); + model.MultimodalInjector.LoadProjectors(mmProjPath); } else if (imagePath != null && model.Config.Architecture == "gemma3") { string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "mmproj-gemma3-4b-f16.gguf"); - if (File.Exists(autoMmproj) && model is Gemma3Model g3auto) + if (File.Exists(autoMmproj)) { Console.WriteLine($"Auto-loading vision encoder: {autoMmproj}"); - g3auto.LoadVisionEncoder(autoMmproj); + model.MultimodalInjector.LoadProjectors(autoMmproj); + } + } 
+ else if (imagePath != null && model.Config.Architecture == "mistral3") + { + string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "mistral3-mmproj.gguf"); + if (File.Exists(autoMmproj)) + { + Console.WriteLine($"Auto-loading Mistral3 vision encoder: {autoMmproj}"); + model.MultimodalInjector.LoadProjectors(autoMmproj); } } else if ((imagePath != null || audioPath != null || videoPath != null) && model.Config.Architecture == "gemma4") { string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "gemma-4-mmproj-F16.gguf"); - if (File.Exists(autoMmproj) && model is Gemma4Model g4auto) + if (File.Exists(autoMmproj)) { Console.WriteLine($"Auto-loading multimodal encoder: {autoMmproj}"); - if (imagePath != null || videoPath != null) - g4auto.LoadVisionEncoder(autoMmproj); - if (audioPath != null) - g4auto.LoadAudioEncoder(autoMmproj); + model.MultimodalInjector.LoadProjectors(autoMmproj); } } - else if (imagePath != null && model is Qwen35Model q35auto) + else if (imagePath != null && + (model.Config.Architecture == "qwen35" || + model.Config.Architecture == "qwen35moe" || + model.Config.Architecture == "qwen3next")) { string autoMmproj = Path.Combine(Path.GetDirectoryName(modelPath), "Qwen3.5-mmproj-F16.gguf"); if (File.Exists(autoMmproj)) { Console.WriteLine($"Auto-loading vision encoder: {autoMmproj}"); - q35auto.LoadVisionEncoder(autoMmproj); + model.MultimodalInjector.LoadProjectors(autoMmproj); } } @@ -249,7 +249,7 @@ static void Main(string[] args) { new ChatMessage { Role = "user", Content = rawText } }; - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = PromptRenderer.Render( model.Config.ChatTemplate, dumpMessages, addGenerationPrompt: true, architecture: model.Config.Architecture, tools: tools, enableThinking: enableThinking); Console.WriteLine("=== Rendered Prompt ==="); @@ -325,7 +325,7 @@ static void RunMultiTurnTest(ModelBase model, string jsonlPath, int maxTokens, history.Add(new ChatMessage { Role = 
"user", Content = userMsg }); Console.WriteLine($"\n[Turn {turn + 1}/{lines.Length}] User: {userMsg}"); - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = PromptRenderer.Render( model.Config.ChatTemplate, history, addGenerationPrompt: true, architecture: arch, enableThinking: enableThinking); @@ -485,7 +485,7 @@ static void RunJsonlBatch(ModelBase model, string inputJsonlPath, string outputF bool reqThinking = enableThinking || (root.TryGetProperty("enable_thinking", out var etProp) && etProp.GetBoolean()); - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = PromptRenderer.Render( model.Config.ChatTemplate, messages, addGenerationPrompt: true, architecture: model.Config.Architecture, enableThinking: reqThinking); @@ -655,7 +655,7 @@ static string RunInference(ModelBase model, string rawText, List imagePa new ChatMessage { Role = "user", Content = rawText, ImagePaths = imagePaths, AudioPaths = audioPaths, IsVideo = isVideo } }; - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = PromptRenderer.Render( model.Config.ChatTemplate, messages, addGenerationPrompt: true, architecture: model.Config.Architecture, tools: tools, enableThinking: enableThinking); @@ -790,6 +790,69 @@ static string RunInference(ModelBase model, string rawText, List imagePa Console.WriteLine("Note: No vision encoder loaded. 
Use --mmproj to specify the vision encoder GGUF."); } } + else if (arch == "mistral3") + { + if (model is Mistral3Model m3 && m3.VisionEncoder != null) + { + var proc = new Mistral3ImageProcessor( + m3.VisionEncoder.ImageSize, + m3.VisionEncoder.PatchSize); + + int imgTokenId = Mistral3ImageProcessor.ImgTokenId; + int imgBreakId = Mistral3ImageProcessor.ImgBreakTokenId; + int imgEndId = Mistral3ImageProcessor.ImgEndTokenId; + + foreach (var imgP in imagePaths) + { + var (pixels, imgW, imgH) = proc.ProcessImage(imgP); + var visionEmb = m3.VisionEncoder.Encode(pixels, imgW, imgH); + int numRows = imgH / m3.VisionEncoder.PatchSize / m3.VisionEncoder.SpatialMergeSize; + int numCols = imgW / m3.VisionEncoder.PatchSize / m3.VisionEncoder.SpatialMergeSize; + + int tokenPosition = -1; + for (int i = 0; i < inputTokens.Count; i++) + { + if (inputTokens[i] == imgTokenId) + { + tokenPosition = i; + break; + } + } + + if (tokenPosition >= 0) + { + var expanded = new List(); + for (int i = 0; i < tokenPosition; i++) + expanded.Add(inputTokens[i]); + + for (int row = 0; row < numRows; row++) + { + for (int col = 0; col < numCols; col++) + expanded.Add(imgTokenId); + expanded.Add(row == numRows - 1 ? imgEndId : imgBreakId); + } + + for (int i = tokenPosition + 1; i < inputTokens.Count; i++) + expanded.Add(inputTokens[i]); + + m3.SetVisionEmbeddings(visionEmb, tokenPosition); + inputTokens = expanded; + Console.WriteLine($"Mistral3 vision: {numRows}x{numCols} merged patches, " + + $"{numRows * numCols + numRows} total tokens at pos {tokenPosition}"); + } + else + { + visionEmb.Dispose(); + Console.WriteLine("Warning: No [IMG] token found in prompt"); + } + } + Console.WriteLine($"Total tokens after image expansion: {inputTokens.Count}"); + } + else + { + Console.WriteLine("Note: No vision encoder loaded. 
Use --mmproj to specify the vision encoder GGUF."); + } + } else { int imagePadId = model.Tokenizer.LookupToken("<|image_pad|>"); @@ -1382,3 +1445,6 @@ static string Escape(string s) } } } + + + diff --git a/InferenceConsole/InferenceConsole.csproj b/TensorSharp.Cli/TensorSharp.Cli.csproj similarity index 69% rename from InferenceConsole/InferenceConsole.csproj rename to TensorSharp.Cli/TensorSharp.Cli.csproj index 6884313..c876cf9 100644 --- a/InferenceConsole/InferenceConsole.csproj +++ b/TensorSharp.Cli/TensorSharp.Cli.csproj @@ -5,6 +5,7 @@ true false bin\ + Command-line host for TensorSharp model inference and diagnostics. $(MSBuildProjectDirectory)/../TensorSharp.GGML.Native/build @@ -12,7 +13,10 @@ libGgmlOps.so - + + + + diff --git a/InferenceConsole/test_requests.jsonl b/TensorSharp.Cli/test_requests.jsonl similarity index 100% rename from InferenceConsole/test_requests.jsonl rename to TensorSharp.Cli/test_requests.jsonl diff --git a/InferenceConsole/testdata/batch_thinking.jsonl b/TensorSharp.Cli/testdata/batch_thinking.jsonl similarity index 100% rename from InferenceConsole/testdata/batch_thinking.jsonl rename to TensorSharp.Cli/testdata/batch_thinking.jsonl diff --git a/InferenceConsole/testdata/example_api_thinking_tools.md b/TensorSharp.Cli/testdata/example_api_thinking_tools.md similarity index 96% rename from InferenceConsole/testdata/example_api_thinking_tools.md rename to TensorSharp.Cli/testdata/example_api_thinking_tools.md index 057a787..6f42bde 100644 --- a/InferenceConsole/testdata/example_api_thinking_tools.md +++ b/TensorSharp.Cli/testdata/example_api_thinking_tools.md @@ -1,4 +1,4 @@ -# Thinking Mode and Tool Call Examples +# Thinking Mode and Tool Call Examples ## Console Application @@ -8,11 +8,11 @@ Enable thinking mode with `--think`. 
The model will show its reasoning process b ```bash # Basic thinking mode -./InferenceConsole --model model.gguf --backend ggml_metal \ +./TensorSharp.Cli --model model.gguf --backend ggml_metal \ --input testdata/input_thinking.txt --think --max-tokens 500 # Thinking mode with sampling -./InferenceConsole --model model.gguf --backend ggml_metal \ +./TensorSharp.Cli --model model.gguf --backend ggml_metal \ --input testdata/input_thinking.txt --think --max-tokens 500 \ --temperature 0.6 --top-p 0.95 ``` @@ -23,17 +23,17 @@ Provide tool definitions via `--tools `. The model will output struct ```bash # Weather tool call -./InferenceConsole --model model.gguf --backend ggml_metal \ +./TensorSharp.Cli --model model.gguf --backend ggml_metal \ --input testdata/input_tool_call.txt \ --tools testdata/tools_weather.json --max-tokens 300 # Calculator tool call -./InferenceConsole --model model.gguf --backend ggml_metal \ +./TensorSharp.Cli --model model.gguf --backend ggml_metal \ --input testdata/input_tool_calc.txt \ --tools testdata/tools_calculator.json --max-tokens 300 # Combined: thinking + tools -./InferenceConsole --model model.gguf --backend ggml_metal \ +./TensorSharp.Cli --model model.gguf --backend ggml_metal \ --input testdata/input_tool_call.txt \ --tools testdata/tools_weather.json --think --max-tokens 500 ``` @@ -347,3 +347,4 @@ When `tools` are provided: 1. **Gemma4**: Tool declarations use `<|tool>declaration:NAME{...}` format in the system turn. The model outputs calls as `<|tool_call>call:NAME{key:<|"|>value<|"|>}`. 2. **Qwen3**: Tool definitions are injected as JSON in the system message. The model outputs calls as `{"name":"...","arguments":{...}}`. 3. **Qwen3.5**: Tool definitions use `...` format. The model outputs calls as `\nvalue\n`. 
+ diff --git a/InferenceConsole/testdata/input_thinking.txt b/TensorSharp.Cli/testdata/input_thinking.txt similarity index 100% rename from InferenceConsole/testdata/input_thinking.txt rename to TensorSharp.Cli/testdata/input_thinking.txt diff --git a/InferenceConsole/testdata/input_tool_calc.txt b/TensorSharp.Cli/testdata/input_tool_calc.txt similarity index 100% rename from InferenceConsole/testdata/input_tool_calc.txt rename to TensorSharp.Cli/testdata/input_tool_calc.txt diff --git a/InferenceConsole/testdata/input_tool_call.txt b/TensorSharp.Cli/testdata/input_tool_call.txt similarity index 100% rename from InferenceConsole/testdata/input_tool_call.txt rename to TensorSharp.Cli/testdata/input_tool_call.txt diff --git a/InferenceConsole/testdata/tools_calculator.json b/TensorSharp.Cli/testdata/tools_calculator.json similarity index 100% rename from InferenceConsole/testdata/tools_calculator.json rename to TensorSharp.Cli/testdata/tools_calculator.json diff --git a/InferenceConsole/testdata/tools_weather.json b/TensorSharp.Cli/testdata/tools_weather.json similarity index 100% rename from InferenceConsole/testdata/tools_weather.json rename to TensorSharp.Cli/testdata/tools_weather.json diff --git a/TensorSharp/Core/DelegateDisposable.cs b/TensorSharp.Core/Core/DelegateDisposable.cs similarity index 100% rename from TensorSharp/Core/DelegateDisposable.cs rename to TensorSharp.Core/Core/DelegateDisposable.cs diff --git a/TensorSharp/Core/TensorConcatenation.cs b/TensorSharp.Core/Core/TensorConcatenation.cs similarity index 100% rename from TensorSharp/Core/TensorConcatenation.cs rename to TensorSharp.Core/Core/TensorConcatenation.cs diff --git a/TensorSharp/Core/TensorResultBuilder.cs b/TensorSharp.Core/Core/TensorResultBuilder.cs similarity index 100% rename from TensorSharp/Core/TensorResultBuilder.cs rename to TensorSharp.Core/Core/TensorResultBuilder.cs diff --git a/TensorSharp/Cpu/CpuAllocator.cs b/TensorSharp.Core/Cpu/CpuAllocator.cs similarity index 100% 
rename from TensorSharp/Cpu/CpuAllocator.cs rename to TensorSharp.Core/Cpu/CpuAllocator.cs diff --git a/TensorSharp/Cpu/CpuBasicOps.cs b/TensorSharp.Core/Cpu/CpuBasicOps.cs similarity index 100% rename from TensorSharp/Cpu/CpuBasicOps.cs rename to TensorSharp.Core/Cpu/CpuBasicOps.cs diff --git a/TensorSharp/Cpu/CpuFillCopyOps.cs b/TensorSharp.Core/Cpu/CpuFillCopyOps.cs similarity index 100% rename from TensorSharp/Cpu/CpuFillCopyOps.cs rename to TensorSharp.Core/Cpu/CpuFillCopyOps.cs diff --git a/TensorSharp/Cpu/CpuIndexingOps.cs b/TensorSharp.Core/Cpu/CpuIndexingOps.cs similarity index 100% rename from TensorSharp/Cpu/CpuIndexingOps.cs rename to TensorSharp.Core/Cpu/CpuIndexingOps.cs diff --git a/TensorSharp/Cpu/CpuMaxPoolingOps.cs b/TensorSharp.Core/Cpu/CpuMaxPoolingOps.cs similarity index 100% rename from TensorSharp/Cpu/CpuMaxPoolingOps.cs rename to TensorSharp.Core/Cpu/CpuMaxPoolingOps.cs diff --git a/TensorSharp/Cpu/CpuNativeHelpers.cs b/TensorSharp.Core/Cpu/CpuNativeHelpers.cs similarity index 100% rename from TensorSharp/Cpu/CpuNativeHelpers.cs rename to TensorSharp.Core/Cpu/CpuNativeHelpers.cs diff --git a/TensorSharp/Cpu/CpuOpsNative.cs b/TensorSharp.Core/Cpu/CpuOpsNative.cs similarity index 100% rename from TensorSharp/Cpu/CpuOpsNative.cs rename to TensorSharp.Core/Cpu/CpuOpsNative.cs diff --git a/TensorSharp/Cpu/CpuRandom.cs b/TensorSharp.Core/Cpu/CpuRandom.cs similarity index 100% rename from TensorSharp/Cpu/CpuRandom.cs rename to TensorSharp.Core/Cpu/CpuRandom.cs diff --git a/TensorSharp/Cpu/CpuStorage.cs b/TensorSharp.Core/Cpu/CpuStorage.cs similarity index 100% rename from TensorSharp/Cpu/CpuStorage.cs rename to TensorSharp.Core/Cpu/CpuStorage.cs diff --git a/TensorSharp/Cpu/LinearAlgebra/DGEMM.cs b/TensorSharp.Core/Cpu/LinearAlgebra/DGEMM.cs similarity index 100% rename from TensorSharp/Cpu/LinearAlgebra/DGEMM.cs rename to TensorSharp.Core/Cpu/LinearAlgebra/DGEMM.cs diff --git a/TensorSharp/Cpu/LinearAlgebra/LSAME.cs 
b/TensorSharp.Core/Cpu/LinearAlgebra/LSAME.cs similarity index 100% rename from TensorSharp/Cpu/LinearAlgebra/LSAME.cs rename to TensorSharp.Core/Cpu/LinearAlgebra/LSAME.cs diff --git a/TensorSharp/Cpu/LinearAlgebra/SGEMM.cs b/TensorSharp.Core/Cpu/LinearAlgebra/SGEMM.cs similarity index 100% rename from TensorSharp/Cpu/LinearAlgebra/SGEMM.cs rename to TensorSharp.Core/Cpu/LinearAlgebra/SGEMM.cs diff --git a/TensorSharp/Cpu/LinearAlgebra/XERBLA.cs b/TensorSharp.Core/Cpu/LinearAlgebra/XERBLA.cs similarity index 100% rename from TensorSharp/Cpu/LinearAlgebra/XERBLA.cs rename to TensorSharp.Core/Cpu/LinearAlgebra/XERBLA.cs diff --git a/TensorSharp/Cpu/MatrixMultiplication.cs b/TensorSharp.Core/Cpu/MatrixMultiplication.cs similarity index 100% rename from TensorSharp/Cpu/MatrixMultiplication.cs rename to TensorSharp.Core/Cpu/MatrixMultiplication.cs diff --git a/TensorSharp/Cpu/NativeWrapper.cs b/TensorSharp.Core/Cpu/NativeWrapper.cs similarity index 100% rename from TensorSharp/Cpu/NativeWrapper.cs rename to TensorSharp.Core/Cpu/NativeWrapper.cs diff --git a/TensorSharp/Cpu/OpenBlasNative.cs b/TensorSharp.Core/Cpu/OpenBlasNative.cs similarity index 100% rename from TensorSharp/Cpu/OpenBlasNative.cs rename to TensorSharp.Core/Cpu/OpenBlasNative.cs diff --git a/TensorSharp/Cpu/SpatialConvolutionMM.cs b/TensorSharp.Core/Cpu/SpatialConvolutionMM.cs similarity index 100% rename from TensorSharp/Cpu/SpatialConvolutionMM.cs rename to TensorSharp.Core/Cpu/SpatialConvolutionMM.cs diff --git a/TensorSharp/DType.cs b/TensorSharp.Core/DType.cs similarity index 100% rename from TensorSharp/DType.cs rename to TensorSharp.Core/DType.cs diff --git a/TensorSharp/Expression/SExpression.cs b/TensorSharp.Core/Expression/SExpression.cs similarity index 100% rename from TensorSharp/Expression/SExpression.cs rename to TensorSharp.Core/Expression/SExpression.cs diff --git a/TensorSharp/Expression/SVar.cs b/TensorSharp.Core/Expression/SVar.cs similarity index 100% rename from 
TensorSharp/Expression/SVar.cs rename to TensorSharp.Core/Expression/SVar.cs diff --git a/TensorSharp/Expression/TExpression.cs b/TensorSharp.Core/Expression/TExpression.cs similarity index 100% rename from TensorSharp/Expression/TExpression.cs rename to TensorSharp.Core/Expression/TExpression.cs diff --git a/TensorSharp/Expression/TVar.cs b/TensorSharp.Core/Expression/TVar.cs similarity index 100% rename from TensorSharp/Expression/TVar.cs rename to TensorSharp.Core/Expression/TVar.cs diff --git a/TensorSharp/Half.cs b/TensorSharp.Core/Half.cs similarity index 100% rename from TensorSharp/Half.cs rename to TensorSharp.Core/Half.cs diff --git a/TensorSharp/IAllocator.cs b/TensorSharp.Core/IAllocator.cs similarity index 100% rename from TensorSharp/IAllocator.cs rename to TensorSharp.Core/IAllocator.cs diff --git a/TensorSharp/IBasicOps.cs b/TensorSharp.Core/IBasicOps.cs similarity index 100% rename from TensorSharp/IBasicOps.cs rename to TensorSharp.Core/IBasicOps.cs diff --git a/TensorSharp/OpConstraint.cs b/TensorSharp.Core/OpConstraint.cs similarity index 100% rename from TensorSharp/OpConstraint.cs rename to TensorSharp.Core/OpConstraint.cs diff --git a/TensorSharp/OpRegistry.cs b/TensorSharp.Core/OpRegistry.cs similarity index 100% rename from TensorSharp/OpRegistry.cs rename to TensorSharp.Core/OpRegistry.cs diff --git a/TensorSharp/OpRegistryAttributes.cs b/TensorSharp.Core/OpRegistryAttributes.cs similarity index 100% rename from TensorSharp/OpRegistryAttributes.cs rename to TensorSharp.Core/OpRegistryAttributes.cs diff --git a/TensorSharp/Ops.cs b/TensorSharp.Core/Ops.cs similarity index 100% rename from TensorSharp/Ops.cs rename to TensorSharp.Core/Ops.cs diff --git a/TensorSharp/Properties/AssemblyInfo.cs b/TensorSharp.Core/Properties/AssemblyInfo.cs similarity index 100% rename from TensorSharp/Properties/AssemblyInfo.cs rename to TensorSharp.Core/Properties/AssemblyInfo.cs diff --git a/TensorSharp/Properties/PublishProfiles/FolderProfile.pubxml 
b/TensorSharp.Core/Properties/PublishProfiles/FolderProfile.pubxml similarity index 100% rename from TensorSharp/Properties/PublishProfiles/FolderProfile.pubxml rename to TensorSharp.Core/Properties/PublishProfiles/FolderProfile.pubxml diff --git a/TensorSharp/Properties/launchSettings.json b/TensorSharp.Core/Properties/launchSettings.json similarity index 100% rename from TensorSharp/Properties/launchSettings.json rename to TensorSharp.Core/Properties/launchSettings.json diff --git a/TensorSharp/RandomGenerator.cs b/TensorSharp.Core/RandomGenerator.cs similarity index 100% rename from TensorSharp/RandomGenerator.cs rename to TensorSharp.Core/RandomGenerator.cs diff --git a/TensorSharp/RefCounted.cs b/TensorSharp.Core/RefCounted.cs similarity index 100% rename from TensorSharp/RefCounted.cs rename to TensorSharp.Core/RefCounted.cs diff --git a/TensorSharp/ReflectionExtensions.cs b/TensorSharp.Core/ReflectionExtensions.cs similarity index 100% rename from TensorSharp/ReflectionExtensions.cs rename to TensorSharp.Core/ReflectionExtensions.cs diff --git a/TensorSharp/Storage.cs b/TensorSharp.Core/Storage.cs similarity index 100% rename from TensorSharp/Storage.cs rename to TensorSharp.Core/Storage.cs diff --git a/TensorSharp/Tensor.cs b/TensorSharp.Core/Tensor.cs similarity index 100% rename from TensorSharp/Tensor.cs rename to TensorSharp.Core/Tensor.cs diff --git a/TensorSharp/TensorApplyCPU.cs b/TensorSharp.Core/TensorApplyCPU.cs similarity index 100% rename from TensorSharp/TensorApplyCPU.cs rename to TensorSharp.Core/TensorApplyCPU.cs diff --git a/TensorSharp/TensorDimIterState.cs b/TensorSharp.Core/TensorDimIterState.cs similarity index 100% rename from TensorSharp/TensorDimIterState.cs rename to TensorSharp.Core/TensorDimIterState.cs diff --git a/TensorSharp/TensorDimensionHelpers.cs b/TensorSharp.Core/TensorDimensionHelpers.cs similarity index 100% rename from TensorSharp/TensorDimensionHelpers.cs rename to TensorSharp.Core/TensorDimensionHelpers.cs diff --git 
a/TensorSharp/TensorFormatting.cs b/TensorSharp.Core/TensorFormatting.cs similarity index 100% rename from TensorSharp/TensorFormatting.cs rename to TensorSharp.Core/TensorFormatting.cs diff --git a/TensorSharp/TensorIterState.cs b/TensorSharp.Core/TensorIterState.cs similarity index 100% rename from TensorSharp/TensorIterState.cs rename to TensorSharp.Core/TensorIterState.cs diff --git a/TensorSharp/TensorSerialization.cs b/TensorSharp.Core/TensorSerialization.cs similarity index 100% rename from TensorSharp/TensorSerialization.cs rename to TensorSharp.Core/TensorSerialization.cs diff --git a/TensorSharp/TensorSharp.csproj b/TensorSharp.Core/TensorSharp.Core.csproj similarity index 93% rename from TensorSharp/TensorSharp.csproj rename to TensorSharp.Core/TensorSharp.Core.csproj index 126e06e..e2bb2a3 100644 --- a/TensorSharp/TensorSharp.csproj +++ b/TensorSharp.Core/TensorSharp.Core.csproj @@ -5,6 +5,8 @@ false false bin\ + TensorSharp core tensor primitives, ops, memory management, and device abstractions. 
+ tensor;core;ops;memory;device true @@ -54,4 +56,4 @@ - \ No newline at end of file + diff --git a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj index 36d8be5..ed8a773 100644 --- a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj +++ b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/InferenceEngineSmoke.csproj @@ -7,6 +7,7 @@ enable - + + diff --git a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs index 1962d59..9cfea69 100644 --- a/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs +++ b/TensorSharp.GGML.Native/tests/InferenceEngineSmoke/Program.cs @@ -1,6 +1,7 @@ using System; using System.IO; -using InferenceEngine; +using TensorSharp.Models; +using TensorSharp.Runtime; static BackendType ParseBackend(string backend) => backend.ToLowerInvariant() switch { diff --git a/TensorSharp.Models/BackendExecutionPlan.cs b/TensorSharp.Models/BackendExecutionPlan.cs new file mode 100644 index 0000000..4fdb90d --- /dev/null +++ b/TensorSharp.Models/BackendExecutionPlan.cs @@ -0,0 +1,32 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. 
+namespace TensorSharp.Models +{ + internal sealed class BackendExecutionPlan : IBackendExecutionPlan + { + public BackendExecutionPlan(BackendType backendType) + { + BackendType = backendType; + } + + public BackendType BackendType { get; } + + public bool UsesGgmlBackend => + BackendType == BackendType.GgmlCpu || + BackendType == BackendType.GgmlMetal || + BackendType == BackendType.GgmlCuda; + + public bool ShouldStoreWeightQuantized(GgufTensorInfo info) + { + return ModelBase.ShouldStoreWeightQuantized(BackendType, info); + } + } +} + diff --git a/TensorSharp.Models/GlobalUsings.cs b/TensorSharp.Models/GlobalUsings.cs new file mode 100644 index 0000000..e0a9f20 --- /dev/null +++ b/TensorSharp.Models/GlobalUsings.cs @@ -0,0 +1 @@ +global using TensorSharp.Runtime; diff --git a/InferenceEngine/Half.cs b/TensorSharp.Models/Half.cs similarity index 96% rename from InferenceEngine/Half.cs rename to TensorSharp.Models/Half.cs index 395e4b7..2ac2393 100644 --- a/InferenceEngine/Half.cs +++ b/TensorSharp.Models/Half.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -10,7 +10,7 @@ using System; using System.Runtime.InteropServices; -namespace InferenceEngine +namespace TensorSharp.Models { [StructLayout(LayoutKind.Sequential)] public struct half @@ -71,3 +71,4 @@ private static float HalfToFloat(ushort value) } } } + diff --git a/InferenceEngine/ManagedQuantizedOps.cs b/TensorSharp.Models/ManagedQuantizedOps.cs similarity index 99% rename from InferenceEngine/ManagedQuantizedOps.cs rename to TensorSharp.Models/ManagedQuantizedOps.cs index 751044f..70ca137 100644 --- a/InferenceEngine/ManagedQuantizedOps.cs +++ b/TensorSharp.Models/ManagedQuantizedOps.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. 
// https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -12,7 +12,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -namespace InferenceEngine +namespace TensorSharp.Models { internal static class ManagedQuantizedOps { @@ -696,3 +696,4 @@ private static unsafe void GetScaleMinK4(int j, byte* q, out byte d, out byte m) } } } + diff --git a/InferenceEngine/MediaHelper.cs b/TensorSharp.Models/MediaHelper.cs similarity index 98% rename from InferenceEngine/MediaHelper.cs rename to TensorSharp.Models/MediaHelper.cs index 34e899a..3b3b6e0 100644 --- a/InferenceEngine/MediaHelper.cs +++ b/TensorSharp.Models/MediaHelper.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -14,7 +14,7 @@ using System.Runtime.InteropServices; using OpenCvSharp; -namespace InferenceEngine +namespace TensorSharp.Models { public static class MediaHelper { @@ -223,3 +223,4 @@ private static uint Crc32Png(byte[] type, byte[] data) } } } + diff --git a/InferenceEngine/ModelBase.cs b/TensorSharp.Models/ModelBase.cs similarity index 96% rename from InferenceEngine/ModelBase.cs rename to TensorSharp.Models/ModelBase.cs index cb9ba9a..c0e62ff 100644 --- a/InferenceEngine/ModelBase.cs +++ b/TensorSharp.Models/ModelBase.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -19,40 +19,8 @@ using TensorSharp.Cpu; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { - public enum BackendType - { - Cpu, - GgmlCpu, - GgmlMetal, - GgmlCuda, - } - - public class ModelConfig - { - public string Architecture { get; set; } - public int HiddenSize { get; set; } - public int NumHeads { get; set; } - public int NumKVHeads { get; set; } - public int KeyLength { get; set; } - public int ValueLength { get; set; } - public float Eps { get; set; } - public float RopeBase { get; set; } - public float RopeScale { get; set; } = 1f; - public int NumLayers { get; set; } - public int VocabSize { get; set; } - public int IntermediateSize { get; set; } - public string ChatTemplate { get; set; } - - public int NumExperts { get; set; } - public int NumExpertsUsed { get; set; } - public int SlidingWindow { get; set; } - public int OriginalContextLength { get; set; } - - public int HeadDim => KeyLength > 0 ? KeyLength : (ValueLength > 0 ? ValueLength : HiddenSize / NumHeads); - } - public class QuantizedWeight : IDisposable { public IntPtr Data { get; } @@ -109,10 +77,13 @@ public static unsafe void FreeBuffer(IntPtr ptr) } } - public abstract class ModelBase : IDisposable + public abstract class ModelBase : IModelArchitecture { public ModelConfig Config { get; protected set; } public ITokenizer Tokenizer { get; protected set; } + public IKVCachePolicy KVCachePolicy { get; } = DefaultKvCachePolicy.Shared; + public IMultimodalInjector MultimodalInjector { get; } + public IBackendExecutionPlan ExecutionPlan { get; } protected readonly GgufFile _gguf; private readonly GgmlContext _ggmlContext; @@ -124,8 +95,12 @@ public abstract class ModelBase : IDisposable private bool _quantBackendReady; protected int _cacheSeqLen; + protected int _maxContextLength; protected float[] _logitsBuffer; + public int MaxContextLength => _maxContextLength; + public int CacheSeqLen => _cacheSeqLen; + // Timing protected long _linearTicks; protected long 
_attnTicks; @@ -137,6 +112,8 @@ public abstract class ModelBase : IDisposable protected ModelBase(string ggufPath, BackendType backend) { _backend = backend; + ExecutionPlan = new BackendExecutionPlan(backend); + MultimodalInjector = new ModelMultimodalInjector(this); switch (backend) { case BackendType.GgmlCpu: @@ -162,9 +139,7 @@ protected ModelBase(string ggufPath, BackendType backend) _gguf = new GgufFile(ggufPath); } - protected bool IsGgmlBackend => _backend == BackendType.GgmlCpu || - _backend == BackendType.GgmlMetal || - _backend == BackendType.GgmlCuda; + protected bool IsGgmlBackend => ExecutionPlan.UsesGgmlBackend; protected void EnsureQuantBackendAvailable() { @@ -240,7 +215,7 @@ protected void ParseTokenizer() protected virtual bool IsQuantizedLinearWeight(GgufTensorInfo info) { - return ShouldStoreWeightQuantized(_backend, info); + return ExecutionPlan.ShouldStoreWeightQuantized(info); } internal static bool ShouldStoreWeightQuantized(BackendType backend, GgufTensorInfo info) @@ -970,8 +945,10 @@ public static ModelBase Create(string ggufPath, BackendType backend) "gemma4" => new Gemma4Model(ggufPath, backend), "gptoss" or "gpt-oss" => new GptOssModel(ggufPath, backend), "nemotron_h" or "nemotron_h_moe" => new NemotronModel(ggufPath, backend), + "mistral3" => new Mistral3Model(ggufPath, backend), _ => throw new NotSupportedException($"Unsupported architecture: {arch}"), }; } } } + diff --git a/TensorSharp.Models/ModelMultimodalInjector.cs b/TensorSharp.Models/ModelMultimodalInjector.cs new file mode 100644 index 0000000..6f12f46 --- /dev/null +++ b/TensorSharp.Models/ModelMultimodalInjector.cs @@ -0,0 +1,345 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. 
+// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. +using System; +using System.Collections.Generic; +using TensorSharp; + +namespace TensorSharp.Models +{ + internal sealed class ModelMultimodalInjector : IMultimodalInjector + { + private readonly ModelBase _model; + + public ModelMultimodalInjector(ModelBase model) + { + _model = model; + } + + public void LoadProjectors(string mmProjPath) + { + if (string.IsNullOrWhiteSpace(mmProjPath)) + return; + + switch (_model) + { + case Gemma4Model g4: + g4.LoadVisionEncoder(mmProjPath); + g4.LoadAudioEncoder(mmProjPath); + break; + case Gemma3Model g3: + g3.LoadVisionEncoder(mmProjPath); + break; + case Qwen35Model q35: + q35.LoadVisionEncoder(mmProjPath); + break; + case Mistral3Model m3: + m3.LoadVisionEncoder(mmProjPath); + break; + } + } + + public List ProcessPromptTokens(List history, List inputTokens) + { + if (history == null || history.Count == 0 || inputTokens == null || inputTokens.Count == 0) + return inputTokens; + + if (_model is Gemma4Model g4) + return ProcessGemma4History(g4, history, inputTokens); + if (_model is Gemma3Model g3) + return ProcessGemma3History(g3, history, inputTokens); + if (_model is Qwen35Model q35) + return ProcessQwen35History(q35, history, inputTokens); + if (_model is Mistral3Model m3) + return ProcessMistral3History(m3, history, inputTokens); + + return inputTokens; + } + + private List ProcessGemma4History(Gemma4Model model, List history, List inputTokens) + { + int imageStartId = _model.Tokenizer.LookupToken("<|image>"); + int imageEndId = _model.Tokenizer.LookupToken(""); + if (imageStartId < 0) imageStartId = 255999; + if (imageEndId < 0) imageEndId = 256000; + + int audioStartId = _model.Tokenizer.LookupToken("<|audio>"); + int audioEndId = _model.Tokenizer.LookupToken(""); + + var 
imageProcessor = model.VisionEncoder != null ? new Gemma4ImageProcessor() : null; + int searchFrom = 0; + + foreach (var message in history) + { + if (message.ImagePaths != null && model.VisionEncoder != null) + { + foreach (var imagePath in message.ImagePaths) + { + var (pixels, imageWidth, imageHeight) = imageProcessor.ProcessImage(imagePath); + var embeddings = model.VisionEncoder.Encode(pixels, imageWidth, imageHeight); + int tokenCount = (int)embeddings.Sizes[0]; + int tokenPosition = FindTokenPosition(inputTokens, imageStartId, searchFrom); + + if (tokenPosition >= 0) + { + inputTokens = ExpandSingleTokenPlaceholder(inputTokens, tokenPosition, imageStartId, tokenCount, imageEndId); + model.SetVisionEmbeddings(embeddings, tokenPosition + 1); + searchFrom = tokenPosition + tokenCount + 2; + } + else + { + embeddings.Dispose(); + } + } + } + + if (message.AudioPaths != null && model.AudioEncoder != null && audioStartId >= 0 && audioEndId >= 0) + { + foreach (var audioPath in message.AudioPaths) + { + float[] samples = Gemma4AudioPreprocessor.DecodeAudioFile(audioPath); + if (samples.Length % 128 != 0) + { + int padded = samples.Length + (128 - samples.Length % 128); + Array.Resize(ref samples, padded); + } + + var (melData, numFrames) = Gemma4AudioPreprocessor.ComputeMelSpectrogram(samples); + if (melData == null || numFrames == 0) + continue; + + var embeddings = model.AudioEncoder.Encode(melData, numFrames); + int tokenCount = (int)embeddings.Sizes[0]; + int tokenPosition = FindTokenPosition(inputTokens, audioStartId, searchFrom); + + if (tokenPosition >= 0) + { + inputTokens = ExpandSingleTokenPlaceholder(inputTokens, tokenPosition, audioStartId, tokenCount, audioEndId); + model.SetAudioEmbeddings(embeddings, tokenPosition + 1); + searchFrom = tokenPosition + tokenCount + 2; + } + else + { + embeddings.Dispose(); + } + } + } + } + + return inputTokens; + } + + private List ProcessGemma3History(Gemma3Model model, List history, List inputTokens) + { + if 
(model.VisionEncoder == null) + return inputTokens; + + var imagePaths = GetImagePathsInPromptOrder(history); + if (imagePaths.Count == 0) + return inputTokens; + + var processor = new Gemma3ImageProcessor(); + int startId = _model.Tokenizer.LookupToken(""); + if (startId < 0) startId = Gemma3ImageProcessor.StartOfImageToken; + int endId = Gemma3ImageProcessor.EndOfImageToken; + int newlineId = Gemma3ImageProcessor.NewlineNewlineToken; + int padId = Gemma3ImageProcessor.PadToken; + + inputTokens = ChatTemplate.ExpandGemma3ImageTokens( + inputTokens, + startId, + endId, + newlineId, + padId, + processor.TokensPerImage); + + int searchFrom = 0; + foreach (var imagePath in imagePaths) + { + float[] pixels = processor.ProcessImage(imagePath); + var embeddings = model.VisionEncoder.Encode(pixels); + int tokenStart = FindGemma3ImageInsertPosition(inputTokens, startId, padId, searchFrom); + + if (tokenStart >= 0) + { + model.SetVisionEmbeddings(embeddings, tokenStart); + searchFrom = tokenStart + processor.TokensPerImage + 2; + } + else + { + embeddings.Dispose(); + } + } + + return inputTokens; + } + + private List ProcessQwen35History(Qwen35Model model, List history, List inputTokens) + { + if (model.VisionEncoder == null) + return inputTokens; + + var imagePaths = GetImagePathsInPromptOrder(history); + if (imagePaths.Count == 0) + return inputTokens; + + int imagePadId = _model.Tokenizer.LookupToken("<|image_pad|>"); + if (imagePadId < 0) + return inputTokens; + + var processor = new Qwen35ImageProcessor(model.VisionEncoder.PatchSize, model.VisionEncoder.SpatialMergeSize); + var tokenCounts = new int[imagePaths.Count]; + for (int i = 0; i < imagePaths.Count; i++) + { + var (width, height) = Qwen35ImageProcessor.ReadImageDimensions(imagePaths[i]); + tokenCounts[i] = processor.ComputeImageTokenCount(height, width); + } + + inputTokens = ChatTemplate.ExpandImageTokens(inputTokens, imagePadId, tokenCounts); + + int searchFrom = 0; + for (int i = 0; i < imagePaths.Count; 
i++) + { + var (pixels, resizedHeight, resizedWidth) = processor.ProcessImage(imagePaths[i]); + var embeddings = model.VisionEncoder.Encode(pixels, resizedHeight, resizedWidth); + int tokenStart = FindTokenPosition(inputTokens, imagePadId, searchFrom); + + if (tokenStart >= 0) + { + model.SetVisionEmbeddings(embeddings, tokenStart); + searchFrom = tokenStart + tokenCounts[i]; + } + else + { + embeddings.Dispose(); + } + } + + return inputTokens; + } + + private List ProcessMistral3History(Mistral3Model model, List history, List inputTokens) + { + if (model.VisionEncoder == null) + return inputTokens; + + var imagePaths = GetImagePathsInPromptOrder(history); + if (imagePaths.Count == 0) + return inputTokens; + + var processor = new Mistral3ImageProcessor( + model.VisionEncoder.ImageSize, + model.VisionEncoder.PatchSize); + + int searchFrom = 0; + foreach (var imagePath in imagePaths) + { + var (pixels, imgW, imgH) = processor.ProcessImage(imagePath); + var embeddings = model.VisionEncoder.Encode(pixels, imgW, imgH); + int numRows = imgH / model.VisionEncoder.PatchSize / model.VisionEncoder.SpatialMergeSize; + int numCols = imgW / model.VisionEncoder.PatchSize / model.VisionEncoder.SpatialMergeSize; + + // Find [IMG] token position + int tokenPosition = FindTokenPosition(inputTokens, Mistral3ImageProcessor.ImgTokenId, searchFrom); + if (tokenPosition < 0) + { + embeddings.Dispose(); + continue; + } + + // Expand: for each row, insert numCols [IMG] tokens, then [IMG_BREAK] or [IMG_END] + var expanded = new List(); + for (int i = 0; i < tokenPosition; i++) + expanded.Add(inputTokens[i]); + + int embedOffset = 0; + for (int row = 0; row < numRows; row++) + { + for (int col = 0; col < numCols; col++) + { + expanded.Add(Mistral3ImageProcessor.ImgTokenId); + embedOffset++; + } + if (row == numRows - 1) + expanded.Add(Mistral3ImageProcessor.ImgEndTokenId); + else + expanded.Add(Mistral3ImageProcessor.ImgBreakTokenId); + } + + for (int i = tokenPosition + 1; i < 
inputTokens.Count; i++) + expanded.Add(inputTokens[i]); + + model.SetVisionEmbeddings(embeddings, tokenPosition); + inputTokens = expanded; + searchFrom = tokenPosition + numRows * numCols + numRows; + } + + return inputTokens; + } + + private static List GetImagePathsInPromptOrder(List history) + { + var imagePaths = new List(); + if (history == null) + return imagePaths; + + foreach (var message in history) + { + if (message.ImagePaths == null) + continue; + + foreach (var path in message.ImagePaths) + { + if (!string.IsNullOrEmpty(path)) + imagePaths.Add(path); + } + } + + return imagePaths; + } + + private static List ExpandSingleTokenPlaceholder( + List inputTokens, int tokenPosition, int startTokenId, int expandedTokenCount, int endTokenId) + { + var expanded = new List(inputTokens.Count + expandedTokenCount + 1); + for (int i = 0; i < tokenPosition; i++) + expanded.Add(inputTokens[i]); + expanded.Add(startTokenId); + for (int i = 0; i < expandedTokenCount; i++) + expanded.Add(0); + expanded.Add(endTokenId); + for (int i = tokenPosition + 1; i < inputTokens.Count; i++) + expanded.Add(inputTokens[i]); + return expanded; + } + + private static int FindTokenPosition(List tokens, int tokenId, int searchFrom) + { + for (int i = Math.Max(0, searchFrom); i < tokens.Count; i++) + { + if (tokens[i] == tokenId) + return i; + } + + return -1; + } + + private static int FindGemma3ImageInsertPosition(List tokens, int startTokenId, int padTokenId, int searchFrom) + { + for (int i = Math.Max(0, searchFrom); i + 1 < tokens.Count; i++) + { + if (tokens[i] == startTokenId && tokens[i + 1] == padTokenId) + return i + 1; + } + + return -1; + } + } +} + diff --git a/InferenceEngine/Modelfile.fp16 b/TensorSharp.Models/Modelfile.fp16 similarity index 100% rename from InferenceEngine/Modelfile.fp16 rename to TensorSharp.Models/Modelfile.fp16 diff --git a/InferenceEngine/Models/Gemma3/Gemma3Model.cs b/TensorSharp.Models/Models/Gemma3/Gemma3Model.cs similarity index 99% rename from 
InferenceEngine/Models/Gemma3/Gemma3Model.cs rename to TensorSharp.Models/Models/Gemma3/Gemma3Model.cs index 1d73df4..4077572 100644 --- a/InferenceEngine/Models/Gemma3/Gemma3Model.cs +++ b/TensorSharp.Models/Models/Gemma3/Gemma3Model.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -13,7 +13,7 @@ using TensorSharp; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { /// /// Gemma 3 model architecture. @@ -111,6 +111,7 @@ private void PrecomputeRoPE() private void InitKVCache(int maxSeqLen) { + _maxContextLength = maxSeqLen; _kvCacheK = new Tensor[Config.NumLayers]; _kvCacheV = new Tensor[Config.NumLayers]; @@ -669,3 +670,4 @@ public override void Dispose() } } } + diff --git a/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs b/TensorSharp.Models/Models/Gemma3/Gemma3VisionEncoder.cs similarity index 99% rename from InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs rename to TensorSharp.Models/Models/Gemma3/Gemma3VisionEncoder.cs index 389744b..8785950 100644 --- a/InferenceEngine/Models/Gemma3/Gemma3VisionEncoder.cs +++ b/TensorSharp.Models/Models/Gemma3/Gemma3VisionEncoder.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -13,7 +13,7 @@ using TensorSharp.Cpu; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { public class Gemma3VisionEncoder : IDisposable { @@ -416,3 +416,4 @@ public void Dispose() } } } + diff --git a/InferenceEngine/Models/Gemma3/ImageProcessor.cs b/TensorSharp.Models/Models/Gemma3/ImageProcessor.cs similarity index 99% rename from InferenceEngine/Models/Gemma3/ImageProcessor.cs rename to TensorSharp.Models/Models/Gemma3/ImageProcessor.cs index bfea928..1cc0243 100644 --- a/InferenceEngine/Models/Gemma3/ImageProcessor.cs +++ b/TensorSharp.Models/Models/Gemma3/ImageProcessor.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -12,7 +12,7 @@ using System.Threading.Tasks; using StbImageSharp; -namespace InferenceEngine +namespace TensorSharp.Models { public class Gemma3ImageProcessor { @@ -391,3 +391,4 @@ private float[] PackChannelFirst(byte[] rgba, int width, int height) } } } + diff --git a/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs b/TensorSharp.Models/Models/Gemma4/Gemma4AudioEncoder.cs similarity index 99% rename from InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs rename to TensorSharp.Models/Models/Gemma4/Gemma4AudioEncoder.cs index 51e5280..f5b9435 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4AudioEncoder.cs +++ b/TensorSharp.Models/Models/Gemma4/Gemma4AudioEncoder.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -13,7 +13,7 @@ using TensorSharp.Cpu; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { public class Gemma4AudioEncoder : IDisposable { @@ -892,3 +892,4 @@ public void Dispose() } } } + diff --git a/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs b/TensorSharp.Models/Models/Gemma4/Gemma4AudioPreprocessor.cs similarity index 99% rename from InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs rename to TensorSharp.Models/Models/Gemma4/Gemma4AudioPreprocessor.cs index 325d53e..661232f 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4AudioPreprocessor.cs +++ b/TensorSharp.Models/Models/Gemma4/Gemma4AudioPreprocessor.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -14,7 +14,7 @@ using NLayer; using NVorbis; -namespace InferenceEngine +namespace TensorSharp.Models { public class Gemma4AudioPreprocessor { @@ -363,3 +363,4 @@ private static double[] BuildWindow() } } } + diff --git a/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs b/TensorSharp.Models/Models/Gemma4/Gemma4ImageProcessor.cs similarity index 97% rename from InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs rename to TensorSharp.Models/Models/Gemma4/Gemma4ImageProcessor.cs index 840abda..a3d759c 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4ImageProcessor.cs +++ b/TensorSharp.Models/Models/Gemma4/Gemma4ImageProcessor.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -10,7 +10,7 @@ using System; using System.IO; -namespace InferenceEngine +namespace TensorSharp.Models { public class Gemma4ImageProcessor { @@ -96,3 +96,4 @@ public int ComputeOutputTokens(int imageWidth, int imageHeight) } } } + diff --git a/InferenceEngine/Models/Gemma4/Gemma4Model.cs b/TensorSharp.Models/Models/Gemma4/Gemma4Model.cs similarity index 97% rename from InferenceEngine/Models/Gemma4/Gemma4Model.cs rename to TensorSharp.Models/Models/Gemma4/Gemma4Model.cs index 90c5b10..b88d477 100644 --- a/InferenceEngine/Models/Gemma4/Gemma4Model.cs +++ b/TensorSharp.Models/Models/Gemma4/Gemma4Model.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -14,7 +14,7 @@ using TensorSharp; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { /// /// Gemma 4 model architecture. @@ -320,6 +320,7 @@ private void PrecomputeRoPE() private void InitKVCache(int maxSeqLen) { + _maxContextLength = maxSeqLen; _kvCacheK = new Tensor[Config.NumLayers]; _kvCacheV = new Tensor[Config.NumLayers]; _kvCacheSize = new int[Config.NumLayers]; @@ -400,10 +401,16 @@ public override float[] Forward(int[] tokens) ScaleEmbedding(hidden); + HashSet exceptPositions = null; + if (_pendingVisionEmbeddingsList.Count > 0) { + exceptPositions = new HashSet(); foreach (var (emb, pos) in _pendingVisionEmbeddingsList) { + int numTokens = (int)emb.Sizes[0]; + for (int i = 0; i < numTokens; i++) + exceptPositions.Add(pos + i); InjectVisionEmbeddings(hidden, emb, pos); emb.Dispose(); } @@ -412,8 +419,12 @@ public override float[] Forward(int[] tokens) if (_pendingAudioEmbeddingsList.Count > 0) { + exceptPositions ??= new HashSet(); foreach (var (emb, pos) in _pendingAudioEmbeddingsList) { + int numTokens = (int)emb.Sizes[0]; + for (int i = 0; i < numTokens; i++) + exceptPositions.Add(pos + i); 
InjectVisionEmbeddings(hidden, emb, pos); emb.Dispose(); } @@ -442,7 +453,7 @@ public override float[] Forward(int[] tokens) perLayerInput = ExtractPerLayerSlice(perLayerInputs, l, seqLen); bool isShared = _kvDonorMap.ContainsKey(l); - hidden = TransformerBlock(hidden, l, seqLen, startPos, isShared, perLayerInput); + hidden = TransformerBlock(hidden, l, seqLen, startPos, isShared, perLayerInput, exceptPositions); perLayerInput?.Dispose(); } @@ -901,13 +912,13 @@ private bool HasMoE(int layer) } private Tensor TransformerBlock(Tensor hidden, int layer, int seqLen, int startPos, - bool isShared, Tensor perLayerInput) + bool isShared, Tensor perLayerInput, HashSet exceptPositions = null) { string prefix = $"blk.{layer}"; using var attnNormed = RMSNormOp(hidden, $"{prefix}.attn_norm.weight"); - using var attnOut = Attention(attnNormed, layer, prefix, seqLen, startPos, isShared); + using var attnOut = Attention(attnNormed, layer, prefix, seqLen, startPos, isShared, exceptPositions); using var postAttnNormed = RMSNormOp(attnOut, $"{prefix}.post_attention_norm.weight"); @@ -1305,7 +1316,7 @@ private Tensor FFNGelu(Tensor input, string gateUpWeightName, string downWeightN #region Attention - private Tensor Attention(Tensor input, int layer, string prefix, int seqLen, int startPos, bool isShared) + private Tensor Attention(Tensor input, int layer, string prefix, int seqLen, int startPos, bool isShared, HashSet exceptPositions = null) { long t0 = Stopwatch.GetTimestamp(); bool isLocal = IsLocalLayer(layer); @@ -1504,7 +1515,8 @@ private Tensor Attention(Tensor input, int layer, string prefix, int seqLen, int kExpanded.Dispose(); int windowSize = isLocal ? _slidingWindow : 0; - ApplyCausalMask(scores, seqLen, kvLen, windowSize); + HashSet maskExcept = isLocal ? 
null : exceptPositions; + ApplyCausalMask(scores, seqLen, kvLen, windowSize, maskExcept); Ops.Softmax(scores, scores); var attnOut = new Tensor(_allocator, DType.Float32, Config.NumHeads, seqLen, hd); @@ -1756,10 +1768,38 @@ private unsafe void CopyToCacheCircular(Tensor cache, Tensor src, int startPos, InvalidateTensorDeviceCache(cache); } - private unsafe void ApplyCausalMask(Tensor scores, int queryLen, int totalKVLen, int windowSize) + private unsafe void ApplyCausalMask(Tensor scores, int queryLen, int totalKVLen, int windowSize, + HashSet exceptPositions = null) { int startPos = totalKVLen - queryLen; - Ops.AddCausalMask(scores, queryLen, startPos, float.NegativeInfinity); + + if (exceptPositions != null && exceptPositions.Count > 0) + { + float* sPtr = GetFloatPtr(scores); + int numHeads = (int)scores.Sizes[0]; + int rowStride = queryLen * totalKVLen; + + for (int h = 0; h < numHeads; h++) + { + float* headScores = sPtr + h * rowStride; + for (int q = 0; q < queryLen; q++) + { + int queryAbsPos = startPos + q; + bool queryIsExcept = exceptPositions.Contains(queryAbsPos); + float* row = headScores + q * totalKVLen; + for (int kv = queryAbsPos + 1; kv < totalKVLen; kv++) + { + if (!queryIsExcept && !exceptPositions.Contains(kv)) + row[kv] = float.NegativeInfinity; + } + } + } + InvalidateTensorDeviceCache(scores); + } + else + { + Ops.AddCausalMask(scores, queryLen, startPos, float.NegativeInfinity); + } if (windowSize > 0) { @@ -1782,6 +1822,7 @@ private unsafe void ApplyCausalMask(Tensor scores, int queryLen, int totalKVLen, } } } + InvalidateTensorDeviceCache(scores); } } @@ -1814,3 +1855,4 @@ public override void Dispose() } } } + diff --git a/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs b/TensorSharp.Models/Models/Gemma4/Gemma4VisionEncoder.cs similarity index 97% rename from InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs rename to TensorSharp.Models/Models/Gemma4/Gemma4VisionEncoder.cs index 304529f..4e7e88d 100644 --- 
a/InferenceEngine/Models/Gemma4/Gemma4VisionEncoder.cs +++ b/TensorSharp.Models/Models/Gemma4/Gemma4VisionEncoder.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -13,7 +13,7 @@ using TensorSharp.Cpu; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { public class Gemma4VisionEncoder : IDisposable { @@ -241,6 +241,10 @@ private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, int hea var result = new Tensor(_allocator, DType.Float32, postAttnNormed.Sizes); Ops.Add(result, postAttnNormed, postFfnNormed); + string scaleKey = $"v.blk.{blockIdx}.out_scale.weight"; + if (_weights.TryGetValue(scaleKey, out var scaleTensor)) + Ops.Mul(result, result, scaleTensor); + return result; } @@ -461,6 +465,14 @@ private unsafe Tensor PoolAndProject(Tensor visionOutput, int patchesX, int patc float scale = MathF.Sqrt(_hiddenSize); Ops.Mul(pooled, pooled, scale); + // Vision standardization before projection (matches Ollama) + if (_weights.TryGetValue("v.std_bias", out var stdBias) && + _weights.TryGetValue("v.std_scale", out var stdScale)) + { + Ops.Sub(pooled, pooled, stdBias); + Ops.Mul(pooled, pooled, stdScale); + } + // Project to text dimension + unweighted RMSNorm var projected = LinearProjection(pooled, "mm.input_projection.weight"); pooled.Dispose(); @@ -579,3 +591,4 @@ public void Dispose() } } } + diff --git a/InferenceEngine/Models/GptOss/GptOssModel.cs b/TensorSharp.Models/Models/GptOss/GptOssModel.cs similarity index 99% rename from InferenceEngine/Models/GptOss/GptOssModel.cs rename to TensorSharp.Models/Models/GptOss/GptOssModel.cs index 1471463..03a02f6 100644 --- a/InferenceEngine/Models/GptOss/GptOssModel.cs +++ b/TensorSharp.Models/Models/GptOss/GptOssModel.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. 
All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -15,7 +15,7 @@ using TensorSharp; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { /// /// GPT OSS (Mixture-of-Experts) transformer model. @@ -337,6 +337,7 @@ private void PrecomputeConstants() private void InitKVCache(int maxSeqLen) { + _maxContextLength = maxSeqLen; int numKVHeads = Config.NumKVHeads; int headDim = Config.HeadDim; _kvCacheK = new Tensor[Config.NumLayers]; @@ -1014,3 +1015,4 @@ public override void Dispose() } } } + diff --git a/TensorSharp.Models/Models/Mistral3/Mistral3ImageProcessor.cs b/TensorSharp.Models/Models/Mistral3/Mistral3ImageProcessor.cs new file mode 100644 index 0000000..131b410 --- /dev/null +++ b/TensorSharp.Models/Models/Mistral3/Mistral3ImageProcessor.cs @@ -0,0 +1,148 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. +using System; +using System.IO; +using System.Threading.Tasks; + +namespace TensorSharp.Models +{ + /// + /// Image processor for Mistral 3 / Pixtral models. + /// Processing pipeline: + /// 1. Composite transparent images over white background + /// 2. Resize to fit longest_edge while preserving aspect ratio + /// 3. Pad to be divisible by patch_size + /// 4. 
Normalize with CLIP default mean/std + /// + public class Mistral3ImageProcessor + { + public int ImageSize { get; } + public int PatchSize { get; } + public int NumChannels { get; } + public int LongestEdge { get; } + + // CLIP default normalization parameters + private static readonly float[] ClipMean = { 0.48145466f, 0.4578275f, 0.40821073f }; + private static readonly float[] ClipStd = { 0.26862954f, 0.26130258f, 0.27577711f }; + + // Special token IDs for Mistral 3 vision + public const int ImgTokenId = 10; + public const int ImgBreakTokenId = 12; + public const int ImgEndTokenId = 13; + + public Mistral3ImageProcessor(int imageSize = 1540, int patchSize = 14, + int numChannels = 3, int longestEdge = 1540) + { + ImageSize = imageSize; + PatchSize = patchSize; + NumChannels = numChannels; + LongestEdge = longestEdge; + } + + /// + /// Process an image file for Mistral 3 vision encoder. + /// Returns (pixelValues, finalWidth, finalHeight). + /// pixelValues is in channel-first format [C, H, W], normalized with CLIP mean/std. 
+ /// + public (float[] pixels, int width, int height) ProcessImage(string imagePath) + { + byte[] fileBytes = File.ReadAllBytes(imagePath); + byte[] rgba = Gemma3ImageProcessor.DecodeImageToRGBA(fileBytes, out int origWidth, out int origHeight); + + // Composite over white background + rgba = Gemma3ImageProcessor.CompositeOverWhite(rgba, origWidth, origHeight); + + // Resize to fit longest_edge + double ratio = Math.Max((double)origHeight / LongestEdge, (double)origWidth / LongestEdge); + int newWidth = origWidth, newHeight = origHeight; + if (ratio > 1.0) + { + newWidth = (int)Math.Floor(origWidth / ratio); + newHeight = (int)Math.Floor(origHeight / ratio); + } + + // Pad to be divisible by patch_size + int patchesX = (newWidth - 1) / PatchSize + 1; + int patchesY = (newHeight - 1) / PatchSize + 1; + int finalWidth = patchesX * PatchSize; + int finalHeight = patchesY * PatchSize; + + // Resize and normalize + float[] pixels = ResizeAndNormalize(rgba, origWidth, origHeight, finalWidth, finalHeight); + + Console.WriteLine($"Mistral3 image: {origWidth}x{origHeight} → {finalWidth}x{finalHeight} " + + $"({patchesX}x{patchesY} patches)"); + + return (pixels, finalWidth, finalHeight); + } + + /// + /// Bilinear resize + CLIP normalization in a single pass. + /// Output is channel-first: [R..., G..., B...]. 
+ /// + private float[] ResizeAndNormalize(byte[] rgba, int srcW, int srcH, int dstW, int dstH) + { + int pixels = dstW * dstH; + float[] result = new float[3 * pixels]; + double xRatio = (double)srcW / dstW; + double yRatio = (double)srcH / dstH; + + Parallel.For(0, dstH, dy => + { + double srcY = (dy + 0.5) * yRatio - 0.5; + int y0 = Math.Max(0, (int)srcY); + int y1 = Math.Min(srcH - 1, y0 + 1); + double fy = srcY - y0; + + for (int dx = 0; dx < dstW; dx++) + { + double srcX = (dx + 0.5) * xRatio - 0.5; + int x0 = Math.Max(0, (int)srcX); + int x1 = Math.Min(srcW - 1, x0 + 1); + double fx = srcX - x0; + + int dstIdx = dy * dstW + dx; + + for (int c = 0; c < 3; c++) + { + double v00 = rgba[(y0 * srcW + x0) * 4 + c] / 255.0; + double v01 = rgba[(y0 * srcW + x1) * 4 + c] / 255.0; + double v10 = rgba[(y1 * srcW + x0) * 4 + c] / 255.0; + double v11 = rgba[(y1 * srcW + x1) * 4 + c] / 255.0; + + double v = v00 * (1 - fx) * (1 - fy) + v01 * fx * (1 - fy) + + v10 * (1 - fx) * fy + v11 * fx * fy; + + result[c * pixels + dstIdx] = (float)((v - ClipMean[c]) / ClipStd[c]); + } + } + }); + + return result; + } + + /// + /// Compute the number of vision tokens for a processed image. + /// After patch merging, tokens = (patchesW / mergeSize) * (patchesH / mergeSize). + /// Each row becomes [IMG]...[IMG] tokens, rows separated by [IMG_BREAK], ending with [IMG_END]. 
+ /// + public int ComputeVisionTokenCount(int imageWidth, int imageHeight, int spatialMergeSize) + { + int patchesW = imageWidth / PatchSize; + int patchesH = imageHeight / PatchSize; + int mergedW = patchesW / spatialMergeSize; + int mergedH = patchesH / spatialMergeSize; + + // mergedH rows of mergedW [IMG] tokens each + // Plus (mergedH - 1) [IMG_BREAK] tokens and 1 [IMG_END] token + return mergedW * mergedH + mergedH; + } + } +} diff --git a/TensorSharp.Models/Models/Mistral3/Mistral3Model.cs b/TensorSharp.Models/Models/Mistral3/Mistral3Model.cs new file mode 100644 index 0000000..5a2435f --- /dev/null +++ b/TensorSharp.Models/Models/Mistral3/Mistral3Model.cs @@ -0,0 +1,632 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. +using System; +using System.Collections.Generic; +using System.Diagnostics; +using TensorSharp; +using TensorSharp.GGML; + +namespace TensorSharp.Models +{ + /// + /// Mistral 3 model architecture. 
+ /// Key features: + /// - Standard LLaMA-like transformer with SiLU-gated MLP (SwiGLU) + /// - GPT-J (norm) style RoPE with YaRN scaling for extended context + /// - Position-dependent Q scaling: q *= (1 + beta * log(1 + floor(pos / orig_ctx))) + /// - No QK-norm (unlike Qwen3/Gemma3) + /// - Supports multimodal (vision) via separate Pixtral vision encoder + /// + public class Mistral3Model : ModelBase + { + private Tensor[] _kvCacheK; + private Tensor[] _kvCacheV; + + private string[][] _layerWeightNames; + private float[] _ropeFreqs; + private int _ropeDim; + private int _attnKeyLen; + private int _attnValLen; + + // YaRN scaling parameters + private float _ropeScalingBeta; + private int _ropeOrigCtx; + private float _ropeExtFactor; + private float _ropeBetaFast; + private float _ropeBetaSlow; + private float _ropeMscale; + private float _ropeMscaleAllDim; + private string _ropeType; + + // Vision support + private Mistral3VisionEncoder _visionEncoder; + private List<(Tensor embeddings, int position)> _pendingVisionEmbeddingsList = new(); + + public Mistral3Model(string ggufPath, BackendType backend) + : base(ggufPath, backend) + { + string arch = _gguf.GetString("general.architecture") ?? "mistral3"; + Config = new ModelConfig { Architecture = arch }; + ParseBaseConfig(); + + _attnKeyLen = Config.KeyLength > 0 ? Config.KeyLength : Config.HeadDim; + _attnValLen = Config.ValueLength > 0 ? 
Config.ValueLength : _attnKeyLen; + _ropeDim = (int)_gguf.GetUint32($"{arch}.rope.dimension_count", (uint)_attnKeyLen); + + // YaRN parameters + _ropeType = _gguf.GetString($"{arch}.rope.scaling.type", ""); + _ropeScalingBeta = _gguf.GetFloat32($"{arch}.attention.temperature_scale", + _gguf.GetFloat32($"{arch}.rope.scaling_beta", 0.1f)); + _ropeOrigCtx = (int)_gguf.GetUint32($"{arch}.rope.scaling.original_context_length", 0); + Config.OriginalContextLength = _ropeOrigCtx; + _ropeExtFactor = _gguf.GetFloat32($"{arch}.rope.scaling.extrapolation_factor", 1.0f); + _ropeBetaFast = _gguf.GetFloat32($"{arch}.rope.scaling.yarn_beta_fast", + _gguf.GetFloat32($"{arch}.rope.scaling.beta_fast", 32.0f)); + _ropeBetaSlow = _gguf.GetFloat32($"{arch}.rope.scaling.yarn_beta_slow", + _gguf.GetFloat32($"{arch}.rope.scaling.beta_slow", 1.0f)); + _ropeMscale = _gguf.GetFloat32($"{arch}.rope.scaling.mscale", 0f); + _ropeMscaleAllDim = _gguf.GetFloat32($"{arch}.rope.scaling.mscale_all_dim", 0f); + + Console.WriteLine($"Model: {arch}, Layers={Config.NumLayers}, Hidden={Config.HiddenSize}, " + + $"Heads={Config.NumHeads}, KVHeads={Config.NumKVHeads}, KeyLen={_attnKeyLen}, " + + $"ValLen={_attnValLen}, Vocab={Config.VocabSize}"); + Console.WriteLine($"RoPE base={Config.RopeBase}, scale={Config.RopeScale}, type={_ropeType}, " + + $"dim={_ropeDim}, origCtx={_ropeOrigCtx}"); + if (_ropeType == "yarn") + Console.WriteLine($"YaRN beta={_ropeScalingBeta}, betaFast={_ropeBetaFast}, " + + $"betaSlow={_ropeBetaSlow}, extFactor={_ropeExtFactor}"); + + ParseTokenizer(); + LoadWeights(); + FuseQKVWeights(); + FuseGateUpWeights(); + + int maxCtx = 4096; + string ctxEnv = Environment.GetEnvironmentVariable("MAX_CONTEXT"); + if (!string.IsNullOrEmpty(ctxEnv) && int.TryParse(ctxEnv, out int envCtx) && envCtx > 0) + maxCtx = envCtx; + + InitKVCache(maxCtx); + PrecomputeConstants(); + } + + private unsafe void FuseQKVWeights() + { + int fused = 0; + for (int l = 0; l < Config.NumLayers; l++) + { + string 
qName = $"blk.{l}.attn_q.weight"; + string kName = $"blk.{l}.attn_k.weight"; + string vName = $"blk.{l}.attn_v.weight"; + string qkvName = $"blk.{l}.attn_qkv.weight"; + + if (_quantWeights.TryGetValue(qName, out var qw) && + _quantWeights.TryGetValue(kName, out var kw) && + _quantWeights.TryGetValue(vName, out var vw) && + qw.GgmlType == kw.GgmlType && kw.GgmlType == vw.GgmlType && + qw.Ne0 == kw.Ne0 && kw.Ne0 == vw.Ne0) + { + long totalBytes = qw.RawBytes + kw.RawBytes + vw.RawBytes; + IntPtr fusedPtr = QuantizedWeight.AllocateBuffer(totalBytes); + Buffer.MemoryCopy(qw.Data.ToPointer(), fusedPtr.ToPointer(), totalBytes, qw.RawBytes); + Buffer.MemoryCopy(kw.Data.ToPointer(), (fusedPtr + (int)qw.RawBytes).ToPointer(), totalBytes - qw.RawBytes, kw.RawBytes); + Buffer.MemoryCopy(vw.Data.ToPointer(), (fusedPtr + (int)(qw.RawBytes + kw.RawBytes)).ToPointer(), totalBytes - qw.RawBytes - kw.RawBytes, vw.RawBytes); + _quantWeights[qkvName] = new QuantizedWeight(fusedPtr, totalBytes, qw.GgmlType, qw.Ne0, qw.Ne1 + kw.Ne1 + vw.Ne1); + _quantWeights.Remove(qName); qw.Dispose(); + _quantWeights.Remove(kName); kw.Dispose(); + _quantWeights.Remove(vName); vw.Dispose(); + fused++; + } + else if (_weights.TryGetValue(qName, out var qf) && + _weights.TryGetValue(kName, out var kf) && + _weights.TryGetValue(vName, out var vf)) + { + int qDim = (int)qf.Sizes[0], kDim = (int)kf.Sizes[0], vDim = (int)vf.Sizes[0]; + int inDim = (int)qf.Sizes[1]; + var fusedTensor = new Tensor(_allocator, DType.Float32, qDim + kDim + vDim, inDim); + using (var s0 = fusedTensor.Narrow(0, 0, qDim)) Ops.Copy(s0, qf); + using (var s1 = fusedTensor.Narrow(0, qDim, kDim)) Ops.Copy(s1, kf); + using (var s2 = fusedTensor.Narrow(0, qDim + kDim, vDim)) Ops.Copy(s2, vf); + _weights[qkvName] = fusedTensor; + _weights.Remove(qName); qf.Dispose(); + _weights.Remove(kName); kf.Dispose(); + _weights.Remove(vName); vf.Dispose(); + fused++; + } + } + if (fused > 0) + Console.WriteLine($" Fused projections: {fused} QKV"); + 
} + + private bool[] _layerQkvFused; + + private void PrecomputeConstants() + { + int numLayers = Config.NumLayers; + _layerQkvFused = new bool[numLayers]; + + _layerWeightNames = new string[numLayers][]; + for (int l = 0; l < numLayers; l++) + { + string p = $"blk.{l}."; + bool fused = _quantWeights.ContainsKey(p + "attn_qkv.weight") || + _weights.ContainsKey(p + "attn_qkv.weight"); + _layerQkvFused[l] = fused; + + if (fused) + { + _layerWeightNames[l] = new[] + { + p + "attn_norm.weight", // 0 + p + "attn_qkv.weight", // 1 + p + "attn_output.weight", // 2 + p + "ffn_norm.weight", // 3 + p + "ffn_gate_up.weight", // 4 + p + "ffn_down.weight", // 5 + }; + } + else + { + _layerWeightNames[l] = new[] + { + p + "attn_norm.weight", // 0 + p + "attn_q.weight", // 1 + p + "attn_k.weight", // 2 + p + "attn_v.weight", // 3 + p + "attn_output.weight", // 4 + p + "ffn_norm.weight", // 5 + p + "ffn_gate_up.weight", // 6 + p + "ffn_down.weight", // 7 + }; + } + } + + int halfDim = _ropeDim / 2; + float freqScale = 1.0f / Config.RopeScale; + _ropeFreqs = new float[halfDim]; + for (int i = 0; i < halfDim; i++) + _ropeFreqs[i] = freqScale / MathF.Pow(Config.RopeBase, (2.0f * i) / _ropeDim); + + if (_ropeType == "yarn" && _ropeOrigCtx > 0) + ApplyYarnFreqCorrection(_ropeFreqs, halfDim); + } + + /// + /// Apply YaRN frequency correction to precomputed RoPE frequencies for decode path. + /// Interpolates between extrapolated and interpolated frequencies based on + /// whether each frequency band is within the "slow" or "fast" rotation range. 
+ /// + private void ApplyYarnFreqCorrection(float[] freqs, int halfDim) + { + float lowFreqWavelen = (float)(_ropeOrigCtx / _ropeBetaSlow); + float highFreqWavelen = (float)(_ropeOrigCtx / _ropeBetaFast); + + for (int i = 0; i < halfDim; i++) + { + float origFreq = 1.0f / MathF.Pow(Config.RopeBase, (2.0f * i) / _ropeDim); + float wavelen = 2.0f * MathF.PI / origFreq; + + if (wavelen < highFreqWavelen) + { + // High frequency: use original frequency (extrapolation) + freqs[i] = origFreq; + } + else if (wavelen > lowFreqWavelen) + { + // Low frequency: use interpolated frequency + freqs[i] = origFreq / Config.RopeScale; + } + else + { + // Intermediate: smooth blend between interpolated and extrapolated + float smooth = (lowFreqWavelen / wavelen - 1.0f) / + (lowFreqWavelen / highFreqWavelen - 1.0f); + float interpFreq = origFreq / Config.RopeScale; + freqs[i] = (1.0f - smooth) * interpFreq + smooth * origFreq; + } + } + } + + private void InitKVCache(int maxSeqLen) + { + _maxContextLength = maxSeqLen; + int numKVHeads = Config.NumKVHeads; + _kvCacheK = new Tensor[Config.NumLayers]; + _kvCacheV = new Tensor[Config.NumLayers]; + for (int l = 0; l < Config.NumLayers; l++) + { + _kvCacheK[l] = new Tensor(_allocator, DType.Float32, numKVHeads, maxSeqLen, _attnKeyLen); + _kvCacheV[l] = new Tensor(_allocator, DType.Float32, numKVHeads, maxSeqLen, _attnValLen); + Ops.Fill(_kvCacheK[l], 0); + Ops.Fill(_kvCacheV[l], 0); + } + _cacheSeqLen = 0; + } + + public override void ResetKVCache() + { + for (int l = 0; l < Config.NumLayers; l++) + { + Ops.Fill(_kvCacheK[l], 0); + Ops.Fill(_kvCacheV[l], 0); + InvalidateTensorDeviceCache(_kvCacheK[l]); + InvalidateTensorDeviceCache(_kvCacheV[l]); + } + _cacheSeqLen = 0; + _linearTicks = _attnTicks = _normTicks = _embTicks = _lmHeadTicks = _logitsCopyTicks = 0; + _forwardCount = 0; + _forwardSw.Reset(); + } + + public override void TruncateKVCache(int tokenCount) + { + base.TruncateKVCache(tokenCount); + for (int l = 0; l < 
Config.NumLayers; l++) + { + InvalidateTensorDeviceCache(_kvCacheK[l]); + InvalidateTensorDeviceCache(_kvCacheV[l]); + } + } + + // Vision support + public void LoadVisionEncoder(string mmProjPath) + { + _visionEncoder = new Mistral3VisionEncoder(mmProjPath, _allocator); + } + + public void SetVisionEmbeddings(Tensor embeddings, int insertPosition) + { + _pendingVisionEmbeddingsList.Add((embeddings, insertPosition)); + } + + public Mistral3VisionEncoder VisionEncoder => _visionEncoder; + + public override float[] Forward(int[] tokens) + { + _forwardSw.Start(); + int seqLen = tokens.Length; + int startPos = _cacheSeqLen; + + long t1 = Stopwatch.GetTimestamp(); + Tensor hidden = Embedding(tokens); + _embTicks += Stopwatch.GetTimestamp() - t1; + + if (_pendingVisionEmbeddingsList.Count > 0) + { + foreach (var (embeddings, position) in _pendingVisionEmbeddingsList) + { + InjectVisionEmbeddings(hidden, embeddings, position); + embeddings.Dispose(); + } + _pendingVisionEmbeddingsList.Clear(); + } + + for (int layer = 0; layer < Config.NumLayers; layer++) + { + hidden = TransformerBlock(hidden, layer, seqLen, startPos); + } + + Tensor normed = RMSNormOp(hidden, "output_norm.weight"); + hidden.Dispose(); + + Tensor lastHidden; + if (seqLen > 1) + { + using var narrowed = normed.Narrow(0, seqLen - 1, 1); + lastHidden = Ops.NewContiguous(narrowed); + } + else + { + lastHidden = normed.CopyRef(); + } + normed.Dispose(); + + long t2 = Stopwatch.GetTimestamp(); + Tensor logitsTensor = LinearForward(lastHidden, "output.weight"); + if (logitsTensor == null) + logitsTensor = LinearForward(lastHidden, "token_embd.weight"); + _lmHeadTicks += Stopwatch.GetTimestamp() - t2; + lastHidden.Dispose(); + + long t3 = Stopwatch.GetTimestamp(); + _logitsBuffer = TensorToFloatArray(logitsTensor); + _logitsCopyTicks += Stopwatch.GetTimestamp() - t3; + logitsTensor.Dispose(); + + _cacheSeqLen += seqLen; + _forwardCount++; + _forwardSw.Stop(); + return _logitsBuffer; + } + + private unsafe void 
InjectVisionEmbeddings(Tensor hidden, Tensor visionEmbeddings, int insertPos) + { + int numVisionTokens = (int)visionEmbeddings.Sizes[0]; + int dim = Config.HiddenSize; + float* hPtr = GetFloatPtr(hidden); + float* vPtr = GetFloatPtr(visionEmbeddings); + + for (int t = 0; t < numVisionTokens; t++) + { + float* dst = hPtr + (long)(insertPos + t) * dim; + float* src = vPtr + (long)t * dim; + Buffer.MemoryCopy(src, dst, dim * sizeof(float), dim * sizeof(float)); + } + + Console.WriteLine($"Injected {numVisionTokens} vision tokens at position {insertPos}"); + } + + private Tensor TransformerBlock(Tensor hidden, int layer, int seqLen, int startPos) + { + string[] wn = _layerWeightNames[layer]; + + bool fused = _layerQkvFused[layer]; + int normIdx = 0; + int ffnNormIdx = fused ? 3 : 5; + int gateUpIdx = fused ? 4 : 6; + int downIdx = fused ? 5 : 7; + + Tensor normed = RMSNormOp(hidden, wn[normIdx]); + Tensor attnOut = Attention(normed, layer, wn, seqLen, startPos); + normed.Dispose(); + + Ops.Add(hidden, hidden, attnOut); + attnOut.Dispose(); + + Tensor normed2 = RMSNormOp(hidden, wn[ffnNormIdx]); + Tensor ffnOut = FFN(normed2, wn[gateUpIdx], wn[downIdx], seqLen); + normed2.Dispose(); + + Ops.Add(hidden, hidden, ffnOut); + ffnOut.Dispose(); + + return hidden; + } + + private Tensor Attention(Tensor input, int layer, string[] wn, int seqLen, int startPos) + { + int numHeads = Config.NumHeads; + int numKVHeads = Config.NumKVHeads; + int headDim = _attnKeyLen; + int qDim = numHeads * headDim; + int kDim = numKVHeads * headDim; + int totalSeqLen = startPos + seqLen; + float scale = 1.0f / MathF.Sqrt(headDim); + + Tensor qTensor, kTensor, vTensor; + + bool layerFused = _layerQkvFused[layer]; + if (layerFused) + { + Tensor qkvFused = LinearForward(input, wn[1]); + if (seqLen == 1) + { + qTensor = qkvFused.Narrow(1, 0, qDim); + kTensor = qkvFused.Narrow(1, qDim, kDim); + vTensor = qkvFused.Narrow(1, qDim + kDim, kDim); + qkvFused.Dispose(); + } + else + { + using (var qView = 
qkvFused.Narrow(1, 0, qDim)) + qTensor = Ops.NewContiguous(qView); + using (var kView = qkvFused.Narrow(1, qDim, kDim)) + kTensor = Ops.NewContiguous(kView); + using (var vView = qkvFused.Narrow(1, qDim + kDim, kDim)) + vTensor = Ops.NewContiguous(vView); + qkvFused.Dispose(); + } + } + else + { + qTensor = LinearForward(input, wn[1]); // attn_q + kTensor = LinearForward(input, wn[2]); // attn_k + vTensor = LinearForward(input, wn[3]); // attn_v + } + + if (seqLen == 1) + { + ApplyRoPEDecode(qTensor, numHeads, headDim, startPos); + ApplyRoPEDecode(kTensor, numKVHeads, headDim, startPos); + + // Position-dependent Q scaling for YaRN + if (_ropeOrigCtx > 0) + ApplyPositionScale(qTensor, numHeads * headDim, startPos); + } + else + { + qTensor = ApplyRoPEPrefill(qTensor, numHeads, headDim, seqLen, startPos); + kTensor = ApplyRoPEPrefill(kTensor, numKVHeads, headDim, seqLen, startPos); + + // Position-dependent Q scaling for YaRN + if (_ropeOrigCtx > 0) + ApplyPositionScalePrefill(qTensor, numHeads, headDim, seqLen, startPos); + } + + long t0 = Stopwatch.GetTimestamp(); + + if (seqLen == 1) + { + CopyToCacheDecode(_kvCacheK[layer], kTensor, _kvCacheV[layer], vTensor, + numKVHeads, headDim, startPos); + kTensor.Dispose(); + vTensor.Dispose(); + + var attnResult = new Tensor(_allocator, DType.Float32, 1, numHeads * headDim); + AttentionDecodePureCS(qTensor, _kvCacheK[layer], _kvCacheV[layer], + attnResult, numHeads, numKVHeads, headDim, totalSeqLen, scale); + qTensor.Dispose(); + + _attnTicks += Stopwatch.GetTimestamp() - t0; + + int outputIdx = layerFused ? 
2 : 4; + Tensor decodeOut = LinearForward(attnResult, wn[outputIdx]); + attnResult.Dispose(); + return decodeOut; + } + + Tensor qHeads = ReshapeToHeads(qTensor, numHeads, seqLen, headDim); + qTensor.Dispose(); + Tensor kHeads = ReshapeToHeads(kTensor, numKVHeads, seqLen, headDim); + kTensor.Dispose(); + Tensor vHeads = ReshapeToHeads(vTensor, numKVHeads, seqLen, _attnValLen); + vTensor.Dispose(); + + CopyToCache(_kvCacheK[layer], kHeads, startPos, seqLen); + CopyToCache(_kvCacheV[layer], vHeads, startPos, seqLen); + kHeads.Dispose(); + vHeads.Dispose(); + + int groupSize = numHeads / numKVHeads; + Tensor kExpanded = ExpandKVHeads(_kvCacheK[layer], groupSize, totalSeqLen); + Tensor vExpanded = ExpandKVHeads(_kvCacheV[layer], groupSize, totalSeqLen); + + using var kT = kExpanded.Transpose(1, 2); + var scores = new Tensor(_allocator, DType.Float32, numHeads, seqLen, totalSeqLen); + Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT); + qHeads.Dispose(); + kExpanded.Dispose(); + + Ops.AddCausalMask(scores, seqLen, startPos, float.NegativeInfinity); + Ops.Softmax(scores, scores); + + var attnOut = new Tensor(_allocator, DType.Float32, numHeads, seqLen, _attnValLen); + Ops.AddmmBatch(attnOut, 0, attnOut, 1.0f, scores, vExpanded); + scores.Dispose(); + vExpanded.Dispose(); + + Tensor flatOutput = ReshapeFromHeads(attnOut, numHeads, seqLen, _attnValLen); + attnOut.Dispose(); + + _attnTicks += Stopwatch.GetTimestamp() - t0; + + int outIdx = layerFused ? 2 : 4; + Tensor output = LinearForward(flatOutput, wn[outIdx]); + flatOutput.Dispose(); + + return output; + } + + /// + /// GPT-J (norm) style RoPE: pairs adjacent elements (x[2i], x[2i+1]). + /// Uses precomputed YaRN-corrected frequencies for decode. 
+ /// + private unsafe void ApplyRoPEDecode(Tensor data, int numHeads, int headDim, int position) + { + int halfDim = _ropeDim / 2; + float* ptr = GetFloatPtr(data); + + float* cosTable = stackalloc float[halfDim]; + float* sinTable = stackalloc float[halfDim]; + for (int i = 0; i < halfDim; i++) + { + float theta = position * _ropeFreqs[i]; + cosTable[i] = MathF.Cos(theta); + sinTable[i] = MathF.Sin(theta); + } + + for (int h = 0; h < numHeads; h++) + { + float* head = ptr + h * headDim; + for (int i = 0; i < halfDim; i++) + { + float x0 = head[2 * i]; + float x1 = head[2 * i + 1]; + head[2 * i] = x0 * cosTable[i] - x1 * sinTable[i]; + head[2 * i + 1] = x0 * sinTable[i] + x1 * cosTable[i]; + } + } + } + + private Tensor ApplyRoPEPrefill(Tensor data, int numHeads, int headDim, int seqLen, int startPos) + { + int totalRows = seqLen * numHeads; + int[] positions = new int[totalRows]; + for (int s = 0; s < seqLen; s++) + for (int h = 0; h < numHeads; h++) + positions[s * numHeads + h] = startPos + s; + using var posTensor = CreateIntTensor(positions, totalRows); + + using var reshaped = data.View(1, seqLen, numHeads, headDim); + Tensor result = Ops.RoPEEx( + null, reshaped, posTensor, _ropeDim, 0, _ropeOrigCtx, + Config.RopeBase, 1.0f / Config.RopeScale, + _ropeType == "yarn" ? _ropeExtFactor : 0f, + ComputeAttnFactor(), + _ropeType == "yarn" ? _ropeBetaFast : 0f, + _ropeType == "yarn" ? 
_ropeBetaSlow : 0f); + + data.Dispose(); + + Tensor flat = result.View(seqLen, numHeads * headDim); + result.Dispose(); + return flat; + } + + private float ComputeAttnFactor() + { + if (_ropeMscale != 0 && _ropeMscaleAllDim != 0) + return 1.0f / (0.1f * MathF.Log(Config.RopeScale) + 1.0f); + return 1.0f; + } + + /// + /// Position-dependent Q scaling for YaRN: + /// q *= (1 + beta * log(1 + floor(pos / orig_ctx))) + /// + private unsafe void ApplyPositionScale(Tensor qTensor, int totalQDim, int position) + { + float interval = MathF.Floor((float)position / _ropeOrigCtx); + float posScale = 1.0f + _ropeScalingBeta * MathF.Log(1.0f + interval); + if (MathF.Abs(posScale - 1.0f) < 1e-7f) + return; + + float* ptr = GetFloatPtr(qTensor); + VecScale(ptr, posScale, totalQDim); + } + + private unsafe void ApplyPositionScalePrefill(Tensor qTensor, int numHeads, int headDim, + int seqLen, int startPos) + { + float* ptr = GetFloatPtr(qTensor); + int stride = numHeads * headDim; + + for (int s = 0; s < seqLen; s++) + { + int pos = startPos + s; + float interval = MathF.Floor((float)pos / _ropeOrigCtx); + float posScale = 1.0f + _ropeScalingBeta * MathF.Log(1.0f + interval); + if (MathF.Abs(posScale - 1.0f) < 1e-7f) + continue; + VecScale(ptr + (long)s * stride, posScale, stride); + } + } + + // Native batch decode is not used for Mistral 3 because YaRN applies + // per-dimension frequency correction that the generic TransformerLayerDecode + // API cannot express. The C# decode path uses GGML-backed matmul/attention + // and only adds a lightweight C# RoPE kernel. 
+ + public override void Dispose() + { + _visionEncoder?.Dispose(); + foreach (var (embeddings, _) in _pendingVisionEmbeddingsList) + embeddings?.Dispose(); + _pendingVisionEmbeddingsList.Clear(); + + if (_kvCacheK != null) + foreach (var t in _kvCacheK) t?.Dispose(); + if (_kvCacheV != null) + foreach (var t in _kvCacheV) t?.Dispose(); + + base.Dispose(); + } + } +} diff --git a/TensorSharp.Models/Models/Mistral3/Mistral3VisionEncoder.cs b/TensorSharp.Models/Models/Mistral3/Mistral3VisionEncoder.cs new file mode 100644 index 0000000..93f3b44 --- /dev/null +++ b/TensorSharp.Models/Models/Mistral3/Mistral3VisionEncoder.cs @@ -0,0 +1,527 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. +using System; +using System.Collections.Generic; +using TensorSharp; +using TensorSharp.Cpu; +using TensorSharp.GGML; + +namespace TensorSharp.Models +{ + /// + /// Pixtral-style vision encoder for Mistral 3. 
+ /// Architecture: + /// - Conv2D patch embedding + /// - RMSNorm on patch embeddings (encoder_norm) + /// - 2D RoPE positional embeddings (computed on-the-fly) + /// - Transformer blocks with RMSNorm, SiLU-gated MLP + /// - Patch merger with spatial merge + /// - Multi-modal projector: RMSNorm → PatchMerger → Linear → GELU → Linear + /// + public class Mistral3VisionEncoder : IDisposable + { + private readonly Dictionary _weights = new(); + private readonly Dictionary _quantWeights = new(); + private readonly Dictionary _transposedWeights = new(); + private readonly IAllocator _allocator; + private readonly bool _useNativeAttention; + + private readonly int _imageSize; + private readonly int _patchSize; + private readonly int _hiddenSize; + private readonly int _numHeads; + private readonly int _headDim; + private readonly int _blockCount; + private readonly float _eps; + private readonly float _visionRopeBase; + private readonly int _spatialMergeSize; + + // Multi-modal projector config + private readonly float _textEps; + + public int PatchSize => _patchSize; + public int SpatialMergeSize => _spatialMergeSize; + public int ImageSize => _imageSize; + + public Mistral3VisionEncoder(string mmProjPath, IAllocator allocator) + { + _allocator = allocator; + _useNativeAttention = allocator is GgmlAllocator; + var gguf = new GgufFile(mmProjPath); + + _imageSize = (int)gguf.GetUint32("vision.image_size", + (uint)gguf.GetUint32("clip.vision.image_size", 1540)); + _patchSize = (int)gguf.GetUint32("vision.patch_size", + (uint)gguf.GetUint32("clip.vision.patch_size", 14)); + _hiddenSize = (int)gguf.GetUint32("vision.embedding_length", + (uint)gguf.GetUint32("clip.vision.embedding_length", 1024)); + _numHeads = (int)gguf.GetUint32("vision.attention.head_count", + (uint)gguf.GetUint32("clip.vision.attention.head_count", 16)); + _headDim = (int)gguf.GetUint32("vision.attention.key_length", + (uint)(_hiddenSize / _numHeads)); + _blockCount = 
(int)gguf.GetUint32("vision.block_count", + (uint)gguf.GetUint32("clip.vision.block_count", 24)); + _eps = gguf.GetFloat32("vision.attention.layer_norm_epsilon", + gguf.GetFloat32("clip.vision.attention.layer_norm_epsilon", 1e-5f)); + _visionRopeBase = gguf.GetFloat32("vision.rope.freq_base", 10000.0f); + _spatialMergeSize = (int)gguf.GetUint32("spatial_merge_size", 2); + _textEps = gguf.GetFloat32("text_config.rms_norm_eps", 1e-5f); + + Console.WriteLine($"Mistral3 Vision: imageSize={_imageSize}, patchSize={_patchSize}, " + + $"hidden={_hiddenSize}, heads={_numHeads}, headDim={_headDim}, " + + $"blocks={_blockCount}, ropeBase={_visionRopeBase}, mergeSize={_spatialMergeSize}"); + + LoadWeights(gguf); + gguf.Dispose(); + } + + private void LoadWeights(GgufFile gguf) + { + Console.Write("Loading Mistral3 vision encoder weights..."); + int count = 0; + foreach (var kv in gguf.Tensors) + { + var info = kv.Value; + long numElements = info.NumElements; + + long[] ggufShape = new long[info.Shape.Length]; + for (int i = 0; i < info.Shape.Length; i++) + ggufShape[i] = (long)info.Shape[i]; + + long[] tsShape = new long[ggufShape.Length]; + for (int i = 0; i < ggufShape.Length; i++) + tsShape[i] = ggufShape[ggufShape.Length - 1 - i]; + + if (info.Type == GgmlTensorType.F32 || info.Shape.Length < 2) + { + float[] f32 = new float[numElements]; + byte[] raw = gguf.ReadTensorData(info); + if (info.Type == GgmlTensorType.F32) + Buffer.BlockCopy(raw, 0, f32, 0, raw.Length); + else + NativeDequant.DequantizeToFloat32((int)info.Type, raw, 0, f32, 0, numElements); + + var tensor = new Tensor(_allocator, DType.Float32, tsShape); + tensor.SetElementsAsFloat(f32); + _weights[info.Name] = tensor; + } + else + { + byte[] raw = gguf.ReadTensorData(info); + float[] f32 = new float[numElements]; + if (info.Type == GgmlTensorType.F32) + Buffer.BlockCopy(raw, 0, f32, 0, raw.Length); + else + NativeDequant.DequantizeToFloat32((int)info.Type, raw, 0, f32, 0, numElements); + + var tensor = new 
Tensor(_allocator, DType.Float32, tsShape); + tensor.SetElementsAsFloat(f32); + _weights[info.Name] = tensor; + } + count++; + } + Console.WriteLine($" done ({count} tensors)"); + } + + /// + /// Encode an image into vision embeddings ready for the text model. + /// Input: normalized pixel data, image dimensions. + /// Output: Tensor of shape [numOutputTokens, textHiddenSize]. + /// + public unsafe Tensor Encode(float[] pixelValues, int imageWidth, int imageHeight) + { + int numPatchesW = imageWidth / _patchSize; + int numPatchesH = imageHeight / _patchSize; + int numPatches = numPatchesW * numPatchesH; + + // Patch embedding via Conv2D + var hidden = PatchEmbed(pixelValues, imageWidth, imageHeight, numPatchesW, numPatchesH); + + // Encoder norm + using var normed = RMSNormOp(hidden, "v.encoder_norm.weight"); + hidden.Dispose(); + hidden = Ops.NewContiguous(normed); + + // 2D RoPE positional embeddings + var (cos, sin) = Compute2DRoPE(numPatchesW, numPatchesH); + + for (int i = 0; i < _blockCount; i++) + { + Console.Write($"\r Vision encoder block {i + 1}/{_blockCount}..."); + hidden = EncoderBlock(hidden, i, numPatches, cos, sin); + } + Console.WriteLine(" done"); + + cos.Dispose(); + sin.Dispose(); + + // Multi-modal projector + var projected = MultiModalProject(hidden, numPatchesW, numPatchesH); + hidden.Dispose(); + + return projected; + } + + private unsafe Tensor PatchEmbed(float[] pixelValues, int imgW, int imgH, + int patchesW, int patchesH) + { + int numPatches = patchesW * patchesH; + var result = new Tensor(_allocator, DType.Float32, numPatches, _hiddenSize); + float* dst = GetFloatPtr(result); + + var convWeight = _weights["v.patch_conv.weight"]; + float* wPtr = GetFloatPtr(convWeight); + float* biasPtr = _weights.ContainsKey("v.patch_conv.bias") + ? 
GetFloatPtr(_weights["v.patch_conv.bias"]) : null; + + int C = 3; + int P = _patchSize; + + for (int py = 0; py < patchesH; py++) + { + for (int px = 0; px < patchesW; px++) + { + int patchIdx = py * patchesW + px; + float* outPatch = dst + patchIdx * _hiddenSize; + + for (int f = 0; f < _hiddenSize; f++) + { + float sum = biasPtr != null ? biasPtr[f] : 0f; + + for (int c = 0; c < C; c++) + { + for (int ky = 0; ky < P; ky++) + { + for (int kx = 0; kx < P; kx++) + { + int imgY = py * P + ky; + int imgX = px * P + kx; + float pixel = pixelValues[c * imgH * imgW + imgY * imgW + imgX]; + int wIdx = f * C * P * P + c * P * P + ky * P + kx; + sum += pixel * wPtr[wIdx]; + } + } + } + outPatch[f] = sum; + } + } + } + + return result; + } + + /// + /// Compute 2D RoPE embeddings for the vision transformer. + /// Returns (cos, sin) tensors of shape [headDim, 1, numPatches]. + /// + private (Tensor cos, Tensor sin) Compute2DRoPE(int patchesW, int patchesH) + { + int maxPatchesPerSide = _imageSize / _patchSize; + int numPatches = patchesW * patchesH; + int frequencies = _headDim / 2; + + float[] freqsHeight = new float[frequencies / 2 * maxPatchesPerSide]; + float[] freqsWidth = new float[frequencies / 2 * maxPatchesPerSide]; + + for (int i = 0; i < frequencies; i++) + { + for (int j = 0; j < maxPatchesPerSide; j++) + { + float frequency = (float)(j / Math.Pow(_visionRopeBase, (double)i * 2 / _headDim)); + if (i % 2 == 0) + freqsHeight[i / 2 * maxPatchesPerSide + j] = frequency; + else + freqsWidth[i / 2 * maxPatchesPerSide + j] = frequency; + } + } + + // Build per-position inverse frequencies + float[] invFreqs = new float[frequencies * numPatches]; + for (int h = 0; h < patchesH; h++) + { + for (int w = 0; w < patchesW; w++) + { + int patchIdx = h * patchesW + w; + for (int f = 0; f < frequencies / 2; f++) + { + invFreqs[f * numPatches + patchIdx] = freqsHeight[f * maxPatchesPerSide + h]; + invFreqs[(f + frequencies / 2) * numPatches + patchIdx] = freqsWidth[f * 
maxPatchesPerSide + w]; + } + } + } + + // Duplicate for cos+sin pairs + float[] fullFreqs = new float[_headDim * numPatches]; + Array.Copy(invFreqs, 0, fullFreqs, 0, frequencies * numPatches); + Array.Copy(invFreqs, 0, fullFreqs, frequencies * numPatches, frequencies * numPatches); + + // Compute cos and sin + float[] cosVals = new float[_headDim * numPatches]; + float[] sinVals = new float[_headDim * numPatches]; + for (int i = 0; i < fullFreqs.Length; i++) + { + cosVals[i] = MathF.Cos(fullFreqs[i]); + sinVals[i] = MathF.Sin(fullFreqs[i]); + } + + // Reshape to [headDim, 1, numPatches] + var cosTensor = new Tensor(_allocator, DType.Float32, numPatches, 1, _headDim); + cosTensor.SetElementsAsFloat(cosVals); + var sinTensor = new Tensor(_allocator, DType.Float32, numPatches, 1, _headDim); + sinTensor.SetElementsAsFloat(sinVals); + + return (cosTensor, sinTensor); + } + + private Tensor EncoderBlock(Tensor hidden, int blockIdx, int numPatches, + Tensor cos, Tensor sin) + { + string prefix = $"v.blk.{blockIdx}"; + + using var normed = RMSNormOp(hidden, $"{prefix}.attn_norm.weight"); + using var attnOut = VisionSelfAttention(normed, prefix, numPatches, cos, sin); + + Ops.Add(attnOut, attnOut, hidden); + hidden.Dispose(); + + using var normed2 = RMSNormOp(attnOut, $"{prefix}.ffn_norm.weight"); + using var mlpOut = VisionMLP(normed2, prefix); + + var result = new Tensor(_allocator, DType.Float32, attnOut.Sizes); + Ops.Add(result, attnOut, mlpOut); + + return result; + } + + private unsafe Tensor VisionSelfAttention(Tensor input, string prefix, int numPatches, + Tensor cos, Tensor sin) + { + using var q = LinearForward(input, $"{prefix}.attn_q.weight"); + using var k = LinearForward(input, $"{prefix}.attn_k.weight"); + using var v = LinearForward(input, $"{prefix}.attn_v.weight"); + + // Reshape to [numPatches, numHeads, headDim] + using var qR = q.View(numPatches, _numHeads, _headDim); + using var kR = k.View(numPatches, _numHeads, _headDim); + using var vR = 
v.View(numPatches, _numHeads, _headDim); + + // Apply 2D RoPE + var qRoped = ApplyVisionRoPE(qR, cos, sin, numPatches); + var kRoped = ApplyVisionRoPE(kR, cos, sin, numPatches); + + float scale = 1f / MathF.Sqrt(_headDim); + + if (_useNativeAttention) + { + using var q4 = qRoped.View(1, numPatches, _numHeads, _headDim); + using var k4 = kRoped.View(1, numPatches, _numHeads, _headDim); + using var v4 = vR.View(1, numPatches, _numHeads, _headDim); + using var attn4 = Ops.ScaledDotProductAttention(null, q4, k4, v4, null, scale); + qRoped.Dispose(); + kRoped.Dispose(); + using var flat = attn4.View(numPatches, _hiddenSize); + return LinearForward(flat, $"{prefix}.attn_output.weight"); + } + + // Manual attention path + using var qT0 = qRoped.Transpose(0, 1); + using var kT0 = kRoped.Transpose(0, 1); + using var vT0 = vR.Transpose(0, 1); + using var qHeads = Ops.NewContiguous(qT0); + using var kHeads = Ops.NewContiguous(kT0); + using var vHeads = Ops.NewContiguous(vT0); + qRoped.Dispose(); + kRoped.Dispose(); + + using var kT = kHeads.Transpose(1, 2); + var scores = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, numPatches); + Ops.AddmmBatch(scores, 0, scores, scale, qHeads, kT); + Ops.Softmax(scores, scores); + + var attnOutput = new Tensor(_allocator, DType.Float32, _numHeads, numPatches, _headDim); + Ops.AddmmBatch(attnOutput, 0, attnOutput, 1.0f, scores, vHeads); + scores.Dispose(); + + using var transposed = attnOutput.Transpose(0, 1); + using var contiguous = Ops.NewContiguous(transposed); + using var flatContig = contiguous.View(numPatches, _hiddenSize); + attnOutput.Dispose(); + + return LinearForward(flatContig, $"{prefix}.attn_output.weight"); + } + + /// + /// Apply rotary position embeddings (2D RoPE) for vision. 
+ /// Uses rotate_half style: [-x1, x0] * sin + [x0, x1] * cos + /// + private unsafe Tensor ApplyVisionRoPE(Tensor input, Tensor cos, Tensor sin, int numPatches) + { + // input: [numPatches, numHeads, headDim] + var result = new Tensor(_allocator, DType.Float32, input.Sizes); + float* inPtr = GetFloatPtr(input); + float* outPtr = GetFloatPtr(result); + float* cosPtr = GetFloatPtr(cos); + float* sinPtr = GetFloatPtr(sin); + + int halfDim = _headDim / 2; + + for (int p = 0; p < numPatches; p++) + { + for (int h = 0; h < _numHeads; h++) + { + float* inHead = inPtr + (long)p * _numHeads * _headDim + h * _headDim; + float* outHead = outPtr + (long)p * _numHeads * _headDim + h * _headDim; + + for (int d = 0; d < halfDim; d++) + { + float x0 = inHead[d]; + float x1 = inHead[d + halfDim]; + float c = cosPtr[p * _headDim + d]; + float s = sinPtr[p * _headDim + d]; + + // rotate_half: cos*x + sin*rotate_half(x) + outHead[d] = x0 * c - x1 * s; + outHead[d + halfDim] = x1 * c + x0 * s; + } + } + } + + return result; + } + + private Tensor VisionMLP(Tensor input, string prefix) + { + using var gate = LinearForward(input, $"{prefix}.ffn_gate.weight"); + using var up = LinearForward(input, $"{prefix}.ffn_up.weight"); + Ops.SiLUMul(gate, gate, up); + return LinearForward(gate, $"{prefix}.ffn_down.weight"); + } + + /// + /// Multi-modal projector: vision → text space. 
+ /// Steps: RMSNorm → PatchMerger → Linear1 → GELU → Linear2 + /// + private unsafe Tensor MultiModalProject(Tensor visionOutput, int patchesW, int patchesH) + { + int numPatches = patchesW * patchesH; + + // RMSNorm + using var normed = RMSNormOp(visionOutput, "mm.norm.weight"); + + // Patch merger: merge spatialMergeSize x spatialMergeSize patches + int mergedW = patchesW / _spatialMergeSize; + int mergedH = patchesH / _spatialMergeSize; + int mergedPatches = mergedW * mergedH; + int mergeInputDim = _hiddenSize * _spatialMergeSize * _spatialMergeSize; + + var mergeInput = new Tensor(_allocator, DType.Float32, mergedPatches, mergeInputDim); + float* srcPtr = GetFloatPtr(normed); + float* dstPtr = GetFloatPtr(mergeInput); + + for (int my = 0; my < mergedH; my++) + { + for (int mx = 0; mx < mergedW; mx++) + { + int outIdx = my * mergedW + mx; + float* outRow = dstPtr + (long)outIdx * mergeInputDim; + int fillOffset = 0; + + for (int sy = 0; sy < _spatialMergeSize; sy++) + { + for (int sx = 0; sx < _spatialMergeSize; sx++) + { + int srcY = my * _spatialMergeSize + sy; + int srcX = mx * _spatialMergeSize + sx; + int srcIdx = srcY * patchesW + srcX; + float* srcRow = srcPtr + (long)srcIdx * _hiddenSize; + + Buffer.MemoryCopy(srcRow, outRow + fillOffset, + _hiddenSize * sizeof(float), _hiddenSize * sizeof(float)); + fillOffset += _hiddenSize; + } + } + } + } + + // Patch merger linear + using var merged = LinearForward(mergeInput, "mm.patch_merger.merging_layer.weight"); + mergeInput.Dispose(); + + // Linear1 → GELU → Linear2 + using var proj1 = LinearForward(merged, "mm.linear_1.weight"); + Ops.GELU(proj1, proj1); + var proj2 = LinearForward(proj1, "mm.linear_2.weight"); + + Console.WriteLine($"Vision projector: {numPatches} patches → {mergedPatches} merged tokens " + + $"({(int)proj2.Sizes[0]}x{(int)proj2.Sizes[1]})"); + + return proj2; + } + + private Tensor RMSNormOp(Tensor input, string weightName) + { + if (!_weights.ContainsKey(weightName)) + return 
Ops.NewContiguous(input); + return Ops.RMSNorm(null, input, _weights[weightName], null, _eps); + } + + private Tensor LinearForward(Tensor input, string weightName) + { + if (!_weights.ContainsKey(weightName)) + return null; + + var weight = _weights[weightName]; + int seqLen = (int)input.Sizes[0]; + int outDim = (int)weight.Sizes[0]; + + var result = new Tensor(_allocator, DType.Float32, seqLen, outDim); + + Tensor contiguousInput = input.IsContiguous() ? null : Ops.NewContiguous(input); + Tensor src = contiguousInput ?? input; + Ops.Addmm(result, 0, result, 1.0f, src, GetOrCreateTransposedWeight(weightName)); + + contiguousInput?.Dispose(); + return result; + } + + private Tensor GetOrCreateTransposedWeight(string weightName) + { + if (_transposedWeights.TryGetValue(weightName, out var transposed)) + return transposed; + + using var weightViewT = _weights[weightName].Transpose(); + transposed = Ops.NewContiguous(weightViewT); + _transposedWeights[weightName] = transposed; + return transposed; + } + + private static unsafe float* GetFloatPtr(Tensor t) + { + if (t.Storage is GgmlStorage gs) + return (float*)gs.PtrAtElement(t.StorageOffset); + if (t.Storage is CpuStorage cs) + return (float*)cs.PtrAtElement(t.StorageOffset); + throw new NotSupportedException("Requires GgmlStorage or CpuStorage"); + } + + public void Dispose() + { + foreach (var w in _transposedWeights.Values) + w.Dispose(); + _transposedWeights.Clear(); + foreach (var w in _weights.Values) + w.Dispose(); + _weights.Clear(); + foreach (var qw in _quantWeights.Values) + qw.Dispose(); + _quantWeights.Clear(); + } + } +} diff --git a/InferenceEngine/Models/Nemotron/NemotronModel.cs b/TensorSharp.Models/Models/Nemotron/NemotronModel.cs similarity index 99% rename from InferenceEngine/Models/Nemotron/NemotronModel.cs rename to TensorSharp.Models/Models/Nemotron/NemotronModel.cs index 628e98d..1af610d 100644 --- a/InferenceEngine/Models/Nemotron/NemotronModel.cs +++ 
b/TensorSharp.Models/Models/Nemotron/NemotronModel.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -16,7 +16,7 @@ using TensorSharp; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { /// /// Nemotron-H hybrid model: mixes Mamba2 SSM layers, attention-only layers, and FFN-only layers. @@ -280,6 +280,7 @@ private void InitMamba2Buffers() private void InitCaches(int maxSeqLen) { + _maxContextLength = maxSeqLen; int numLayers = Config.NumLayers; _kvCacheK = new Tensor[numLayers]; _kvCacheV = new Tensor[numLayers]; @@ -1379,3 +1380,4 @@ public override void Dispose() } } } + diff --git a/InferenceEngine/Models/Qwen3/Qwen3Model.cs b/TensorSharp.Models/Models/Qwen3/Qwen3Model.cs similarity index 99% rename from InferenceEngine/Models/Qwen3/Qwen3Model.cs rename to TensorSharp.Models/Models/Qwen3/Qwen3Model.cs index 737e06c..d25b50b 100644 --- a/InferenceEngine/Models/Qwen3/Qwen3Model.cs +++ b/TensorSharp.Models/Models/Qwen3/Qwen3Model.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -12,7 +12,7 @@ using TensorSharp; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { public class Qwen3Model : ModelBase { @@ -131,6 +131,7 @@ private void PrecomputeConstants() private void InitKVCache(int maxSeqLen) { + _maxContextLength = maxSeqLen; int numKVHeads = Config.NumKVHeads; int headDim = Config.HeadDim; _kvCacheK = new Tensor[Config.NumLayers]; @@ -557,3 +558,4 @@ public override void Dispose() } } } + diff --git a/InferenceEngine/Models/Qwen35/ImageProcessor.cs b/TensorSharp.Models/Models/Qwen35/ImageProcessor.cs similarity index 98% rename from InferenceEngine/Models/Qwen35/ImageProcessor.cs rename to TensorSharp.Models/Models/Qwen35/ImageProcessor.cs index a197845..b49a3bc 100644 --- a/InferenceEngine/Models/Qwen35/ImageProcessor.cs +++ b/TensorSharp.Models/Models/Qwen35/ImageProcessor.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -10,7 +10,7 @@ using System; using System.IO; -namespace InferenceEngine +namespace TensorSharp.Models { public class Qwen35ImageProcessor { @@ -122,3 +122,4 @@ private static float[] PackChannelFirst(byte[] rgba, int width, int height) } } } + diff --git a/InferenceEngine/Models/Qwen35/Qwen35Model.cs b/TensorSharp.Models/Models/Qwen35/Qwen35Model.cs similarity index 99% rename from InferenceEngine/Models/Qwen35/Qwen35Model.cs rename to TensorSharp.Models/Models/Qwen35/Qwen35Model.cs index 6946991..9efcf5e 100644 --- a/InferenceEngine/Models/Qwen35/Qwen35Model.cs +++ b/TensorSharp.Models/Models/Qwen35/Qwen35Model.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -13,7 +13,7 @@ using TensorSharp; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { /// /// Qwen3.5 hybrid model: alternates GatedDeltaNet (recurrent) and FullAttention layers. @@ -295,6 +295,7 @@ private void InitGDNBuffers() private void InitCaches(int maxSeqLen) { + _maxContextLength = maxSeqLen; int numLayers = Config.NumLayers; _kvCacheK = new Tensor[numLayers]; _kvCacheV = new Tensor[numLayers]; @@ -1017,3 +1018,4 @@ public override void Dispose() } } } + diff --git a/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs b/TensorSharp.Models/Models/Qwen35/Qwen35VisionEncoder.cs similarity index 99% rename from InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs rename to TensorSharp.Models/Models/Qwen35/Qwen35VisionEncoder.cs index c4c21dc..243ec65 100644 --- a/InferenceEngine/Models/Qwen35/Qwen35VisionEncoder.cs +++ b/TensorSharp.Models/Models/Qwen35/Qwen35VisionEncoder.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -13,7 +13,7 @@ using TensorSharp.Cpu; using TensorSharp.GGML; -namespace InferenceEngine +namespace TensorSharp.Models { public class Qwen35VisionEncoder : IDisposable { @@ -588,3 +588,4 @@ public void Dispose() } } } + diff --git a/InferenceEngine/NativeDequant.cs b/TensorSharp.Models/NativeDequant.cs similarity index 94% rename from InferenceEngine/NativeDequant.cs rename to TensorSharp.Models/NativeDequant.cs index 1b5ac30..5111769 100644 --- a/InferenceEngine/NativeDequant.cs +++ b/TensorSharp.Models/NativeDequant.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -9,7 +9,7 @@ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. 
using System; -namespace InferenceEngine +namespace TensorSharp.Models { internal static class NativeDequant { @@ -34,3 +34,4 @@ public static long RowSize(int ggmlType, long ne) } } } + diff --git a/InferenceEngine/InferenceEngine.csproj b/TensorSharp.Models/TensorSharp.Models.csproj similarity index 61% rename from InferenceEngine/InferenceEngine.csproj rename to TensorSharp.Models/TensorSharp.Models.csproj index 9d1967c..9fddcf3 100644 --- a/InferenceEngine/InferenceEngine.csproj +++ b/TensorSharp.Models/TensorSharp.Models.csproj @@ -4,13 +4,16 @@ true false bin\ + TensorSharp model architecture implementations and multimodal model utilities. + tensor;models;llm;multimodal - - + + + @@ -22,4 +25,14 @@ + + + True + + + + True + \ + + diff --git a/InferenceEngine/BpeTokenizer.cs b/TensorSharp.Runtime/BpeTokenizer.cs similarity index 96% rename from InferenceEngine/BpeTokenizer.cs rename to TensorSharp.Runtime/BpeTokenizer.cs index 8aa19ef..998a8bb 100644 --- a/InferenceEngine/BpeTokenizer.cs +++ b/TensorSharp.Runtime/BpeTokenizer.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -13,7 +13,7 @@ using System.Text; using System.Text.RegularExpressions; -namespace InferenceEngine +namespace TensorSharp.Runtime { public interface ITokenizer { @@ -70,6 +70,10 @@ public BpeTokenizer(string[] vocab, int[] tokenTypes, string[] merges, @"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|" + @"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|" + @"\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+", + "tekken" => + @"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|" + + @"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|" + + @"\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+", _ => @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+", }; @@ -324,3 +328,4 @@ private struct MergeNode } } } + diff --git a/InferenceEngine/ChatTemplate.cs b/TensorSharp.Runtime/ChatTemplate.cs similarity index 95% rename from InferenceEngine/ChatTemplate.cs rename to TensorSharp.Runtime/ChatTemplate.cs index 182e1c8..32f2aa8 100644 --- a/InferenceEngine/ChatTemplate.cs +++ b/TensorSharp.Runtime/ChatTemplate.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -13,7 +13,7 @@ using System.Text; using System.Text.Json; -namespace InferenceEngine +namespace TensorSharp.Runtime { public class ChatMessage { @@ -321,6 +321,9 @@ public static string RenderFromGgufTemplate(string template, List m if (IsQwen35Family(architecture) && !enableThinking) return RenderHardcoded(messages, addGenerationPrompt, architecture, tools, enableThinking); + if (architecture == "mistral3") + return RenderHardcoded(messages, addGenerationPrompt, architecture, tools, enableThinking); + if (!string.IsNullOrWhiteSpace(template)) { try @@ -371,6 +374,9 @@ private static string RenderHardcoded(List messages, if (architecture == "nemotron_h" || architecture == "nemotron_h_moe") return RenderQwen3(messages, addGenerationPrompt, tools, enableThinking); + if (architecture == "mistral3") + return RenderMistral3(messages, addGenerationPrompt); + return RenderQwen3(messages, addGenerationPrompt, tools, enableThinking); } @@ -513,6 +519,12 @@ private static List InjectMultimodalTokens(List messag foreach (var _ in msg.ImagePaths) sb.Append("<|vision_start|><|image_pad|><|vision_end|>"); } + else if (architecture == "mistral3") + { + if (msg.ImagePaths != null) + foreach (var _ in msg.ImagePaths) + sb.Append("[IMG]"); + } sb.Append(msg.Content ?? ""); @@ -528,6 +540,42 @@ private static List InjectMultimodalTokens(List messag return result; } + /// + /// Render Mistral 3 chat template. + /// Uses [SYSTEM_PROMPT]...[/SYSTEM_PROMPT] for system messages + /// and [INST]...[/INST] for user messages. 
+ /// + public static string RenderMistral3(List messages, bool addGenerationPrompt = true) + { + var sb = new StringBuilder(); + int startIdx = 0; + + if (messages.Count > 0 && messages[0].Role == "system") + { + sb.Append("[SYSTEM_PROMPT]"); + sb.Append(messages[0].Content); + sb.Append("[/SYSTEM_PROMPT]"); + startIdx = 1; + } + + for (int i = startIdx; i < messages.Count; i++) + { + var msg = messages[i]; + if (msg.Role == "user") + { + sb.Append("[INST]"); + sb.Append(msg.Content); + sb.Append("[/INST]"); + } + else if (msg.Role == "assistant") + { + sb.Append(msg.Content); + } + } + + return sb.ToString(); + } + /// /// Render GPT OSS / Harmony chat template. /// Matches the GGUF Jinja2 template: system message with model identity / date / channels, @@ -900,3 +948,4 @@ public static List ExpandGemma3ImageTokens(List tokens, int startOfIma } } } + diff --git a/TensorSharp.Runtime/Contracts.cs b/TensorSharp.Runtime/Contracts.cs new file mode 100644 index 0000000..edd7e61 --- /dev/null +++ b/TensorSharp.Runtime/Contracts.cs @@ -0,0 +1,66 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. 
+using System; +using System.Collections.Generic; + +namespace TensorSharp.Runtime +{ + public interface IModelArchitecture : IDisposable + { + ModelConfig Config { get; } + ITokenizer Tokenizer { get; } + IKVCachePolicy KVCachePolicy { get; } + IMultimodalInjector MultimodalInjector { get; } + IBackendExecutionPlan ExecutionPlan { get; } + float[] Forward(int[] tokens); + void ResetKVCache(); + bool SupportsKVCacheTruncation { get; } + void TruncateKVCache(int tokenCount); + } + + public interface IPromptRenderer + { + string Render( + string template, + List messages, + bool addGenerationPrompt = true, + string architecture = null, + List tools = null, + bool enableThinking = false); + } + + public interface IOutputProtocolParser + { + void Init(bool enableThinking, List tools); + ParsedOutput Add(string text, bool done); + bool HasThinkingSupport { get; } + bool HasToolSupport { get; } + bool AlwaysRequired { get; } + } + + public interface IMultimodalInjector + { + void LoadProjectors(string mmProjPath); + List ProcessPromptTokens(List history, List inputTokens); + } + + public interface IKVCachePolicy + { + int ComputeReusablePrefix(IModelArchitecture model, List cachedTokens, List inputTokens, bool hasMultimodal); + } + + public interface IBackendExecutionPlan + { + BackendType BackendType { get; } + bool UsesGgmlBackend { get; } + bool ShouldStoreWeightQuantized(GgufTensorInfo info); + } +} + diff --git a/TensorSharp.Runtime/DefaultKvCachePolicy.cs b/TensorSharp.Runtime/DefaultKvCachePolicy.cs new file mode 100644 index 0000000..f3adf24 --- /dev/null +++ b/TensorSharp.Runtime/DefaultKvCachePolicy.cs @@ -0,0 +1,62 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. 
+// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. +using System; +using System.Collections.Generic; + +namespace TensorSharp.Runtime +{ + public sealed class DefaultKvCachePolicy : IKVCachePolicy + { + public static DefaultKvCachePolicy Shared { get; } = new(); + + public int ComputeReusablePrefix(IModelArchitecture model, List cachedTokens, List inputTokens, bool hasMultimodal) + { + if (model == null || hasMultimodal || !model.SupportsKVCacheTruncation) + return 0; + if (cachedTokens == null || cachedTokens.Count == 0 || inputTokens == null || inputTokens.Count == 0) + return 0; + + int raw = FindCommonPrefix(cachedTokens, inputTokens); + if (raw <= 0) + return 0; + + int slidingWindow = model.Config?.SlidingWindow ?? 0; + if (slidingWindow > 0) + raw = Math.Max(0, raw - slidingWindow); + + if (raw < 4) + return 0; + + double savingsRatio = (double)raw / inputTokens.Count; + if (savingsRatio < 0.10) + return 0; + + return raw; + } + + private static int FindCommonPrefix(List cachedTokens, List inputTokens) + { + int maxLen = Math.Min(cachedTokens.Count, inputTokens.Count); + int prefix = 0; + for (int i = 0; i < maxLen; i++) + { + if (cachedTokens[i] != inputTokens[i]) + break; + prefix++; + } + + if (prefix == 0 || prefix >= inputTokens.Count) + return 0; + + return prefix; + } + } +} + diff --git a/InferenceEngine/GgufReader.cs b/TensorSharp.Runtime/GgufReader.cs similarity index 99% rename from InferenceEngine/GgufReader.cs rename to TensorSharp.Runtime/GgufReader.cs index bb0ffcf..c3a83ec 100644 --- a/InferenceEngine/GgufReader.cs +++ b/TensorSharp.Runtime/GgufReader.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -13,7 +13,7 @@ using System.IO; using System.Text; -namespace InferenceEngine +namespace TensorSharp.Runtime { public enum GgufValueType : uint { @@ -468,3 +468,4 @@ public void Dispose() } } } + diff --git a/InferenceEngine/Jinja2Template.cs b/TensorSharp.Runtime/Jinja2Template.cs similarity index 99% rename from InferenceEngine/Jinja2Template.cs rename to TensorSharp.Runtime/Jinja2Template.cs index d646247..ac06f3c 100644 --- a/InferenceEngine/Jinja2Template.cs +++ b/TensorSharp.Runtime/Jinja2Template.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -13,7 +13,7 @@ using System.Linq; using System.Text; -namespace InferenceEngine +namespace TensorSharp.Runtime { /// /// Minimal Jinja2 template renderer for LLM chat templates loaded from GGUF files. @@ -1490,3 +1490,4 @@ private static List SplitArgs(string s) #endregion } } + diff --git a/TensorSharp.Runtime/ModelPrimitives.cs b/TensorSharp.Runtime/ModelPrimitives.cs new file mode 100644 index 0000000..1b96471 --- /dev/null +++ b/TensorSharp.Runtime/ModelPrimitives.cs @@ -0,0 +1,44 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. 
+namespace TensorSharp.Runtime +{ + public enum BackendType + { + Cpu, + GgmlCpu, + GgmlMetal, + GgmlCuda, + } + + public class ModelConfig + { + public string Architecture { get; set; } + public int HiddenSize { get; set; } + public int NumHeads { get; set; } + public int NumKVHeads { get; set; } + public int KeyLength { get; set; } + public int ValueLength { get; set; } + public float Eps { get; set; } + public float RopeBase { get; set; } + public float RopeScale { get; set; } = 1f; + public int NumLayers { get; set; } + public int VocabSize { get; set; } + public int IntermediateSize { get; set; } + public string ChatTemplate { get; set; } + + public int NumExperts { get; set; } + public int NumExpertsUsed { get; set; } + public int SlidingWindow { get; set; } + public int OriginalContextLength { get; set; } + + public int HeadDim => KeyLength > 0 ? KeyLength : (ValueLength > 0 ? ValueLength : HiddenSize / NumHeads); + } +} + diff --git a/InferenceEngine/OutputParser.cs b/TensorSharp.Runtime/OutputParser.cs similarity index 98% rename from InferenceEngine/OutputParser.cs rename to TensorSharp.Runtime/OutputParser.cs index efa471a..5ea8e8a 100644 --- a/InferenceEngine/OutputParser.cs +++ b/TensorSharp.Runtime/OutputParser.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -14,7 +14,7 @@ using System.Text.Json; using System.Text.RegularExpressions; -namespace InferenceEngine +namespace TensorSharp.Runtime { /// /// Represents a tool function definition provided to the model. @@ -64,17 +64,8 @@ public class ParsedOutput /// Streaming parser that extracts thinking content, regular content, and tool calls /// from model output. Handles model-specific tag formats. 
/// - public interface IOutputParser + public interface IOutputParser : IOutputProtocolParser { - void Init(bool enableThinking, List tools); - ParsedOutput Add(string text, bool done); - bool HasThinkingSupport { get; } - bool HasToolSupport { get; } - /// - /// True when the model's wire format always requires parsing (e.g. Harmony - /// framing), even if the caller did not request thinking or tool support. - /// - bool AlwaysRequired { get; } } // ======================================================================== @@ -809,3 +800,4 @@ public static bool IsAlwaysRequired(string architecture) } } } + diff --git a/TensorSharp.Runtime/PromptRenderer.cs b/TensorSharp.Runtime/PromptRenderer.cs new file mode 100644 index 0000000..9900ae0 --- /dev/null +++ b/TensorSharp.Runtime/PromptRenderer.cs @@ -0,0 +1,34 @@ +// Copyright (c) Zhongkai Fu. All rights reserved. +// https://github.com/zhongkaifu/TensorSharp +// +// This file is part of TensorSharp. +// +// TensorSharp is licensed under the BSD-3-Clause license found in the LICENSE file in the root directory of this source tree. +// +// TensorSharp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. 
+using System.Collections.Generic; + +namespace TensorSharp.Runtime +{ + public sealed class GgufPromptRenderer : IPromptRenderer + { + public string Render( + string template, + List messages, + bool addGenerationPrompt = true, + string architecture = null, + List tools = null, + bool enableThinking = false) + { + return ChatTemplate.RenderFromGgufTemplate( + template, + messages, + addGenerationPrompt, + architecture, + tools, + enableThinking); + } + } +} + diff --git a/InferenceEngine/SamplingConfig.cs b/TensorSharp.Runtime/SamplingConfig.cs similarity index 98% rename from InferenceEngine/SamplingConfig.cs rename to TensorSharp.Runtime/SamplingConfig.cs index 6b688cb..b719629 100644 --- a/InferenceEngine/SamplingConfig.cs +++ b/TensorSharp.Runtime/SamplingConfig.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -9,7 +9,7 @@ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the BSD-3-Clause License for more details. using System.Collections.Generic; -namespace InferenceEngine +namespace TensorSharp.Runtime { /// /// Configuration for token sampling during inference. @@ -113,3 +113,4 @@ public class SamplingConfig }; } } + diff --git a/InferenceEngine/SentencePieceTokenizer.cs b/TensorSharp.Runtime/SentencePieceTokenizer.cs similarity index 99% rename from InferenceEngine/SentencePieceTokenizer.cs rename to TensorSharp.Runtime/SentencePieceTokenizer.cs index e806034..9030e3d 100644 --- a/InferenceEngine/SentencePieceTokenizer.cs +++ b/TensorSharp.Runtime/SentencePieceTokenizer.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. 
@@ -12,7 +12,7 @@ using System.Linq; using System.Text; -namespace InferenceEngine +namespace TensorSharp.Runtime { /// /// SentencePiece unigram tokenizer matching ollama's implementation. @@ -318,3 +318,4 @@ public int Compare((float score, int a, int b) x, (float score, int a, int b) y) } } } + diff --git a/InferenceEngine/StructuredOutputs.cs b/TensorSharp.Runtime/StructuredOutputs.cs similarity index 99% rename from InferenceEngine/StructuredOutputs.cs rename to TensorSharp.Runtime/StructuredOutputs.cs index 74bc45a..516063e 100644 --- a/InferenceEngine/StructuredOutputs.cs +++ b/TensorSharp.Runtime/StructuredOutputs.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -15,7 +15,7 @@ using System.Text.Json.Nodes; using System.Text.RegularExpressions; -namespace InferenceEngine +namespace TensorSharp.Runtime { public enum StructuredOutputKind { @@ -1024,3 +1024,4 @@ public SchemaValidationContext(JsonElement rootSchema, List errors) } } } + diff --git a/TensorSharp.Runtime/TensorSharp.Runtime.csproj b/TensorSharp.Runtime/TensorSharp.Runtime.csproj new file mode 100644 index 0000000..38aa5c6 --- /dev/null +++ b/TensorSharp.Runtime/TensorSharp.Runtime.csproj @@ -0,0 +1,20 @@ + + + net10.0 + true + false + bin\ + TensorSharp runtime services: GGUF parsing, tokenizers, prompt templates, sampling, and output parsing. + tensor;runtime;gguf;tokenizer;sampling + + + + True + + + + True + \ + + + diff --git a/InferenceEngine/TokenSampler.cs b/TensorSharp.Runtime/TokenSampler.cs similarity index 99% rename from InferenceEngine/TokenSampler.cs rename to TensorSharp.Runtime/TokenSampler.cs index 99ddafa..9eb3863 100644 --- a/InferenceEngine/TokenSampler.cs +++ b/TensorSharp.Runtime/TokenSampler.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. 
// https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -10,7 +10,7 @@ using System; using System.Collections.Generic; -namespace InferenceEngine +namespace TensorSharp.Runtime { /// /// Token sampler supporting temperature, top-k, top-p (nucleus), min-p, @@ -339,3 +339,4 @@ private static int Argmax(float[] values) #endregion } } + diff --git a/InferenceWeb/API_EXAMPLES.md b/TensorSharp.Server/API_EXAMPLES.md similarity index 97% rename from InferenceWeb/API_EXAMPLES.md rename to TensorSharp.Server/API_EXAMPLES.md index 74118c5..02ad384 100644 --- a/InferenceWeb/API_EXAMPLES.md +++ b/TensorSharp.Server/API_EXAMPLES.md @@ -1,6 +1,6 @@ -# InferenceWeb API Examples +# TensorSharp.Server API Examples -InferenceWeb provides three API styles: +TensorSharp.Server provides three API styles: - **Ollama-compatible** (`/api/generate`, `/api/chat/ollama`, `/api/tags`, `/api/show`) - **OpenAI-compatible** (`/v1/chat/completions`, `/v1/models`) - **Web UI** (`/api/chat`, `/api/models`, `/api/models/load`) @@ -10,7 +10,7 @@ For the Web UI flow, choose the model up front with `/api/models/load`. The `/ap ## Starting the Server ```bash -MODEL_DIR=~/work/model BACKEND=ggml_metal ./InferenceWeb +MODEL_DIR=~/work/model BACKEND=ggml_metal ./TensorSharp.Server ``` The server starts on `http://localhost:5000`. @@ -303,7 +303,7 @@ Response: ### Chat Completions with Structured Outputs (`json_schema`) -InferenceWeb accepts the OpenAI Chat Completions `response_format` shape, injects strict JSON instructions into the prompt, and validates the final output before returning it. +TensorSharp.Server accepts the OpenAI Chat Completions `response_format` shape, injects strict JSON instructions into the prompt, and validates the final output before returning it. 
```bash curl -X POST http://localhost:5000/v1/chat/completions \ @@ -556,3 +556,4 @@ while IFS= read -r line; do echo -e "\n" done < test_requests.jsonl ``` + diff --git a/InferenceWeb/BackendCatalog.cs b/TensorSharp.Server/BackendCatalog.cs similarity index 95% rename from InferenceWeb/BackendCatalog.cs rename to TensorSharp.Server/BackendCatalog.cs index 46aa8c9..62aa874 100644 --- a/InferenceWeb/BackendCatalog.cs +++ b/TensorSharp.Server/BackendCatalog.cs @@ -1,16 +1,15 @@ -using System; +using System; using System.Collections.Generic; using System.Linq; -using InferenceEngine; using TensorSharp.GGML; -namespace InferenceWeb +namespace TensorSharp.Server { internal sealed record BackendOption(string Value, string Label); internal static class BackendCatalog { - // InferenceWeb should always expose the two CPU choices distinctly: + // TensorSharp.Server should always expose the two CPU choices distinctly: // `ggml_cpu` is the native GGML CPU backend, while `cpu` is the pure C# backend. private static readonly BackendDescriptor[] BackendDescriptors = { @@ -86,3 +85,6 @@ private static bool IsGgmlBackendAvailable(GgmlBackendType backendType) private sealed record BackendDescriptor(string Value, string Label, GgmlBackendType GgmlBackendType, bool AlwaysAvailable); } } + + + diff --git a/TensorSharp.Server/GlobalUsings.cs b/TensorSharp.Server/GlobalUsings.cs new file mode 100644 index 0000000..1f7dd10 --- /dev/null +++ b/TensorSharp.Server/GlobalUsings.cs @@ -0,0 +1,3 @@ +global using TensorSharp.Models; +global using TensorSharp.Runtime; +global using TensorSharp.Server; diff --git a/InferenceWeb/InferenceQueue.cs b/TensorSharp.Server/InferenceQueue.cs similarity index 98% rename from InferenceWeb/InferenceQueue.cs rename to TensorSharp.Server/InferenceQueue.cs index d69f405..f8d6f0d 100644 --- a/InferenceWeb/InferenceQueue.cs +++ b/TensorSharp.Server/InferenceQueue.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. 
All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -13,7 +13,7 @@ using System.Threading; using System.Threading.Tasks; -namespace InferenceWeb +namespace TensorSharp.Server { /// /// FIFO request queue that ensures only one inference runs at a time, @@ -206,3 +206,4 @@ public void Dispose() } } } + diff --git a/InferenceWeb/ModelService.cs b/TensorSharp.Server/ModelService.cs similarity index 88% rename from InferenceWeb/ModelService.cs rename to TensorSharp.Server/ModelService.cs index 4f8ddb1..b5ae0cc 100644 --- a/InferenceWeb/ModelService.cs +++ b/TensorSharp.Server/ModelService.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -16,14 +16,14 @@ using System.Text; using System.Threading; using System.Threading.Tasks; -using InferenceEngine; using TensorSharp; using TensorSharp.Cpu; -namespace InferenceWeb +namespace TensorSharp.Server { public class ModelService : IDisposable { + private readonly IPromptRenderer _promptRenderer = new GgufPromptRenderer(); private ModelBase _model; private string _loadedModelPath; private string _loadedMmProjPath; @@ -34,6 +34,7 @@ public class ModelService : IDisposable public bool IsLoaded => _model != null; public string LoadedModelName => _loadedModelPath != null ? Path.GetFileName(_loadedModelPath) : null; public string LoadedModelPath => _loadedModelPath; + public string LoadedMmProjName => _loadedMmProjPath != null ? Path.GetFileName(_loadedMmProjPath) : null; public string LoadedBackend => _model != null ? BackendCatalog.ToBackendValue(_backend) : null; public string Architecture => _model?.Config?.Architecture; public ModelBase Model => _model; @@ -54,6 +55,7 @@ public void InvalidateKVCache() /// /// Load a model. Must be called within the InferenceQueue to prevent concurrent access. 
+ /// When mmProjPath is null, auto-detection is used. Pass empty string to skip mmproj loading. /// public void LoadModel(string modelPath, string mmProjPath, string backendStr) { @@ -78,7 +80,7 @@ public void LoadModel(string modelPath, string mmProjPath, string backendStr) if (mmProjPath == null) mmProjPath = AutoDetectMmProj(modelPath); - if (mmProjPath != null && File.Exists(mmProjPath)) + if (!string.IsNullOrEmpty(mmProjPath) && File.Exists(mmProjPath)) { LoadEncoders(mmProjPath); _loadedMmProjPath = mmProjPath; @@ -113,19 +115,7 @@ private string AutoDetectMmProj(string modelPath) private void LoadEncoders(string mmProjPath) { - switch (_model) - { - case Gemma4Model g4: - g4.LoadVisionEncoder(mmProjPath); - g4.LoadAudioEncoder(mmProjPath); - break; - case Gemma3Model g3: - g3.LoadVisionEncoder(mmProjPath); - break; - case Qwen35Model q35: - q35.LoadVisionEncoder(mmProjPath); - break; - } + _model?.MultimodalInjector.LoadProjectors(mmProjPath); } /// @@ -141,7 +131,7 @@ public async IAsyncEnumerable ChatStreamAsync( { string arch = _model.Config.Architecture; var preparedHistory = PrepareHistoryForInference(history, arch); - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = _promptRenderer.Render( _model.Config.ChatTemplate, preparedHistory, addGenerationPrompt: true, architecture: arch, tools: tools, enableThinking: enableThinking); @@ -151,7 +141,21 @@ public async IAsyncEnumerable ChatStreamAsync( bool hasMultimodal = HasMultimodalContent(preparedHistory); var inputTokens = _model.Tokenizer.Encode(rendered, addSpecial: true); - inputTokens = ProcessMultimodalHistory(preparedHistory, inputTokens, arch); + inputTokens = _model.MultimodalInjector.ProcessPromptTokens(preparedHistory, inputTokens); + + int maxCtx = _model.MaxContextLength; + if (maxCtx > 0 && inputTokens.Count + maxTokens > maxCtx) + { + int available = maxCtx - maxTokens; + if (available < 1) + throw new InvalidOperationException( + $"Prompt ({inputTokens.Count} 
tokens) exceeds the model's context limit ({maxCtx} tokens). " + + "Please shorten the input or reduce attached file size."); + + Console.WriteLine($"[Context] Truncating prompt from {inputTokens.Count} to {available} tokens (context limit {maxCtx}, reserving {maxTokens} for generation)"); + inputTokens = inputTokens.GetRange(inputTokens.Count - available, available); + _cachedTokens = null; + } float[] logits; int commonPrefix = ComputeUsablePrefix(inputTokens, hasMultimodal); @@ -468,12 +472,25 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB }; var preparedMessages = PrepareHistoryForInference(messages, arch); - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = _promptRenderer.Render( _model.Config.ChatTemplate, preparedMessages, addGenerationPrompt: true, architecture: arch); var inputTokens = _model.Tokenizer.Encode(rendered, addSpecial: true); - inputTokens = ProcessMultimodalHistory(preparedMessages, inputTokens, arch); + inputTokens = _model.MultimodalInjector.ProcessPromptTokens(preparedMessages, inputTokens); + + int maxCtx = _model.MaxContextLength; + if (maxCtx > 0 && inputTokens.Count + maxTokens > maxCtx) + { + int available = maxCtx - maxTokens; + if (available < 1) + throw new InvalidOperationException( + $"Prompt ({inputTokens.Count} tokens) exceeds the model's context limit ({maxCtx} tokens). 
" + + "Please shorten the input or reduce attached file size."); + + Console.WriteLine($"[Context] Truncating prompt from {inputTokens.Count} to {available} tokens (context limit {maxCtx}, reserving {maxTokens} for generation)"); + inputTokens = inputTokens.GetRange(inputTokens.Count - available, available); + } InvalidateKVCache(); @@ -534,7 +551,7 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB { string arch = _model.Config.Architecture; var preparedHistory = PrepareHistoryForInference(history, arch); - string rendered = ChatTemplate.RenderFromGgufTemplate( + string rendered = _promptRenderer.Render( _model.Config.ChatTemplate, preparedHistory, addGenerationPrompt: true, architecture: arch, tools: tools, enableThinking: enableThinking); @@ -544,7 +561,21 @@ public bool EnsureModelLoaded(string modelName, string modelDir, string defaultB bool hasMultimodal = HasMultimodalContent(preparedHistory); var inputTokens = _model.Tokenizer.Encode(rendered, addSpecial: true); - inputTokens = ProcessMultimodalHistory(preparedHistory, inputTokens, arch); + inputTokens = _model.MultimodalInjector.ProcessPromptTokens(preparedHistory, inputTokens); + + int maxCtx = _model.MaxContextLength; + if (maxCtx > 0 && inputTokens.Count + maxTokens > maxCtx) + { + int available = maxCtx - maxTokens; + if (available < 1) + throw new InvalidOperationException( + $"Prompt ({inputTokens.Count} tokens) exceeds the model's context limit ({maxCtx} tokens). 
" + + "Please shorten the input or reduce attached file size."); + + Console.WriteLine($"[Context] Truncating prompt from {inputTokens.Count} to {available} tokens (context limit {maxCtx}, reserving {maxTokens} for generation)"); + inputTokens = inputTokens.GetRange(inputTokens.Count - available, available); + _cachedTokens = null; + } int promptTokenCount; var sw = Stopwatch.StartNew(); @@ -648,44 +679,7 @@ private static int FindValidUtf8Length(List bytes) /// private int ComputeUsablePrefix(List inputTokens, bool hasMultimodal) { - if (hasMultimodal || !_model.SupportsKVCacheTruncation) - return 0; - - int raw = FindTokenPrefixLength(_cachedTokens, inputTokens); - if (raw <= 0) - return 0; - - // For SWA models, back up by slidingWindow so the suffix re-processes - // enough tokens to rebuild the SWA ring buffer with fresh, ordered K/V. - int swa = _model.Config.SlidingWindow; - if (swa > 0) - { - int backed = Math.Max(0, raw - swa); - Console.WriteLine($"[KV cache] SWA back-up: raw prefix {raw} → {backed} (window={swa})"); - raw = backed; - } - - if (raw < 4) - { - Console.WriteLine($"[KV cache] Common prefix too short ({raw} tokens), doing full reset"); - return 0; - } - - double savingsRatio = (double)raw / inputTokens.Count; - if (savingsRatio < 0.10) - { - Console.WriteLine($"[KV cache] Savings too small ({raw}/{inputTokens.Count} = {100 * savingsRatio:F0}%), doing full reset"); - return 0; - } - - if (_cachedTokens != null && raw < _cachedTokens.Count) - { - string cachedTokStr = _cachedTokens.Count > raw ? _cachedTokens[raw].ToString() : "N/A"; - string newTokStr = inputTokens.Count > raw ? inputTokens[raw].ToString() : "N/A"; - Console.WriteLine($"[KV cache] Divergence at index {raw}: cached={cachedTokStr}, new={newTokStr} (cached total={_cachedTokens.Count}, new total={inputTokens.Count})"); - } - - return raw; + return _model?.KVCachePolicy.ComputeReusablePrefix(_model, _cachedTokens, inputTokens, hasMultimodal) ?? 
0; } /// @@ -809,10 +803,26 @@ public List ScanModels(string directory) if (!Directory.Exists(directory)) return new List(); return Directory.GetFiles(directory, "*.gguf") .Select(Path.GetFileName) + .Where(f => !IsMmProjFile(f)) + .OrderBy(f => f) + .ToList(); + } + + public List ScanMmProjModels(string directory) + { + if (!Directory.Exists(directory)) return new List(); + return Directory.GetFiles(directory, "*.gguf") + .Select(Path.GetFileName) + .Where(IsMmProjFile) .OrderBy(f => f) .ToList(); } + private static bool IsMmProjFile(string fileName) + { + return fileName.IndexOf("mmproj", StringComparison.OrdinalIgnoreCase) >= 0; + } + public void Dispose() { _model?.Dispose(); @@ -820,3 +830,5 @@ public void Dispose() } } } + + diff --git a/InferenceWeb/OpenAIResponseFormatParser.cs b/TensorSharp.Server/OpenAIResponseFormatParser.cs similarity index 97% rename from InferenceWeb/OpenAIResponseFormatParser.cs rename to TensorSharp.Server/OpenAIResponseFormatParser.cs index e1215b5..eca3956 100644 --- a/InferenceWeb/OpenAIResponseFormatParser.cs +++ b/TensorSharp.Server/OpenAIResponseFormatParser.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. // https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -10,9 +10,8 @@ using System; using System.Text.Json; -using InferenceEngine; -namespace InferenceWeb +namespace TensorSharp.Server { public static class OpenAIResponseFormatParser { @@ -108,3 +107,5 @@ strictEl.ValueKind is JsonValueKind.True or JsonValueKind.False && } } } + + diff --git a/InferenceWeb/Program.cs b/TensorSharp.Server/Program.cs similarity index 96% rename from InferenceWeb/Program.cs rename to TensorSharp.Server/Program.cs index 7a20a2c..3bd0ec5 100644 --- a/InferenceWeb/Program.cs +++ b/TensorSharp.Server/Program.cs @@ -1,4 +1,4 @@ -// Copyright (c) Zhongkai Fu. All rights reserved. +// Copyright (c) Zhongkai Fu. All rights reserved. 
// https://github.com/zhongkaifu/TensorSharp // // This file is part of TensorSharp. @@ -14,8 +14,6 @@ using System.Linq; using System.Text; using System.Text.Json; -using InferenceEngine; -using InferenceWeb; using Microsoft.AspNetCore.Builder; using Microsoft.AspNetCore.Hosting; using Microsoft.AspNetCore.Http; @@ -64,6 +62,11 @@ RequestPath = "/uploads" }); +int maxTextFileChars = 8000; +string maxTextEnv = Environment.GetEnvironmentVariable("MAX_TEXT_FILE_CHARS"); +if (!string.IsNullOrEmpty(maxTextEnv) && int.TryParse(maxTextEnv, out int envMax) && envMax > 0) + maxTextFileChars = envMax; + string modelDir = Environment.GetEnvironmentVariable("MODEL_DIR") ?? Path.Combine(AppContext.BaseDirectory, "models"); string configuredBackend = Environment.GetEnvironmentVariable("BACKEND") @@ -123,10 +126,13 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack app.MapGet("/api/models", (ModelService svc) => { var files = svc.ScanModels(modelDir); + var mmProjFiles = svc.ScanMmProjModels(modelDir); return Results.Json(new { models = files, + mmProjModels = mmProjFiles, loaded = svc.LoadedModelName, + loadedMmProj = svc.LoadedMmProjName, loadedBackend = svc.LoadedBackend, defaultBackend, supportedBackends, @@ -149,7 +155,23 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack if (!File.Exists(modelPath)) return Results.NotFound(new { error = $"Model not found: {modelName}" }); - string mmProjPath = mmproj != null ? 
Path.Combine(modelDir, mmproj) : null; + // mmproj handling: + // null/absent -> auto-detect (ModelService default) + // ""/"none" -> explicitly no mmproj (pass empty string to skip auto-detect) + // "filename" -> use that specific mmproj file + string mmProjPath; + if (mmproj == null) + { + mmProjPath = null; // auto-detect + } + else if (string.IsNullOrWhiteSpace(mmproj) || string.Equals(mmproj, "none", StringComparison.OrdinalIgnoreCase)) + { + mmProjPath = ""; // explicit skip + } + else + { + mmProjPath = Path.Combine(modelDir, mmproj); + } using var ticket = queue.Enqueue(ctx.RequestAborted); await ticket.WaitUntilReadyAsync(); @@ -161,6 +183,7 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack { ok = true, model = svc.LoadedModelName, + loadedMmProj = svc.LoadedMmProjName, architecture = svc.Architecture }); } @@ -192,6 +215,11 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack ".png" or ".jpg" or ".jpeg" or ".gif" or ".webp" or ".bmp" => "image", ".mp4" or ".mov" or ".avi" or ".mkv" or ".webm" => "video", ".mp3" or ".wav" or ".ogg" or ".flac" or ".m4a" => "audio", + ".txt" or ".csv" or ".json" or ".xml" or ".md" or ".log" + or ".py" or ".js" or ".ts" or ".cs" or ".java" or ".cpp" or ".c" or ".h" + or ".html" or ".css" or ".yaml" or ".yml" or ".toml" or ".ini" or ".cfg" + or ".sh" or ".bat" or ".ps1" or ".rb" or ".go" or ".rs" or ".swift" + or ".kt" or ".sql" or ".r" or ".m" or ".tex" or ".rtf" => "text", _ => "unknown" }; @@ -209,6 +237,18 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack }); } + if (mediaType == "text") + { + string textContent = await File.ReadAllTextAsync(savePath); + bool truncated = false; + if (textContent.Length > maxTextFileChars) + { + textContent = textContent.Substring(0, maxTextFileChars); + truncated = true; + } + return Results.Json(new { ok = true, path = savePath, mediaType, fileName = file.FileName, textContent, truncated }); 
+ } + return Results.Json(new { ok = true, path = savePath, mediaType, fileName = file.FileName }); }); @@ -294,6 +334,7 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack } bool aborted = false; + string inferenceError = null; try { await foreach (var piece in svc.ChatStreamAsync(messages, maxTokens, ctx.RequestAborted, samplingConfig, @@ -338,6 +379,11 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack } } catch (OperationCanceledException) { aborted = true; } + catch (Exception ex) + { + Console.Error.WriteLine($"[Chat error] {ex.Message}"); + inferenceError = ex.Message; + } try { @@ -374,7 +420,7 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack sw.Stop(); double tokPerSec = tokenCount > 0 ? tokenCount / sw.Elapsed.TotalSeconds : 0; string done = JsonSerializer.Serialize(new { done = true, tokenCount, elapsed = sw.Elapsed.TotalSeconds, tokPerSec, - aborted }); + aborted, error = inferenceError }); await ctx.Response.WriteAsync($"data: {done}\n\n"); await ctx.Response.Body.FlushAsync(); } @@ -385,7 +431,7 @@ bool TryResolveSupportedBackend(string requestedBackend, out string resolvedBack // Ollama-compatible API endpoints // ============================================================ -app.MapGet("/", () => Results.Ok("InferenceWeb is running")); +app.MapGet("/", () => Results.Ok("TensorSharp.Server is running")); app.MapGet("/api/version", () => Results.Json(new { version = "0.1.0" })); app.MapGet("/api/tags", (ModelService svc) => @@ -1604,7 +1650,7 @@ static string ResolveModelPath(string modelName, string modelDir) Console.WriteLine($"Model directory: {modelDir}"); Console.WriteLine($"Video max frames: {MediaHelper.GetConfiguredMaxVideoFrames()}"); -Console.WriteLine("Starting InferenceWeb on http://localhost:5000"); +Console.WriteLine("Starting TensorSharp.Server on http://localhost:5000"); Console.WriteLine("API endpoints:"); Console.WriteLine(" GET / - 
Health check"); Console.WriteLine(" GET /api/tags - List available models (Ollama)"); @@ -1617,3 +1663,6 @@ static string ResolveModelPath(string modelName, string modelDir) Console.WriteLine(" POST /api/models/load - Load model (Web UI)"); Console.WriteLine(" GET /api/models - List models (Web UI)"); app.Run("http://0.0.0.0:5000"); + + + diff --git a/InferenceWeb/InferenceWeb.csproj b/TensorSharp.Server/TensorSharp.Server.csproj similarity index 76% rename from InferenceWeb/InferenceWeb.csproj rename to TensorSharp.Server/TensorSharp.Server.csproj index efcbddb..a9dc841 100644 --- a/InferenceWeb/InferenceWeb.csproj +++ b/TensorSharp.Server/TensorSharp.Server.csproj @@ -4,6 +4,7 @@ true false bin\ + HTTP and web server host for TensorSharp runtime and model services. @@ -14,7 +15,9 @@ libGgmlOps.so - + + + diff --git a/InferenceWeb/WebUiChatPolicy.cs b/TensorSharp.Server/WebUiChatPolicy.cs similarity index 94% rename from InferenceWeb/WebUiChatPolicy.cs rename to TensorSharp.Server/WebUiChatPolicy.cs index 4713c28..3be4823 100644 --- a/InferenceWeb/WebUiChatPolicy.cs +++ b/TensorSharp.Server/WebUiChatPolicy.cs @@ -1,4 +1,4 @@ -namespace InferenceWeb; +namespace TensorSharp.Server; internal static class WebUiChatPolicy { @@ -17,3 +17,4 @@ public static bool TryValidateChatRequest(string requestedModel, string requeste return false; } } + diff --git a/InferenceWeb/test_requests.jsonl b/TensorSharp.Server/test_requests.jsonl similarity index 100% rename from InferenceWeb/test_requests.jsonl rename to TensorSharp.Server/test_requests.jsonl diff --git a/InferenceWeb/testdata/README.md b/TensorSharp.Server/testdata/README.md similarity index 94% rename from InferenceWeb/testdata/README.md rename to TensorSharp.Server/testdata/README.md index 7c83b28..f290d05 100644 --- a/InferenceWeb/testdata/README.md +++ b/TensorSharp.Server/testdata/README.md @@ -1,14 +1,14 @@ -# InferenceWeb Multi-Turn Chat Integration Tests +# TensorSharp.Server Multi-Turn Chat Integration Tests 
-Two test suites that simulate real users having long multi-turn conversations with InferenceWeb across all API surfaces. +Two test suites that simulate real users having long multi-turn conversations with TensorSharp.Server across all API surfaces. For the Web UI flow, the tests load a model once through `/api/models/load` and then send turns to `/api/chat` without per-request model switching. ## Quick Start -1. Start InferenceWeb: +1. Start TensorSharp.Server: ```bash -MODEL_DIR=~/models BACKEND=ggml_metal ./InferenceWeb +MODEL_DIR=~/models BACKEND=ggml_metal ./TensorSharp.Server ``` 2. Run tests (pick one): @@ -75,3 +75,4 @@ python3 test_multiturn.py --max-tokens 120 # longer respons - **Structured outputs**: OpenAI-style `response_format` schemas are validated and enforced - **Abort support**: Mid-generation cancellation works and releases the queue - **Metrics**: Timing and token count metrics are present in done events + diff --git a/InferenceWeb/testdata/test_multiturn.py b/TensorSharp.Server/testdata/test_multiturn.py similarity index 99% rename from InferenceWeb/testdata/test_multiturn.py rename to TensorSharp.Server/testdata/test_multiturn.py index 9887ee8..5a16933 100644 --- a/InferenceWeb/testdata/test_multiturn.py +++ b/TensorSharp.Server/testdata/test_multiturn.py @@ -1,6 +1,6 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python3 """ -Multi-Turn Chat Integration Tests for InferenceWeb +Multi-Turn Chat Integration Tests for TensorSharp.Server Runs comprehensive multi-turn conversation tests against all API surfaces, validating response structure, context retention, and edge cases. 
@@ -619,7 +619,7 @@ def test_queue_status(self): # Run all tests # ========================================================================= def run_all(self): - self.header("InferenceWeb Multi-Turn Integration Tests (Python)") + self.header("TensorSharp.Server Multi-Turn Integration Tests (Python)") self.log(f"Server: {self.base_url}") self.log(f"Model: {self.model}") @@ -673,9 +673,9 @@ def run_all(self): def main(): - parser = argparse.ArgumentParser(description="Multi-turn chat integration tests for InferenceWeb") + parser = argparse.ArgumentParser(description="Multi-turn chat integration tests for TensorSharp.Server") parser.add_argument("--model", type=str, default=None, help="Model filename (auto-detected if omitted)") - parser.add_argument("--url", type=str, default="http://localhost:5000", help="InferenceWeb base URL") + parser.add_argument("--url", type=str, default="http://localhost:5000", help="TensorSharp.Server base URL") parser.add_argument("--max-tokens", type=int, default=80, help="Max tokens per response") args = parser.parse_args() @@ -701,3 +701,4 @@ def main(): if __name__ == "__main__": main() + diff --git a/InferenceWeb/testdata/test_multiturn.sh b/TensorSharp.Server/testdata/test_multiturn.sh similarity index 99% rename from InferenceWeb/testdata/test_multiturn.sh rename to TensorSharp.Server/testdata/test_multiturn.sh index a222f08..3618e79 100644 --- a/InferenceWeb/testdata/test_multiturn.sh +++ b/TensorSharp.Server/testdata/test_multiturn.sh @@ -1,6 +1,6 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # ============================================================================= -# Multi-Turn Chat Integration Tests for InferenceWeb +# Multi-Turn Chat Integration Tests for TensorSharp.Server # # Tests long multi-turn conversations across all API surfaces: # 1. Web UI SSE API (/api/chat) @@ -8,7 +8,7 @@ # 3. 
OpenAI-compatible API (/v1/chat/completions) # # Prerequisites: -# - InferenceWeb running on localhost:5000 +# - TensorSharp.Server running on localhost:5000 # - At least one .gguf model available in MODEL_DIR # - curl and jq installed # @@ -813,7 +813,7 @@ test_abort_generation() { # ============================================================================= main() { - header "InferenceWeb Multi-Turn Chat Integration Tests" + header "TensorSharp.Server Multi-Turn Chat Integration Tests" check_deps wait_for_server auto_detect_model @@ -850,3 +850,4 @@ main() { } main + diff --git a/InferenceWeb/wwwroot/images/assistant_logo.png b/TensorSharp.Server/wwwroot/images/assistant_logo.png similarity index 100% rename from InferenceWeb/wwwroot/images/assistant_logo.png rename to TensorSharp.Server/wwwroot/images/assistant_logo.png diff --git a/InferenceWeb/wwwroot/images/banner_1.png b/TensorSharp.Server/wwwroot/images/banner_1.png similarity index 52% rename from InferenceWeb/wwwroot/images/banner_1.png rename to TensorSharp.Server/wwwroot/images/banner_1.png index 6a4a87e..beffe29 100644 Binary files a/InferenceWeb/wwwroot/images/banner_1.png and b/TensorSharp.Server/wwwroot/images/banner_1.png differ diff --git a/InferenceWeb/wwwroot/index.html b/TensorSharp.Server/wwwroot/index.html similarity index 92% rename from InferenceWeb/wwwroot/index.html rename to TensorSharp.Server/wwwroot/index.html index ab5a434..42b1c05 100644 --- a/InferenceWeb/wwwroot/index.html +++ b/TensorSharp.Server/wwwroot/index.html @@ -781,7 +781,7 @@

-

Load a model and start chatting. You can attach images, videos, and audio files for multimodal inference.

+

Load a model and start chatting. You can attach images, videos, audio, and text files for multimodal inference.

@@ -808,7 +808,7 @@

- + @@ -819,6 +819,12 @@

Load Model

+
+ + +