zhongkaifu · zhongkaifu · Apr 13, 2026 · Apr 12, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/InferenceWeb.Tests/BackendCatalogTests.cs b/InferenceWeb.Tests/BackendCatalogTests.cs
@@ -1,6 +1,4 @@
-using InferenceEngine;
-using InferenceWeb;
-using TensorSharp.GGML;
+using TensorSharp.GGML;
 
 namespace InferenceWeb.Tests;
 
@@ -121,3 +119,5 @@ public void ShouldStoreWeightQuantized_GgmlBackendsKeepQuantizedWeights()
         Assert.True(shouldStoreQuantized);
     }
 }
+
+
diff --git a/InferenceWeb.Tests/GlobalUsings.cs b/InferenceWeb.Tests/GlobalUsings.cs
@@ -0,0 +1,3 @@
+global using TensorSharp.Models;
+global using TensorSharp.Runtime;
+global using TensorSharp.Server;
diff --git a/InferenceWeb.Tests/ImageProcessorTests.cs b/InferenceWeb.Tests/ImageProcessorTests.cs
@@ -1,5 +1,4 @@
-using InferenceEngine;
-
+
 namespace InferenceWeb.Tests;
 
 public class ImageProcessorTests
@@ -95,3 +94,4 @@ private static string WriteEmbeddedJpeg()
         return path;
     }
 }
+
diff --git a/InferenceWeb.Tests/InferenceWeb.Tests.csproj b/InferenceWeb.Tests/InferenceWeb.Tests.csproj
@@ -15,7 +15,8 @@
     <Using Include="Xunit" />
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="..\InferenceEngine\InferenceEngine.csproj" />
-    <ProjectReference Include="..\InferenceWeb\InferenceWeb.csproj" />
+    <ProjectReference Include="..\TensorSharp.Runtime\TensorSharp.Runtime.csproj" />
+    <ProjectReference Include="..\TensorSharp.Models\TensorSharp.Models.csproj" />
+    <ProjectReference Include="..\TensorSharp.Server\TensorSharp.Server.csproj" />
   </ItemGroup>
 </Project>
diff --git a/InferenceWeb.Tests/KVCacheTests.cs b/InferenceWeb.Tests/KVCacheTests.cs
@@ -1,5 +1,4 @@
-using InferenceWeb;
-
+
 namespace InferenceWeb.Tests;
 
 public class KVCacheTests
@@ -129,3 +128,4 @@ public void FindTokenPrefixLength_ThinkingModelWithContentInContext()
         Assert.Equal(8, common); // Full cached is prefix
     }
 }
+
diff --git a/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs b/InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
@@ -1,5 +1,4 @@
-using System.Buffers.Binary;
-using InferenceEngine;
+using System.Buffers.Binary;
 
 namespace InferenceWeb.Tests;
 
@@ -168,3 +167,4 @@ private static float Dot(float[] lhs, float[] rhs, int rhsOffset, int length)
         return sum;
     }
 }
+
diff --git a/InferenceWeb.Tests/MediaHelperTests.cs b/InferenceWeb.Tests/MediaHelperTests.cs
@@ -1,5 +1,4 @@
-using InferenceEngine;
-
+
 namespace InferenceWeb.Tests;
 
 public class MediaHelperTests
@@ -71,3 +70,4 @@ public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride()
         }
     }
 }
+
diff --git a/InferenceWeb.Tests/ModelServiceHistoryTests.cs b/InferenceWeb.Tests/ModelServiceHistoryTests.cs
@@ -1,6 +1,4 @@
-using InferenceEngine;
-using InferenceWeb;
-
+
 namespace InferenceWeb.Tests;
 
 public class ModelServiceHistoryTests
@@ -70,3 +68,5 @@ public void PrepareHistoryForInference_NormalizesEarlierVideoTurns()
         }
     }
 }
+
+
diff --git a/InferenceWeb.Tests/StructuredOutputTests.cs b/InferenceWeb.Tests/StructuredOutputTests.cs
@@ -1,6 +1,4 @@
-using System.Text.Json;
-using InferenceEngine;
-using InferenceWeb;
+using System.Text.Json;
 
 namespace InferenceWeb.Tests;
 
@@ -206,3 +204,5 @@ public void JsonSchemaNormalizationSupportsDefsAndAnyOf()
         Assert.Equal("""{"item":{"name":"Ada","age":30}}""", normalized.NormalizedContent);
     }
 }
+
+
diff --git a/InferenceWeb.Tests/WebUiChatPolicyTests.cs b/InferenceWeb.Tests/WebUiChatPolicyTests.cs
@@ -1,5 +1,4 @@
-using InferenceWeb;
-
+
 namespace InferenceWeb.Tests;
 
 public class WebUiChatPolicyTests
@@ -31,3 +30,4 @@ public void TryValidateChatRequest_RejectsPerTurnBackendSelection()
         Assert.Equal(WebUiChatPolicy.ModelSelectionLockedMessage, error);
     }
 }
+
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# TensorSharp
+# TensorSharp
 
 <p align="center">
   <img src="imgs/banner_1.png" alt="TensorSharp logo" width="320">
@@ -10,8 +10,8 @@ A C# inference engine for running large language models (LLMs) locally using GGU
 
 ## Features
 
-- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H
-- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5
+- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H, Mistral 3
+- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5 / Mistral 3
 - **Thinking / reasoning mode** -- structured chain-of-thought output with `<think>` / `<|channel>thought` / `<|channel>analysis` tags (Qwen 3, Qwen 3.5, Gemma 4, GPT OSS, Nemotron-H)
 - **Tool calling / function calling** -- models can invoke user-defined tools; multi-turn tool-call conversations supported across all three API styles
 - **Quantized model support** -- loads GGUF files with Q4_K_M, Q8_0, F16, MXFP4, and other quantization formats; performs native quantized matmul without dequantizing to FP32, including memory-efficient pure C# CPU loading for large GGUFs
@@ -38,6 +38,7 @@ A C# inference engine for running large language models (LLMs) locally using GGU
 | Qwen 3.5 | Qwen3.5-9B, Qwen3.5-35B-A3B | Image | Yes | Yes |
 | GPT OSS | gpt-oss-20b (MoE) | Text only | Yes | No |
 | Nemotron-H | Nemotron-H-8B, Nemotron-H-47B (Hybrid SSM-Transformer, MoE) | Text only | Yes | Yes |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | Image | No | No |
 
 See [Model Architecture Cards](docs/model_cards.md) for detailed documentation of each architecture.
 
@@ -58,6 +59,8 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
 | GPT OSS | gpt-oss-20b (MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
 | Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
 | Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
+| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
+| Mistral 3 | mistral3-mmproj (Pixtral vision projector) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
 
 ## Compute Backends
 
@@ -72,36 +75,37 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
 
 ```
 TensorSharp/
-├── TensorSharp/                 # Core tensor library (CPU operations, SIMD)
-├── TensorSharp.GGML/            # GGML backend bindings (Metal/CUDA/CPU via native library)
+├── TensorSharp.Core/            # Core tensor library (Tensor, Ops, memory, device abstraction)
+├── TensorSharp.Runtime/         # GGUF, tokenizers, templates, sampling, protocol parsing
+├── TensorSharp.Models/          # Model architectures and multimodal encoders/injectors
+├── TensorSharp.Backends.GGML/   # GGML backend bindings (Metal/CUDA/CPU via native library)
 ├── TensorSharp.GGML.Native/     # Native C++ bridge to ggml (builds libGgmlOps)
-├── AdvUtils/                    # Utility library
-├── InferenceEngine/             # Model loading, tokenization, and inference logic
-│   ├── Models/
-│   │   ├── Gemma3/
-│   │   ├── Gemma4/              # Vision encoder, audio encoder, MoE, fused GPU decode
-│   │   ├── GptOss/              # MoE, attention sinks, SiLUAlphaLimit, Yarn RoPE
-│   │   ├── Nemotron/            # Hybrid Mamba2 SSM + attention + MoE FFN
-│   │   ├── Qwen3/
-│   │   └── Qwen35/
-│   ├── GgufReader.cs            # GGUF file parser
-│   ├── ModelBase.cs             # Base class for all model architectures
-│   ├── ChatTemplate.cs          # Chat template rendering (hardcoded + Jinja2 from GGUF)
-│   ├── Jinja2Template.cs        # Jinja2 template renderer
-│   ├── OutputParser.cs          # Extracts thinking, content, and tool calls from model output
-│   ├── SamplingConfig.cs        # Sampling parameter configuration
-│   ├── TokenSampler.cs          # Token sampling (greedy, top-k, top-p, min-p, penalties)
-│   └── MediaHelper.cs           # Video frame extraction, audio decoding
-├── InferenceConsole/            # CLI application
-├── InferenceWeb/                # Web chatbot + API server (ASP.NET Core)
+├── TensorSharp.Server/          # Web chatbot + API server (ASP.NET Core)
 │   ├── ModelService.cs          # Model lifecycle management
 │   ├── InferenceQueue.cs        # FIFO request queue with position tracking
 │   ├── wwwroot/index.html       # Chat UI
 │   ├── testdata/                # Integration test suites (bash + Python)
 │   └── API_EXAMPLES.md          # Detailed API documentation
+├── TensorSharp.Cli/             # CLI application
+├── AdvUtils/                    # Utility library
 └── ExternalProjects/            # Third-party dependencies (ggml)
 ```
 
+## NuGet Packages
+
+The repository is split along package boundaries so that consumers can depend on only the layers they need.
+
+| Project | NuGet package | Public namespace | Responsibility |
+|---|---|---|---|
+| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor primitives, ops, allocators, storage, and device abstraction |
+| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF parsing, tokenizers, prompt rendering, sampling, and output protocol parsing |
+| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`, architecture implementations, multimodal encoders, and model-side execution helpers |
+| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML-backed execution and native interop |
+| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core server, OpenAI/Ollama adapters, queueing, and web UI |
+| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | Console host and debugging / batch tooling |
+
+This split lets engine users avoid taking a dependency on the web stack, prevents API-layer changes from leaking into the core/runtime packages, and makes future benchmark or eval-harness projects easier to publish independently.
+
 ## Prerequisites
 
 - [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
@@ -121,10 +125,10 @@ dotnet build TensorSharp.slnx
 
 ```bash
 # Console application
-dotnet build InferenceConsole/InferenceConsole.csproj
+dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj
 
 # Web application
-dotnet build InferenceWeb/InferenceWeb.csproj
+dotnet build TensorSharp.Server/TensorSharp.Server.csproj
 ```
 
 ### Build the native GGML library
@@ -166,7 +170,7 @@ TENSORSHARP_GGML_NATIVE_BUILD_PARALLEL_LEVEL=2 bash build-linux.sh --cuda
 You can also request a CUDA-enabled native build from `dotnet build`:
 
 ```bash
-TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
+TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release
 ```
 
 On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory.
@@ -176,38 +180,38 @@ On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `bui
 ### Console Application
 
 ```bash
-cd InferenceConsole/bin
+cd TensorSharp.Cli/bin
 
 # Text inference
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
     --max-tokens 200 --backend ggml_metal
 
 # Text inference on Linux + NVIDIA GPU
-./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
     --max-tokens 200 --backend ggml_cuda
 
 # Image inference (Gemma 3/4, Qwen 3.5)
-./InferenceConsole --model <model.gguf> --image photo.png --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --image photo.png --backend ggml_metal
 
 # Video inference (Gemma 4)
-./InferenceConsole --model <model.gguf> --video clip.mp4 --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --video clip.mp4 --backend ggml_metal
 
 # Audio inference (Gemma 4)
-./InferenceConsole --model <model.gguf> --audio speech.wav --backend ggml_metal
+./TensorSharp.Cli --model <model.gguf> --audio speech.wav --backend ggml_metal
 
 # Thinking / reasoning mode
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal --think
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal --think
 
 # Tool calling
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
     --tools tools.json
 
 # With sampling parameters
-./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
+./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
     --temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42
 
 # Batch processing (JSONL)
-./InferenceConsole --model <model.gguf> --input-jsonl requests.jsonl \
+./TensorSharp.Cli --model <model.gguf> --input-jsonl requests.jsonl \
     --output results.txt --backend ggml_metal
 ```
 
@@ -253,13 +257,13 @@ Each line is a JSON object with `messages`, optional `prompt`, and optional samp
 ### Web Application
 
 ```bash
-cd InferenceWeb/bin
+cd TensorSharp.Server/bin
 
 # Set environment variables and run
-MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb
+MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server
 
 # Linux + NVIDIA GPU
-MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
+MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server
 ```
 
 Open `http://localhost:5000` in your browser. The web interface supports:
@@ -284,7 +288,7 @@ Open `http://localhost:5000` in your browser. The web interface supports:
 
 ### HTTP APIs
 
-InferenceWeb exposes three API styles. See [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md) for full documentation with curl and Python examples.
+TensorSharp.Server exposes three API styles. See [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md) for full documentation with curl and Python examples.
 
 **Ollama-compatible API:**
 
@@ -403,17 +407,27 @@ Gemma 4 models support image, video, and audio inputs. Place the multimodal proj
 
 These models support image inputs with their respective multimodal projector files.
 
+### Mistral 3
+
+Mistral 3 supports image inputs via the Pixtral vision encoder. Place the multimodal projector (`mistral3-mmproj.gguf`) in the same directory as the model file for automatic loading.
+
+- **Images:** PNG, JPEG
+
 ## Architecture
 
 TensorSharp is structured as a layered system:
 
-1. **TensorSharp** provides the core `Tensor` type, storage abstraction, and an extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
+1. **TensorSharp.Core** provides the core `Tensor` type, storage abstraction, and the extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
 
-2. **TensorSharp.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
+2. **TensorSharp.Runtime** owns runtime-facing contracts and services: GGUF parsing, tokenization (SentencePiece / BPE), chat template rendering, configurable token sampling, output parsing, and reusable contracts such as `IModelArchitecture`, `IPromptRenderer`, `IOutputProtocolParser`, `IMultimodalInjector`, `IKVCachePolicy`, and `IBackendExecutionPlan`.
 
-3. **InferenceEngine** implements model-specific logic: GGUF parsing, tokenization (SentencePiece BPE), chat template rendering (Jinja2 from GGUF metadata with hardcoded fallbacks), configurable token sampling, output parsing (thinking extraction, tool-call extraction), and the forward pass for each architecture (including hybrid SSM-Transformer models like Nemotron-H with Mamba2 layers). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
+3. **TensorSharp.Models** implements `ModelBase` plus the concrete architectures and multimodal helpers (Gemma 3/4, Qwen 3/3.5, GPT OSS, Nemotron-H, Mistral 3). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
 
-4. **InferenceConsole** and **InferenceWeb** are application layers that handle I/O and user interaction. InferenceWeb provides Ollama-compatible and OpenAI-compatible REST APIs alongside a browser-based chat UI, with a FIFO inference queue to serialize concurrent requests.
+4. **TensorSharp.Backends.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
+
+5. **TensorSharp.Server** is the HTTP application layer. It provides Ollama-compatible and OpenAI-compatible REST APIs, the browser-based chat UI, upload handling, and the FIFO inference queue.
+
+6. **TensorSharp.Cli** is the console application layer for local prompts, multimodal experiments, prompt inspection, and JSONL batch workflows.
 
 ### Performance Optimizations
 
@@ -426,16 +440,16 @@ TensorSharp is structured as a layered system:
 
 ## Testing
 
-Integration tests for InferenceWeb are in `InferenceWeb/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
+Integration tests for TensorSharp.Server are in `TensorSharp.Server/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
 
 ```bash
-# Start InferenceWeb, then run:
-python3 InferenceWeb/testdata/test_multiturn.py
+# Start TensorSharp.Server, then run:
+python3 TensorSharp.Server/testdata/test_multiturn.py
 # or
-bash InferenceWeb/testdata/test_multiturn.sh
+bash TensorSharp.Server/testdata/test_multiturn.sh
 ```
 
-See [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md) for the full test matrix.
+See [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md) for the full test matrix.
 
 ## Author
 
@@ -444,3 +458,4 @@ Zhongkai Fu
 ## License
 
 See [LICENSE](LICENSE) for details.
+