Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions InferenceWeb.Tests/BackendCatalogTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using InferenceEngine;
using InferenceWeb;
using TensorSharp.GGML;
using TensorSharp.GGML;

namespace InferenceWeb.Tests;

Expand Down Expand Up @@ -121,3 +119,5 @@ public void ShouldStoreWeightQuantized_GgmlBackendsKeepQuantizedWeights()
Assert.True(shouldStoreQuantized);
}
}


3 changes: 3 additions & 0 deletions InferenceWeb.Tests/GlobalUsings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
global using TensorSharp.Models;
global using TensorSharp.Runtime;
global using TensorSharp.Server;
4 changes: 2 additions & 2 deletions InferenceWeb.Tests/ImageProcessorTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using InferenceEngine;


namespace InferenceWeb.Tests;

public class ImageProcessorTests
Expand Down Expand Up @@ -95,3 +94,4 @@ private static string WriteEmbeddedJpeg()
return path;
}
}

5 changes: 3 additions & 2 deletions InferenceWeb.Tests/InferenceWeb.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
<Using Include="Xunit" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\InferenceEngine\InferenceEngine.csproj" />
<ProjectReference Include="..\InferenceWeb\InferenceWeb.csproj" />
<ProjectReference Include="..\TensorSharp.Runtime\TensorSharp.Runtime.csproj" />
<ProjectReference Include="..\TensorSharp.Models\TensorSharp.Models.csproj" />
<ProjectReference Include="..\TensorSharp.Server\TensorSharp.Server.csproj" />
</ItemGroup>
</Project>
4 changes: 2 additions & 2 deletions InferenceWeb.Tests/KVCacheTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using InferenceWeb;


namespace InferenceWeb.Tests;

public class KVCacheTests
Expand Down Expand Up @@ -129,3 +128,4 @@ public void FindTokenPrefixLength_ThinkingModelWithContentInContext()
Assert.Equal(8, common); // Full cached is prefix
}
}

4 changes: 2 additions & 2 deletions InferenceWeb.Tests/ManagedQuantizedOpsTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.Buffers.Binary;
using InferenceEngine;
using System.Buffers.Binary;

namespace InferenceWeb.Tests;

Expand Down Expand Up @@ -168,3 +167,4 @@ private static float Dot(float[] lhs, float[] rhs, int rhsOffset, int length)
return sum;
}
}

4 changes: 2 additions & 2 deletions InferenceWeb.Tests/MediaHelperTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using InferenceEngine;


namespace InferenceWeb.Tests;

public class MediaHelperTests
Expand Down Expand Up @@ -71,3 +70,4 @@ public void GetConfiguredMaxVideoFramesUsesPositiveEnvironmentOverride()
}
}
}

6 changes: 3 additions & 3 deletions InferenceWeb.Tests/ModelServiceHistoryTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using InferenceEngine;
using InferenceWeb;


namespace InferenceWeb.Tests;

public class ModelServiceHistoryTests
Expand Down Expand Up @@ -70,3 +68,5 @@ public void PrepareHistoryForInference_NormalizesEarlierVideoTurns()
}
}
}


6 changes: 3 additions & 3 deletions InferenceWeb.Tests/StructuredOutputTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System.Text.Json;
using InferenceEngine;
using InferenceWeb;
using System.Text.Json;

namespace InferenceWeb.Tests;

Expand Down Expand Up @@ -206,3 +204,5 @@ public void JsonSchemaNormalizationSupportsDefsAndAnyOf()
Assert.Equal("""{"item":{"name":"Ada","age":30}}""", normalized.NormalizedContent);
}
}


4 changes: 2 additions & 2 deletions InferenceWeb.Tests/WebUiChatPolicyTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using InferenceWeb;


namespace InferenceWeb.Tests;

public class WebUiChatPolicyTests
Expand Down Expand Up @@ -31,3 +30,4 @@ public void TryValidateChatRequest_RejectsPerTurnBackendSelection()
Assert.Equal(WebUiChatPolicy.ModelSelectionLockedMessage, error);
}
}

115 changes: 65 additions & 50 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# TensorSharp
# TensorSharp

<p align="center">
<img src="imgs/banner_1.png" alt="TensorSharp logo" width="320">
Expand All @@ -10,8 +10,8 @@ A C# inference engine for running large language models (LLMs) locally using GGU

## Features

- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H
- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5
- **Multi-architecture support** -- Gemma 4, Gemma 3, Qwen 3, Qwen 3.5, GPT OSS, Nemotron-H, Mistral 3
- **Multimodal inference** -- image, video, and audio inputs (Gemma 4); images for Gemma 3 / Qwen 3.5 / Mistral 3
- **Thinking / reasoning mode** -- structured chain-of-thought output with `<think>` / `<|channel>thought` / `<|channel>analysis` tags (Qwen 3, Qwen 3.5, Gemma 4, GPT OSS, Nemotron-H)
- **Tool calling / function calling** -- models can invoke user-defined tools; multi-turn tool-call conversations supported across all three API styles
- **Quantized model support** -- loads GGUF files with Q4_K_M, Q8_0, F16, MXFP4, and other quantization formats; performs native quantized matmul without dequantizing to FP32, including memory-efficient pure C# CPU loading for large GGUFs
Expand All @@ -38,6 +38,7 @@ A C# inference engine for running large language models (LLMs) locally using GGU
| Qwen 3.5 | Qwen3.5-9B, Qwen3.5-35B-A3B | Image | Yes | Yes |
| GPT OSS | gpt-oss-20b (MoE) | Text only | Yes | No |
| Nemotron-H | Nemotron-H-8B, Nemotron-H-47B (Hybrid SSM-Transformer, MoE) | Text only | Yes | Yes |
| Mistral 3 | Mistral-Small-3.1-24B-Instruct | Image | No | No |

See [Model Architecture Cards](docs/model_cards.md) for detailed documentation of each architecture.

Expand All @@ -58,6 +59,8 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you
| GPT OSS | gpt-oss-20b (MoE) | [ggml-org/gpt-oss-20b-GGUF](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) |
| Nemotron-H | Nemotron-H-8B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-8B-Reasoning-128K-GGUF) |
| Nemotron-H | Nemotron-H-47B-Reasoning-128K | [bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF](https://huggingface.co/bartowski/nvidia_Nemotron-H-47B-Reasoning-128K-GGUF) |
| Mistral 3 | Mistral-Small-3.1-24B-Instruct | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |
| Mistral 3 | mistral3-mmproj (Pixtral vision projector) | [bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF](https://huggingface.co/bartowski/Mistral-Small-3.1-24B-Instruct-2503-GGUF) |

## Compute Backends

Expand All @@ -72,36 +75,37 @@ TensorSharp loads models in GGUF format. Below are Hugging Face links where you

```
TensorSharp/
├── TensorSharp/ # Core tensor library (CPU operations, SIMD)
├── TensorSharp.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library)
├── TensorSharp.Core/ # Core tensor library (Tensor, Ops, memory, device abstraction)
├── TensorSharp.Runtime/ # GGUF, tokenizers, templates, sampling, protocol parsing
├── TensorSharp.Models/ # Model architectures and multimodal encoders/injectors
├── TensorSharp.Backends.GGML/ # GGML backend bindings (Metal/CUDA/CPU via native library)
├── TensorSharp.GGML.Native/ # Native C++ bridge to ggml (builds libGgmlOps)
├── AdvUtils/ # Utility library
├── InferenceEngine/ # Model loading, tokenization, and inference logic
│ ├── Models/
│ │ ├── Gemma3/
│ │ ├── Gemma4/ # Vision encoder, audio encoder, MoE, fused GPU decode
│ │ ├── GptOss/ # MoE, attention sinks, SiLUAlphaLimit, Yarn RoPE
│ │ ├── Nemotron/ # Hybrid Mamba2 SSM + attention + MoE FFN
│ │ ├── Qwen3/
│ │ └── Qwen35/
│ ├── GgufReader.cs # GGUF file parser
│ ├── ModelBase.cs # Base class for all model architectures
│ ├── ChatTemplate.cs # Chat template rendering (hardcoded + Jinja2 from GGUF)
│ ├── Jinja2Template.cs # Jinja2 template renderer
│ ├── OutputParser.cs # Extracts thinking, content, and tool calls from model output
│ ├── SamplingConfig.cs # Sampling parameter configuration
│ ├── TokenSampler.cs # Token sampling (greedy, top-k, top-p, min-p, penalties)
│ └── MediaHelper.cs # Video frame extraction, audio decoding
├── InferenceConsole/ # CLI application
├── InferenceWeb/ # Web chatbot + API server (ASP.NET Core)
├── TensorSharp.Server/ # Web chatbot + API server (ASP.NET Core)
│ ├── ModelService.cs # Model lifecycle management
│ ├── InferenceQueue.cs # FIFO request queue with position tracking
│ ├── wwwroot/index.html # Chat UI
│ ├── testdata/ # Integration test suites (bash + Python)
│ └── API_EXAMPLES.md # Detailed API documentation
├── TensorSharp.Cli/ # CLI application
├── AdvUtils/ # Utility library
└── ExternalProjects/ # Third-party dependencies (ggml)
```

## NuGet Packages

The repository is now split along package boundaries so consumers can depend on only the layers they actually need.

| Project | NuGet package | Public namespace | Responsibility |
|---|---|---|---|
| `TensorSharp.Core` | `TensorSharp.Core` | `TensorSharp` | Tensor primitives, ops, allocators, storage, and device abstraction |
| `TensorSharp.Runtime` | `TensorSharp.Runtime` | `TensorSharp.Runtime` | GGUF parsing, tokenizers, prompt rendering, sampling, and output protocol parsing |
| `TensorSharp.Models` | `TensorSharp.Models` | `TensorSharp.Models` | `ModelBase`, architecture implementations, multimodal encoders, and model-side execution helpers |
| `TensorSharp.Backends.GGML` | `TensorSharp.Backends.GGML` | `TensorSharp.GGML` | GGML-backed execution and native interop |
| `TensorSharp.Server` | `TensorSharp.Server` | `TensorSharp.Server` | ASP.NET Core server, OpenAI/Ollama adapters, queueing, and web UI |
| `TensorSharp.Cli` | `TensorSharp.Cli` | `TensorSharp.Cli` | Console host and debugging / batch tooling |

This split keeps engine users off the web stack, keeps API-layer changes from leaking into core/runtime packages, and makes future benchmark or eval-harness projects easier to publish independently.

## Prerequisites

- [.NET 10 SDK](https://dotnet.microsoft.com/download/dotnet/10.0)
Expand All @@ -121,10 +125,10 @@ dotnet build TensorSharp.slnx

```bash
# Console application
dotnet build InferenceConsole/InferenceConsole.csproj
dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj

# Web application
dotnet build InferenceWeb/InferenceWeb.csproj
dotnet build TensorSharp.Server/TensorSharp.Server.csproj
```

### Build the native GGML library
Expand Down Expand Up @@ -166,7 +170,7 @@ TENSORSHARP_GGML_NATIVE_BUILD_PARALLEL_LEVEL=2 bash build-linux.sh --cuda
You can also request a CUDA-enabled native build from `dotnet build`:

```bash
TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build InferenceConsole/InferenceConsole.csproj -c Release
TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON dotnet build TensorSharp.Cli/TensorSharp.Cli.csproj -c Release
```

On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `build-linux.sh` preserves an existing CUDA-enabled build and auto-enables GGML_CUDA when a CUDA toolchain is detected; `build-linux.sh --cuda` and `TENSORSHARP_GGML_NATIVE_ENABLE_CUDA=ON` force CUDA explicitly. The build output is automatically copied to the application's output directory.
Expand All @@ -176,38 +180,38 @@ On macOS this compiles `libGgmlOps.dylib` with Metal GPU support. On Linux, `bui
### Console Application

```bash
cd InferenceConsole/bin
cd TensorSharp.Cli/bin

# Text inference
./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
--max-tokens 200 --backend ggml_metal

# Text inference on Linux + NVIDIA GPU
./InferenceConsole --model <model.gguf> --input prompt.txt --output result.txt \
./TensorSharp.Cli --model <model.gguf> --input prompt.txt --output result.txt \
--max-tokens 200 --backend ggml_cuda

# Image inference (Gemma 3/4, Qwen 3.5, Mistral 3)
./InferenceConsole --model <model.gguf> --image photo.png --backend ggml_metal
./TensorSharp.Cli --model <model.gguf> --image photo.png --backend ggml_metal

# Video inference (Gemma 4)
./InferenceConsole --model <model.gguf> --video clip.mp4 --backend ggml_metal
./TensorSharp.Cli --model <model.gguf> --video clip.mp4 --backend ggml_metal

# Audio inference (Gemma 4)
./InferenceConsole --model <model.gguf> --audio speech.wav --backend ggml_metal
./TensorSharp.Cli --model <model.gguf> --audio speech.wav --backend ggml_metal

# Thinking / reasoning mode
./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal --think
./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal --think

# Tool calling
./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
--tools tools.json

# With sampling parameters
./InferenceConsole --model <model.gguf> --input prompt.txt --backend ggml_metal \
./TensorSharp.Cli --model <model.gguf> --input prompt.txt --backend ggml_metal \
--temperature 0.7 --top-p 0.9 --top-k 40 --repeat-penalty 1.2 --seed 42

# Batch processing (JSONL)
./InferenceConsole --model <model.gguf> --input-jsonl requests.jsonl \
./TensorSharp.Cli --model <model.gguf> --input-jsonl requests.jsonl \
--output results.txt --backend ggml_metal
```

Expand Down Expand Up @@ -253,13 +257,13 @@ Each line is a JSON object with `messages`, optional `prompt`, and optional samp
### Web Application

```bash
cd InferenceWeb/bin
cd TensorSharp.Server/bin

# Set environment variables and run
MODEL_DIR=./models BACKEND=ggml_metal ./InferenceWeb
MODEL_DIR=./models BACKEND=ggml_metal ./TensorSharp.Server

# Linux + NVIDIA GPU
MODEL_DIR=./models BACKEND=ggml_cuda ./InferenceWeb
MODEL_DIR=./models BACKEND=ggml_cuda ./TensorSharp.Server
```

Open `http://localhost:5000` in your browser. The web interface supports:
Expand All @@ -284,7 +288,7 @@ Open `http://localhost:5000` in your browser. The web interface supports:

### HTTP APIs

InferenceWeb exposes three API styles. See [API_EXAMPLES.md](InferenceWeb/API_EXAMPLES.md) for full documentation with curl and Python examples.
TensorSharp.Server exposes three API styles. See [API_EXAMPLES.md](TensorSharp.Server/API_EXAMPLES.md) for full documentation with curl and Python examples.

**Ollama-compatible API:**

Expand Down Expand Up @@ -403,17 +407,27 @@ Gemma 4 models support image, video, and audio inputs. Place the multimodal proj

These models support image inputs with their respective multimodal projector files.

### Mistral 3

Mistral 3 supports image inputs via the Pixtral vision encoder. Place the multimodal projector (`mistral3-mmproj.gguf`) in the same directory as the model file for automatic loading.

- **Images:** PNG, JPEG

## Architecture

TensorSharp is structured as a layered system:

1. **TensorSharp** provides the core `Tensor` type, storage abstraction, and an extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.
1. **TensorSharp.Core** provides the core `Tensor` type, storage abstraction, and the extensible operation registry (`Ops`). CPU implementations use `System.Numerics.Vectors` for SIMD acceleration.

2. **TensorSharp.GGML** registers accelerated implementations of the same operations via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.
2. **TensorSharp.Runtime** owns runtime-facing contracts and services: GGUF parsing, tokenization (SentencePiece / BPE), chat template rendering, configurable token sampling, output parsing, and reusable contracts such as `IModelArchitecture`, `IPromptRenderer`, `IOutputProtocolParser`, `IMultimodalInjector`, `IKVCachePolicy`, and `IBackendExecutionPlan`.

3. **InferenceEngine** implements model-specific logic: GGUF parsing, tokenization (SentencePiece BPE), chat template rendering (Jinja2 from GGUF metadata with hardcoded fallbacks), configurable token sampling, output parsing (thinking extraction, tool-call extraction), and the forward pass for each architecture (including hybrid SSM-Transformer models like Nemotron-H with Mamba2 layers). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.
3. **TensorSharp.Models** implements `ModelBase` plus the concrete architectures and multimodal helpers (Gemma 3/4, Qwen 3/3.5, GPT OSS, Nemotron-H, Mistral 3). Models are loaded via `ModelBase.Create()` which auto-detects the architecture from GGUF metadata.

4. **InferenceConsole** and **InferenceWeb** are application layers that handle I/O and user interaction. InferenceWeb provides Ollama-compatible and OpenAI-compatible REST APIs alongside a browser-based chat UI, with a FIFO inference queue to serialize concurrent requests.
4. **TensorSharp.Backends.GGML** registers accelerated implementations of the TensorSharp.Core operations (`Ops`) via a native C++ bridge (`libGgmlOps`) that links against [ggml](https://github.com/ggml-org/ggml). On macOS this provides Metal GPU compute, and on Linux it can expose GGML CUDA for NVIDIA GPUs. Operations include native quantized matmul (Q4_K_M, Q8_0, etc.) without dequantizing to FP32.

5. **TensorSharp.Server** is the HTTP/application layer. It provides Ollama-compatible and OpenAI-compatible REST APIs, the browser-based chat UI, upload handling, and the FIFO inference queue.

6. **TensorSharp.Cli** is the console/application layer for local prompts, multimodal experiments, prompt inspection, and JSONL batch workflows.

### Performance Optimizations

Expand All @@ -426,16 +440,16 @@ TensorSharp is structured as a layered system:

## Testing

Integration tests for InferenceWeb are in `InferenceWeb/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.
Integration tests for TensorSharp.Server are in `TensorSharp.Server/testdata/`. They cover all three API styles (Web UI SSE, Ollama, OpenAI), multi-turn conversations, thinking mode, tool calling, structured outputs, queue behavior, concurrent requests, and abort support.

```bash
# Start InferenceWeb, then run:
python3 InferenceWeb/testdata/test_multiturn.py
# Start TensorSharp.Server, then run:
python3 TensorSharp.Server/testdata/test_multiturn.py
# or
bash InferenceWeb/testdata/test_multiturn.sh
bash TensorSharp.Server/testdata/test_multiturn.sh
```

See [InferenceWeb/testdata/README.md](InferenceWeb/testdata/README.md) for the full test matrix.
See [TensorSharp.Server/testdata/README.md](TensorSharp.Server/testdata/README.md) for the full test matrix.

## Author

Expand All @@ -444,3 +458,4 @@ Zhongkai Fu
## License

See [LICENSE](LICENSE) for details.

Loading