Merged
13 changes: 13 additions & 0 deletions benchmarks/kaggle/pack_ternary_bonsai/kernel-metadata.json
@@ -0,0 +1,13 @@
{
"id": "superkaiii/pack-ternary-bonsai-4b-tq1_0",
"title": "Pack Ternary-Bonsai-4B TQ1_0",
"code_file": "pack_ternary_bonsai.py",
"language": "python",
"kernel_type": "script",
"is_private": true,
"enable_gpu": false,
"enable_internet": true,
"competition_sources": [],
"dataset_sources": ["superkaiii/hf-token-private"],
"kernel_sources": []
}
197 changes: 197 additions & 0 deletions benchmarks/kaggle/pack_ternary_bonsai/pack_ternary_bonsai.py
@@ -0,0 +1,197 @@
"""Kaggle kernel: re-quantize Ternary-Bonsai-4B from unpacked FP16 to TQ1_0 GGUF.

Why here: the conversion needs ~17 GB peak disk + a llama.cpp build. Doing it on
Kaggle keeps it off the dev machine and publishes the result to HF Hub so every
GraphStore dev pulls the same artifact.

Inputs (Kaggle kernel config):
- dataset_sources: ["superkaiii/hf-token-private"] (provides the hf_token.txt file)
- enable_internet: true
- enable_gpu: false (conversion is CPU-bound)

Flow:
1. Read HF write token from /kaggle/input/hf-token-private/hf_token.txt
2. Download prism-ml/Ternary-Bonsai-4B-unpacked (FP16 safetensors, ~8 GB)
3. Clone + build llama.cpp llama-quantize
4. convert_hf_to_gguf.py -> F16 GGUF (~8 GB)
5. llama-quantize TQ1_0 -> ~1 GB GGUF, the pack step
6. Upload result to superkaiii/Ternary-Bonsai-4B-TQ1_0-GGUF on HF Hub
7. Delete intermediates so peak disk stays under 20 GB Kaggle quota

Output (Kaggle kernel):
/kaggle/working/Ternary-Bonsai-4B-TQ1_0.gguf (also on HF Hub after upload)
/kaggle/working/pack_report.json (sizes + checksums)
"""
from __future__ import annotations

import hashlib
import json
import os
import shutil
import subprocess
from pathlib import Path


HF_REPO = "superkaiii/Ternary-Bonsai-4B-TQ1_0-GGUF"
SOURCE_REPO = "prism-ml/Ternary-Bonsai-4B-unpacked"
TARGET_QUANT = "TQ1_0"
OUTPUT_NAME = "Ternary-Bonsai-4B-TQ1_0.gguf"

SRC_DIR = Path("/kaggle/tmp/bonsai-src")
LC_DIR = Path("/kaggle/tmp/llama.cpp")
F16_GGUF = Path("/kaggle/tmp/bonsai-4b-f16.gguf")
WORKING = Path("/kaggle/working")
OUT_GGUF = WORKING / OUTPUT_NAME
REPORT = WORKING / "pack_report.json"


def run(cmd: list[str] | str, **kw) -> None:
"""Shell command with live output. Raises on nonzero exit."""
shell = isinstance(cmd, str)
print(f"$ {cmd if shell else ' '.join(cmd)}", flush=True)
subprocess.run(cmd, check=True, shell=shell, **kw)


def sha256(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(1 << 20), b""):
h.update(chunk)
return h.hexdigest()


def load_hf_token() -> str:
"""Find HF token matching the kaggle_benchmark.py pattern."""
# Primary: same path the working kernel uses.
primary = Path("/kaggle/input/hf-token-private/hf_token.txt")
if primary.exists():
return primary.read_text().strip()
# Fallback: glob recursively in case Kaggle mounts at a nested path.
for name in ("hf_token.txt", "HF_TOKEN", "token"):
hits = list(Path("/kaggle/input").rglob(name))
if hits:
return hits[0].read_text().strip()
raise RuntimeError("HF token not found under /kaggle/input/")


def main() -> None:
WORKING.mkdir(parents=True, exist_ok=True)
SRC_DIR.mkdir(parents=True, exist_ok=True)

hf_token = load_hf_token()
os.environ["HF_TOKEN"] = hf_token
os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token

print("[1/6] pip deps")
run("pip install -q 'huggingface_hub>=0.24' 'transformers>=4.44' 'safetensors>=0.4' 'torch>=2.3' 'sentencepiece' 'protobuf'")

print("[2/6] download source F16 safetensors (Python API, not CLI)")
from huggingface_hub import snapshot_download
snapshot_download(
repo_id=SOURCE_REPO,
local_dir=str(SRC_DIR),
token=hf_token,
allow_patterns=["*.safetensors", "*.json", "*.model", "tokenizer*"],
)

print("[3/6] clone + build llama.cpp llama-quantize target")
if not LC_DIR.exists():
run(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp", str(LC_DIR)])
run(f"cd {LC_DIR} && cmake -B build -DLLAMA_BUILD_SERVER=OFF -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_TESTS=OFF && cmake --build build --target llama-quantize -j")

print("[4/6] convert safetensors -> F16 GGUF")
run([
"python", str(LC_DIR / "convert_hf_to_gguf.py"),
str(SRC_DIR),
"--outfile", str(F16_GGUF),
"--outtype", "f16",
])

print("[5/6] quantize F16 -> TQ1_0 (the pack step)")
run([
str(LC_DIR / "build" / "bin" / "llama-quantize"),
str(F16_GGUF),
str(OUT_GGUF),
TARGET_QUANT,
])

# Drop intermediates aggressively to stay under the Kaggle 20 GB working quota.
if F16_GGUF.exists():
F16_GGUF.unlink()
shutil.rmtree(SRC_DIR, ignore_errors=True)

print("[6/6] upload to HF Hub")
from huggingface_hub import HfApi

api = HfApi(token=hf_token)
api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)

readme = WORKING / "README.md"
readme.write_text(_readme())

api.upload_file(
path_or_fileobj=str(OUT_GGUF),
path_in_repo=OUTPUT_NAME,
repo_id=HF_REPO,
repo_type="model",
)
api.upload_file(
path_or_fileobj=str(readme),
path_in_repo="README.md",
repo_id=HF_REPO,
repo_type="model",
)

report = {
"source_repo": SOURCE_REPO,
"target_repo": HF_REPO,
"quant": TARGET_QUANT,
"output_bytes": OUT_GGUF.stat().st_size,
"output_sha256": sha256(OUT_GGUF),
}
REPORT.write_text(json.dumps(report, indent=2))
print(f"\nDONE\n local: {OUT_GGUF} ({OUT_GGUF.stat().st_size / 1e9:.2f} GB)")
print(f" hub: https://huggingface.co/{HF_REPO}")
print(f" report: {REPORT}")


def _readme() -> str:
return f"""---
library_name: gguf
base_model: {SOURCE_REPO}
quantization_config:
method: {TARGET_QUANT}
tags:
- gguf
- ternary
- bonsai
- llama.cpp
---

# Ternary-Bonsai-4B TQ1_0

Repack of [{SOURCE_REPO}](https://huggingface.co/{SOURCE_REPO}) to llama.cpp's native ternary format ({TARGET_QUANT}).

Ternary-Bonsai weights are trained as {{-1, 0, +1}}. {TARGET_QUANT} packs these losslessly at 1.6875 bits per weight with a single FP16 scale per group of 256 weights. Unlike Q2_K (a generic 2-bit k-quant that approximates arbitrary float weights with per-block scales and minimums), {TARGET_QUANT} preserves the trained ternary values exactly.
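
For intuition, the 1.6875 bits/weight figure follows from the TQ1_0 block layout (sketch of llama.cpp's `block_tq1_0`; see `ggml-common.h` in llama.cpp for the authoritative definition):

```
48 bytes  qs  - 240 weights packed 5 trits per byte (3^5 = 243 fits in one byte)
 4 bytes  qh  - remaining 16 weights at 4 per byte
 2 bytes  d   - FP16 block scale
----------
54 bytes per 256 weights = 54 * 8 / 256 = 1.6875 bits per weight
```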

## Quickstart

```python
from llama_cpp import Llama
m = Llama(model_path="{OUTPUT_NAME}", n_ctx=4096)
```
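
A quick smoke test (sketch; uses the llama-cpp-python completion API, prompt text is arbitrary):

```python
out = m("Ternary networks are", max_tokens=16)
print(out["choices"][0]["text"])
```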

## Conversion pipeline

```
prism-ml/Ternary-Bonsai-4B-unpacked (F16 safetensors)
-> convert_hf_to_gguf.py --outtype f16
-> llama-quantize ... {TARGET_QUANT}
```

Built on Kaggle with ggerganov/llama.cpp master.
"""


if __name__ == "__main__":
main()