Skip to content

Commit 754b069

Browse files
authored
[Xet] Basic shard creation (#1633)
cc @Kakulukian @assafvayner for visibility; follow-up to #1616. Based on https://github.com/huggingface/xet-core/blob/7e41fb0dd7cfb276222b9668d0b97a984647721e/spec/shard.md. Still to handle: (1) splitting into multiple shards when the xorb or file info grows too big; (2) uploading xorbs & shards (xorbs must be uploaded before the shards that reference them).
1 parent f36d987 commit 754b069

20 files changed

+5457
-1894
lines changed

packages/hub/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@
4242
"test": "vitest run",
4343
"test:browser": "vitest run --browser.name=chrome --browser.headless --config vitest-browser.config.mts",
4444
"check": "tsc",
45-
"build:xet-wasm": "./scripts/build-xet-wasm.sh -t bundler -c -b hoytak/250714-eliminate-mdb-v1"
45+
"build:xet-wasm": "./scripts/build-xet-wasm.sh -t bundler --clean",
46+
"bench": "tsx scripts/bench.ts"
4647
},
4748
"files": [
4849
"src",

packages/hub/scripts/bench.ts

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
import { uploadShards } from "../src/utils/uploadShards.js";
2+
import { sha256 } from "../src/utils/sha256.js";
3+
import { parseArgs } from "node:util";
4+
import { tmpdir } from "node:os";
5+
import { join } from "node:path";
6+
import { writeFile, readFile, stat, mkdir } from "node:fs/promises";
7+
import type { RepoId } from "../src/types/public.js";
8+
import { toRepoId } from "../src/utils/toRepoId.js";
9+
import { commitIter } from "../src/index.js";
10+
import { pathToFileURL } from "node:url";
11+
12+
/**
13+
* This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.
14+
* It prints the dedup % and the statistics
15+
*
16+
* Usage:
17+
*
18+
* pnpm --filter hub bench -t <write token> -r <xet repo>
19+
* pnpm --filter hub bench -t <write token> -r <xet repo> --commit # Actually upload files
20+
*/
21+
22+
// Benchmark fixtures: model files from openai-community/gpt2 streamed through
// the xet upload path. They are downloaded once and cached in the temp dir.
const FILES_TO_DOWNLOAD = [
	{
		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",
		filename: "64-8bits.tflite",
	},
	{
		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-fp16.tflite?download=true",
		filename: "64-fp16.tflite",
	},
];
32+
33+
async function downloadFileIfNotExists(url: string, filepath: string): Promise<void> {
34+
try {
35+
await stat(filepath);
36+
console.log(`File ${filepath} already exists, skipping download`);
37+
return;
38+
} catch {
39+
// File doesn't exist, proceed with download
40+
}
41+
42+
console.log(`Downloading ${url} to ${filepath}...`);
43+
const response = await fetch(url);
44+
if (!response.ok) {
45+
throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
46+
}
47+
48+
const buffer = await response.arrayBuffer();
49+
await writeFile(filepath, new Uint8Array(buffer));
50+
console.log(`Downloaded ${filepath} (${buffer.byteLength} bytes)`);
51+
}
52+
53+
async function* createFileSource(
54+
files: Array<{ filepath: string; filename: string }>
55+
): AsyncGenerator<{ content: Blob; path: string; sha256: string }> {
56+
for (const file of files) {
57+
console.log(`Processing ${file.filename}...`);
58+
const buffer = await readFile(file.filepath);
59+
const blob = new Blob([buffer]);
60+
61+
// Calculate sha256
62+
console.log(`Calculating SHA256 for ${file.filename}...`);
63+
const sha256Iterator = sha256(blob, { useWebWorker: false });
64+
let res: IteratorResult<number, string>;
65+
do {
66+
res = await sha256Iterator.next();
67+
} while (!res.done);
68+
const sha256Hash = res.value;
69+
70+
console.log(`SHA256 for ${file.filename}: ${sha256Hash}`);
71+
yield {
72+
content: blob,
73+
path: file.filename,
74+
sha256: sha256Hash,
75+
};
76+
}
77+
}
78+
79+
function getBodySize(body: RequestInit["body"]): string {
80+
if (!body) {
81+
return "no body";
82+
}
83+
if (body instanceof ArrayBuffer) {
84+
return body.byteLength.toString();
85+
}
86+
if (body instanceof Blob) {
87+
return "blob";
88+
}
89+
if (body instanceof Uint8Array) {
90+
return body.byteLength.toString();
91+
}
92+
return "unknown size";
93+
}
94+
95+
function createMockFetch(): {
96+
fetch: typeof fetch;
97+
getStats: () => { xorbCount: number; shardCount: number; xorbBytes: number; shardBytes: number };
98+
} {
99+
let xorbCount = 0;
100+
let shardCount = 0;
101+
let xorbBytes = 0;
102+
let shardBytes = 0;
103+
104+
const mockFetch = async function (input: string | URL | Request, init?: RequestInit): Promise<Response> {
105+
const url = typeof input === "string" ? input : input.toString();
106+
107+
// Mock successful responses for xorb and shard uploads
108+
if (url.includes("/xorb/")) {
109+
xorbCount++;
110+
const bodySize = getBodySize(init?.body);
111+
xorbBytes += parseInt(bodySize);
112+
console.log(`[MOCK] Xorb upload ${xorbCount}: ${init?.method || "GET"} ${url} (${bodySize})`);
113+
114+
return new Response(null, {
115+
status: 200,
116+
statusText: "OK",
117+
});
118+
}
119+
120+
if (url.endsWith("/shard")) {
121+
shardCount++;
122+
const bodySize = getBodySize(init?.body);
123+
shardBytes += parseInt(bodySize);
124+
console.log(`[MOCK] Shard upload ${shardCount}: ${init?.method || "GET"} ${url} (${bodySize})`);
125+
126+
return new Response(null, {
127+
status: 200,
128+
statusText: "OK",
129+
});
130+
}
131+
132+
// For other requests, use real fetch
133+
return fetch(input, init).then((res) => {
134+
console.log(`[real] ${res.status} ${res.statusText} ${url} ${res.headers.get("content-length")}`);
135+
return res;
136+
});
137+
};
138+
139+
return {
140+
fetch: mockFetch,
141+
getStats: () => ({ xorbCount, shardCount, xorbBytes, shardBytes }),
142+
};
143+
}
144+
145+
async function main() {
146+
const { values: args } = parseArgs({
147+
options: {
148+
token: {
149+
type: "string",
150+
short: "t",
151+
},
152+
repo: {
153+
type: "string",
154+
short: "r",
155+
},
156+
commit: {
157+
type: "boolean",
158+
short: "c",
159+
default: false,
160+
},
161+
},
162+
});
163+
164+
if (!args.token || !args.repo) {
165+
console.error("Usage: pnpm --filter hub bench -t <write token> -r <xet repo>");
166+
console.error("Example: pnpm --filter hub bench -t hf_... -r myuser/myrepo");
167+
process.exit(1);
168+
}
169+
170+
// Setup temp directory
171+
const tempDir = tmpdir();
172+
const downloadDir = join(tempDir, "hf-bench-downloads");
173+
174+
// Ensure download directory exists
175+
await mkdir(downloadDir, { recursive: true });
176+
177+
// Download files
178+
const files: Array<{ filepath: string; filename: string }> = [];
179+
180+
for (const fileInfo of FILES_TO_DOWNLOAD) {
181+
const filepath = join(downloadDir, fileInfo.filename);
182+
await downloadFileIfNotExists(fileInfo.url, filepath);
183+
files.push({ filepath, filename: fileInfo.filename });
184+
}
185+
186+
// Parse repo
187+
const repoName = args.repo;
188+
189+
const repo: RepoId = toRepoId(repoName);
190+
191+
// Create mock fetch
192+
const mockFetchObj = createMockFetch();
193+
194+
// Setup upload parameters
195+
const uploadParams = {
196+
accessToken: args.token,
197+
hubUrl: "https://huggingface.co",
198+
customFetch: mockFetchObj.fetch,
199+
repo,
200+
rev: "main",
201+
};
202+
203+
// Track statistics
204+
const stats: Array<{
205+
filename: string;
206+
size: number;
207+
dedupRatio: number;
208+
}> = [];
209+
210+
console.log("\n=== Starting upload simulation ===");
211+
212+
// Process files through uploadShards
213+
const fileSource = createFileSource(files);
214+
215+
for await (const event of uploadShards(fileSource, uploadParams)) {
216+
switch (event.event) {
217+
case "file": {
218+
console.log(`\n📁 Processed file: ${event.path}`);
219+
console.log(` SHA256: ${event.sha256}`);
220+
console.log(` Dedup ratio: ${(event.dedupRatio * 100).toFixed(2)}%`);
221+
222+
// Find the file size
223+
const file = files.find((f) => f.filename === event.path);
224+
if (file) {
225+
const fileStats = await stat(file.filepath);
226+
227+
stats.push({
228+
filename: event.path,
229+
size: fileStats.size,
230+
dedupRatio: event.dedupRatio,
231+
});
232+
}
233+
break;
234+
}
235+
236+
case "fileProgress": {
237+
const progress = (event.progress * 100).toFixed(1);
238+
console.log(` 📈 Progress for ${event.path}: ${progress}%`);
239+
break;
240+
}
241+
}
242+
}
243+
244+
// Get actual upload counts from the mock fetch
245+
const uploadStats = mockFetchObj.getStats();
246+
console.log(`\n📊 Actual upload counts: ${uploadStats.xorbCount} xorbs, ${uploadStats.shardCount} shards`);
247+
248+
// Output final statistics
249+
console.log("\n=== BENCHMARK RESULTS ===");
250+
console.log("File Statistics:");
251+
console.log("================");
252+
253+
for (const stat of stats) {
254+
console.log(`\n📄 ${stat.filename}:`);
255+
console.log(` Size: ${(stat.size / 1024 / 1024).toFixed(2)} MB`);
256+
console.log(` Deduplication: ${(stat.dedupRatio * 100).toFixed(2)}%`);
257+
}
258+
259+
console.log("\n=== SUMMARY ===");
260+
const totalSize = stats.reduce((sum, s) => sum + s.size, 0);
261+
const avgDedup = stats.reduce((sum, s) => sum + s.dedupRatio, 0) / stats.length;
262+
263+
console.log(`Total files: ${stats.length}`);
264+
console.log(`Total size: ${(totalSize / 1024 / 1024).toFixed(2)} MB`);
265+
console.log(`Total xorbs: ${uploadStats.xorbCount}`);
266+
console.log(`Total shards: ${uploadStats.shardCount}`);
267+
console.log(`Total xorb bytes: ${uploadStats.xorbBytes.toLocaleString("fr")} bytes`);
268+
console.log(`Total shard bytes: ${uploadStats.shardBytes.toLocaleString("fr")} bytes`);
269+
console.log(`Average deduplication: ${(avgDedup * 100).toFixed(2)}%`);
270+
271+
if (args.commit) {
272+
console.log("\n=== Committing files ===");
273+
const iterator = commitIter({
274+
repo,
275+
operations: files.map((file) => ({
276+
operation: "addOrUpdate",
277+
content: pathToFileURL(file.filepath),
278+
path: file.filename,
279+
})),
280+
accessToken: args.token,
281+
title: "Upload xet files with JS lib",
282+
xet: true,
283+
});
284+
for await (const event of iterator) {
285+
if (event.event === "fileProgress" && event.state === "hashing") {
286+
// We don't care about the hashing progress
287+
} else {
288+
console.log(event);
289+
}
290+
}
291+
292+
console.log("Done committing");
293+
}
294+
}
295+
296+
main().catch((error) => {
297+
console.error("Error:", error);
298+
process.exit(1);
299+
});

packages/hub/scripts/build-xet-wasm.sh

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,18 @@ fi
224224

225225
# copy the generated hf_xet_thin_wasm_bg.js to the hub package and hf_xet_thin_wasm_bg.wasm to the hub package
226226
cp "$CLONE_DIR/$PACKAGE/pkg/hf_xet_thin_wasm_bg.js" "./src/vendor/xet-chunk/chunker_wasm_bg.js"
227-
echo "// Generated by build-xet-wasm.sh" > "./src/vendor/xet-chunk/chunker_wasm_bg.wasm.base64.ts"
228-
echo "export const wasmBase64 = atob(\`" >> "./src/vendor/xet-chunk/chunker_wasm_bg.wasm.base64.ts"
227+
cp "$CLONE_DIR/$PACKAGE/pkg/hf_xet_thin_wasm_bg.wasm.d.ts" "./src/vendor/xet-chunk/chunker_wasm_bg.wasm.d.ts"
228+
cat << 'EOF' > "./src/vendor/xet-chunk/chunker_wasm_bg.wasm.base64.ts"
229+
// Generated by build-xet-wasm.sh
230+
export const wasmBase64 = atob(
231+
`
232+
EOF
229233
base64 "$CLONE_DIR/$PACKAGE/pkg/hf_xet_thin_wasm_bg.wasm" | fold -w 100 >> "./src/vendor/xet-chunk/chunker_wasm_bg.wasm.base64.ts"
230234
cat << 'EOF' >> "./src/vendor/xet-chunk/chunker_wasm_bg.wasm.base64.ts"
231-
`)
232-
.trim()
233-
.replaceAll("\n", "");
235+
`
236+
.trim()
237+
.replaceAll("\n", "")
238+
);
234239
const wasmBinary = new Uint8Array(wasmBase64.length);
235240
for (let i = 0; i < wasmBase64.length; i++) {
236241
wasmBinary[i] = wasmBase64.charCodeAt(i);

0 commit comments

Comments
 (0)