|
| 1 | +import { uploadShards } from "../src/utils/uploadShards.js"; |
| 2 | +import { sha256 } from "../src/utils/sha256.js"; |
| 3 | +import { parseArgs } from "node:util"; |
| 4 | +import { tmpdir } from "node:os"; |
| 5 | +import { join } from "node:path"; |
| 6 | +import { writeFile, readFile, stat, mkdir } from "node:fs/promises"; |
| 7 | +import type { RepoId } from "../src/types/public.js"; |
| 8 | +import { toRepoId } from "../src/utils/toRepoId.js"; |
| 9 | +import { commitIter } from "../src/index.js"; |
| 10 | +import { pathToFileURL } from "node:url"; |
| 11 | + |
| 12 | +/** |
| 13 | + * This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo. |
| 14 | + * It prints the dedup % and the statistics |
| 15 | + * |
| 16 | + * Usage: |
| 17 | + * |
| 18 | + * pnpm --filter hub bench -t <write token> -r <xet repo> |
| 19 | + * pnpm --filter hub bench -t <write token> -r <xet repo> --commit # Actually upload files |
| 20 | + */ |
| 21 | + |
// Benchmark workload: TFLite files from the openai-community/gpt2 repo.
// The `?download=true` query flag asks the Hub to serve the file as a
// direct download from the resolve endpoint.
const FILES_TO_DOWNLOAD = [
	{
		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",
		filename: "64-8bits.tflite",
	},
	{
		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-fp16.tflite?download=true",
		filename: "64-fp16.tflite",
	},
];
| 32 | + |
| 33 | +async function downloadFileIfNotExists(url: string, filepath: string): Promise<void> { |
| 34 | + try { |
| 35 | + await stat(filepath); |
| 36 | + console.log(`File ${filepath} already exists, skipping download`); |
| 37 | + return; |
| 38 | + } catch { |
| 39 | + // File doesn't exist, proceed with download |
| 40 | + } |
| 41 | + |
| 42 | + console.log(`Downloading ${url} to ${filepath}...`); |
| 43 | + const response = await fetch(url); |
| 44 | + if (!response.ok) { |
| 45 | + throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`); |
| 46 | + } |
| 47 | + |
| 48 | + const buffer = await response.arrayBuffer(); |
| 49 | + await writeFile(filepath, new Uint8Array(buffer)); |
| 50 | + console.log(`Downloaded ${filepath} (${buffer.byteLength} bytes)`); |
| 51 | +} |
| 52 | + |
| 53 | +async function* createFileSource( |
| 54 | + files: Array<{ filepath: string; filename: string }> |
| 55 | +): AsyncGenerator<{ content: Blob; path: string; sha256: string }> { |
| 56 | + for (const file of files) { |
| 57 | + console.log(`Processing ${file.filename}...`); |
| 58 | + const buffer = await readFile(file.filepath); |
| 59 | + const blob = new Blob([buffer]); |
| 60 | + |
| 61 | + // Calculate sha256 |
| 62 | + console.log(`Calculating SHA256 for ${file.filename}...`); |
| 63 | + const sha256Iterator = sha256(blob, { useWebWorker: false }); |
| 64 | + let res: IteratorResult<number, string>; |
| 65 | + do { |
| 66 | + res = await sha256Iterator.next(); |
| 67 | + } while (!res.done); |
| 68 | + const sha256Hash = res.value; |
| 69 | + |
| 70 | + console.log(`SHA256 for ${file.filename}: ${sha256Hash}`); |
| 71 | + yield { |
| 72 | + content: blob, |
| 73 | + path: file.filename, |
| 74 | + sha256: sha256Hash, |
| 75 | + }; |
| 76 | + } |
| 77 | +} |
| 78 | + |
| 79 | +function getBodySize(body: RequestInit["body"]): string { |
| 80 | + if (!body) { |
| 81 | + return "no body"; |
| 82 | + } |
| 83 | + if (body instanceof ArrayBuffer) { |
| 84 | + return body.byteLength.toString(); |
| 85 | + } |
| 86 | + if (body instanceof Blob) { |
| 87 | + return "blob"; |
| 88 | + } |
| 89 | + if (body instanceof Uint8Array) { |
| 90 | + return body.byteLength.toString(); |
| 91 | + } |
| 92 | + return "unknown size"; |
| 93 | +} |
| 94 | + |
| 95 | +function createMockFetch(): { |
| 96 | + fetch: typeof fetch; |
| 97 | + getStats: () => { xorbCount: number; shardCount: number; xorbBytes: number; shardBytes: number }; |
| 98 | +} { |
| 99 | + let xorbCount = 0; |
| 100 | + let shardCount = 0; |
| 101 | + let xorbBytes = 0; |
| 102 | + let shardBytes = 0; |
| 103 | + |
| 104 | + const mockFetch = async function (input: string | URL | Request, init?: RequestInit): Promise<Response> { |
| 105 | + const url = typeof input === "string" ? input : input.toString(); |
| 106 | + |
| 107 | + // Mock successful responses for xorb and shard uploads |
| 108 | + if (url.includes("/xorb/")) { |
| 109 | + xorbCount++; |
| 110 | + const bodySize = getBodySize(init?.body); |
| 111 | + xorbBytes += parseInt(bodySize); |
| 112 | + console.log(`[MOCK] Xorb upload ${xorbCount}: ${init?.method || "GET"} ${url} (${bodySize})`); |
| 113 | + |
| 114 | + return new Response(null, { |
| 115 | + status: 200, |
| 116 | + statusText: "OK", |
| 117 | + }); |
| 118 | + } |
| 119 | + |
| 120 | + if (url.endsWith("/shard")) { |
| 121 | + shardCount++; |
| 122 | + const bodySize = getBodySize(init?.body); |
| 123 | + shardBytes += parseInt(bodySize); |
| 124 | + console.log(`[MOCK] Shard upload ${shardCount}: ${init?.method || "GET"} ${url} (${bodySize})`); |
| 125 | + |
| 126 | + return new Response(null, { |
| 127 | + status: 200, |
| 128 | + statusText: "OK", |
| 129 | + }); |
| 130 | + } |
| 131 | + |
| 132 | + // For other requests, use real fetch |
| 133 | + return fetch(input, init).then((res) => { |
| 134 | + console.log(`[real] ${res.status} ${res.statusText} ${url} ${res.headers.get("content-length")}`); |
| 135 | + return res; |
| 136 | + }); |
| 137 | + }; |
| 138 | + |
| 139 | + return { |
| 140 | + fetch: mockFetch, |
| 141 | + getStats: () => ({ xorbCount, shardCount, xorbBytes, shardBytes }), |
| 142 | + }; |
| 143 | +} |
| 144 | + |
| 145 | +async function main() { |
| 146 | + const { values: args } = parseArgs({ |
| 147 | + options: { |
| 148 | + token: { |
| 149 | + type: "string", |
| 150 | + short: "t", |
| 151 | + }, |
| 152 | + repo: { |
| 153 | + type: "string", |
| 154 | + short: "r", |
| 155 | + }, |
| 156 | + commit: { |
| 157 | + type: "boolean", |
| 158 | + short: "c", |
| 159 | + default: false, |
| 160 | + }, |
| 161 | + }, |
| 162 | + }); |
| 163 | + |
| 164 | + if (!args.token || !args.repo) { |
| 165 | + console.error("Usage: pnpm --filter hub bench -t <write token> -r <xet repo>"); |
| 166 | + console.error("Example: pnpm --filter hub bench -t hf_... -r myuser/myrepo"); |
| 167 | + process.exit(1); |
| 168 | + } |
| 169 | + |
| 170 | + // Setup temp directory |
| 171 | + const tempDir = tmpdir(); |
| 172 | + const downloadDir = join(tempDir, "hf-bench-downloads"); |
| 173 | + |
| 174 | + // Ensure download directory exists |
| 175 | + await mkdir(downloadDir, { recursive: true }); |
| 176 | + |
| 177 | + // Download files |
| 178 | + const files: Array<{ filepath: string; filename: string }> = []; |
| 179 | + |
| 180 | + for (const fileInfo of FILES_TO_DOWNLOAD) { |
| 181 | + const filepath = join(downloadDir, fileInfo.filename); |
| 182 | + await downloadFileIfNotExists(fileInfo.url, filepath); |
| 183 | + files.push({ filepath, filename: fileInfo.filename }); |
| 184 | + } |
| 185 | + |
| 186 | + // Parse repo |
| 187 | + const repoName = args.repo; |
| 188 | + |
| 189 | + const repo: RepoId = toRepoId(repoName); |
| 190 | + |
| 191 | + // Create mock fetch |
| 192 | + const mockFetchObj = createMockFetch(); |
| 193 | + |
| 194 | + // Setup upload parameters |
| 195 | + const uploadParams = { |
| 196 | + accessToken: args.token, |
| 197 | + hubUrl: "https://huggingface.co", |
| 198 | + customFetch: mockFetchObj.fetch, |
| 199 | + repo, |
| 200 | + rev: "main", |
| 201 | + }; |
| 202 | + |
| 203 | + // Track statistics |
| 204 | + const stats: Array<{ |
| 205 | + filename: string; |
| 206 | + size: number; |
| 207 | + dedupRatio: number; |
| 208 | + }> = []; |
| 209 | + |
| 210 | + console.log("\n=== Starting upload simulation ==="); |
| 211 | + |
| 212 | + // Process files through uploadShards |
| 213 | + const fileSource = createFileSource(files); |
| 214 | + |
| 215 | + for await (const event of uploadShards(fileSource, uploadParams)) { |
| 216 | + switch (event.event) { |
| 217 | + case "file": { |
| 218 | + console.log(`\n📁 Processed file: ${event.path}`); |
| 219 | + console.log(` SHA256: ${event.sha256}`); |
| 220 | + console.log(` Dedup ratio: ${(event.dedupRatio * 100).toFixed(2)}%`); |
| 221 | + |
| 222 | + // Find the file size |
| 223 | + const file = files.find((f) => f.filename === event.path); |
| 224 | + if (file) { |
| 225 | + const fileStats = await stat(file.filepath); |
| 226 | + |
| 227 | + stats.push({ |
| 228 | + filename: event.path, |
| 229 | + size: fileStats.size, |
| 230 | + dedupRatio: event.dedupRatio, |
| 231 | + }); |
| 232 | + } |
| 233 | + break; |
| 234 | + } |
| 235 | + |
| 236 | + case "fileProgress": { |
| 237 | + const progress = (event.progress * 100).toFixed(1); |
| 238 | + console.log(` 📈 Progress for ${event.path}: ${progress}%`); |
| 239 | + break; |
| 240 | + } |
| 241 | + } |
| 242 | + } |
| 243 | + |
| 244 | + // Get actual upload counts from the mock fetch |
| 245 | + const uploadStats = mockFetchObj.getStats(); |
| 246 | + console.log(`\n📊 Actual upload counts: ${uploadStats.xorbCount} xorbs, ${uploadStats.shardCount} shards`); |
| 247 | + |
| 248 | + // Output final statistics |
| 249 | + console.log("\n=== BENCHMARK RESULTS ==="); |
| 250 | + console.log("File Statistics:"); |
| 251 | + console.log("================"); |
| 252 | + |
| 253 | + for (const stat of stats) { |
| 254 | + console.log(`\n📄 ${stat.filename}:`); |
| 255 | + console.log(` Size: ${(stat.size / 1024 / 1024).toFixed(2)} MB`); |
| 256 | + console.log(` Deduplication: ${(stat.dedupRatio * 100).toFixed(2)}%`); |
| 257 | + } |
| 258 | + |
| 259 | + console.log("\n=== SUMMARY ==="); |
| 260 | + const totalSize = stats.reduce((sum, s) => sum + s.size, 0); |
| 261 | + const avgDedup = stats.reduce((sum, s) => sum + s.dedupRatio, 0) / stats.length; |
| 262 | + |
| 263 | + console.log(`Total files: ${stats.length}`); |
| 264 | + console.log(`Total size: ${(totalSize / 1024 / 1024).toFixed(2)} MB`); |
| 265 | + console.log(`Total xorbs: ${uploadStats.xorbCount}`); |
| 266 | + console.log(`Total shards: ${uploadStats.shardCount}`); |
| 267 | + console.log(`Total xorb bytes: ${uploadStats.xorbBytes.toLocaleString("fr")} bytes`); |
| 268 | + console.log(`Total shard bytes: ${uploadStats.shardBytes.toLocaleString("fr")} bytes`); |
| 269 | + console.log(`Average deduplication: ${(avgDedup * 100).toFixed(2)}%`); |
| 270 | + |
| 271 | + if (args.commit) { |
| 272 | + console.log("\n=== Committing files ==="); |
| 273 | + const iterator = commitIter({ |
| 274 | + repo, |
| 275 | + operations: files.map((file) => ({ |
| 276 | + operation: "addOrUpdate", |
| 277 | + content: pathToFileURL(file.filepath), |
| 278 | + path: file.filename, |
| 279 | + })), |
| 280 | + accessToken: args.token, |
| 281 | + title: "Upload xet files with JS lib", |
| 282 | + xet: true, |
| 283 | + }); |
| 284 | + for await (const event of iterator) { |
| 285 | + if (event.event === "fileProgress" && event.state === "hashing") { |
| 286 | + // We don't care about the hashing progress |
| 287 | + } else { |
| 288 | + console.log(event); |
| 289 | + } |
| 290 | + } |
| 291 | + |
| 292 | + console.log("Done committing"); |
| 293 | + } |
| 294 | +} |
| 295 | + |
// Script entry: run the benchmark and fail the process on any error so CI
// and shell callers see a non-zero exit code.
main().catch((error) => {
	console.error("Error:", error);
	process.exit(1);
});
0 commit comments