-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
enhancement — New feature or request
Description
running on macbook pro 2015
the command system_profiler SPDisplaysDataType | grep "Chipset Model" :
Chipset Model: Intel Iris Pro
Chipset Model: AMD Radeon R9 M370X
My code benchmarks CPU and GPU, and it works great. But currently my iGPU is the one doing the computing. I'd like to specifically target the AMD Radeon R9, which is the discrete GPU.
import { setupGlobals } from 'bun-webgpu';
// Install WebGPU globals (navigator.gpu, etc.) into the Bun runtime before any
// GPU calls below — presumably also provides GPUBufferUsage/GPUShaderStage/
// GPUMapMode used later; confirm against bun-webgpu docs.
setupGlobals();
// CPU Matrix Multiplication
/**
 * Multiplies two row-major matrices on the CPU: A (M×K) × B (K×N) → M×N.
 *
 * @param matrixA Row-major M×K input.
 * @param matrixB Row-major K×N input.
 * @param M Rows of A / rows of the result.
 * @param K Columns of A / rows of B.
 * @param N Columns of B / columns of the result.
 * @returns Row-major M×N product.
 */
function matrixMultiplyCPU(
  matrixA: Float32Array,
  matrixB: Float32Array,
  M: number,
  K: number,
  N: number
): Float32Array {
  const out = new Float32Array(M * N);
  for (let r = 0; r < M; r++) {
    // Hoist the row offsets; the inner-loop arithmetic is unchanged.
    const aRow = r * K;
    const outRow = r * N;
    for (let c = 0; c < N; c++) {
      let acc = 0;
      for (let k = 0; k < K; k++) {
        acc += matrixA[aRow + k] * matrixB[k * N + c];
      }
      out[outRow + c] = acc;
    }
  }
  return out;
}
// GPU Matrix Multiplication
/**
 * Multiplies two row-major matrices (A: M×K, B: K×N) with a WebGPU compute
 * shader and returns the M×N result copied back to the CPU.
 *
 * Fixes over the previous version:
 *  - all four GPU buffers are now released in a `finally`, so a failure in
 *    shader creation, submission, or `mapAsync` no longer leaks them;
 *  - the readback avoids a redundant extra Float32Array allocation;
 *  - upload buffers drop the unused COPY_DST usage (they are written via
 *    `mappedAtCreation`, never via a queue copy).
 *
 * @param device  WebGPU device used to allocate buffers and dispatch work.
 * @param matrixA Row-major M×K input.
 * @param matrixB Row-major K×N input.
 * @returns Row-major M×N product.
 */
async function matrixMultiplyGPU(
  device: GPUDevice,
  matrixA: Float32Array,
  matrixB: Float32Array,
  M: number,
  K: number,
  N: number
): Promise<Float32Array> {
  // Dimensions are baked into the WGSL source as compile-time constants.
  const shaderCode = `
    @group(0) @binding(0) var<storage, read> matrixA: array<f32>;
    @group(0) @binding(1) var<storage, read> matrixB: array<f32>;
    @group(0) @binding(2) var<storage, read_write> result: array<f32>;
    @compute @workgroup_size(16, 16)
    fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
      let M = ${M}u;
      let K = ${K}u;
      let N = ${N}u;
      let row = global_id.y;
      let col = global_id.x;
      if (row >= M || col >= N) {
        return;
      }
      var sum = 0.0;
      for (var i = 0u; i < K; i = i + 1u) {
        sum = sum + matrixA[row * K + i] * matrixB[i * N + col];
      }
      result[row * N + col] = sum;
    }
  `;
  // Upload buffers: mappedAtCreation lets us write the data directly, so no
  // COPY_DST usage is needed.
  const bufferA = device.createBuffer({
    size: matrixA.byteLength,
    usage: GPUBufferUsage.STORAGE,
    mappedAtCreation: true,
  });
  new Float32Array(bufferA.getMappedRange()).set(matrixA);
  bufferA.unmap();
  const bufferB = device.createBuffer({
    size: matrixB.byteLength,
    usage: GPUBufferUsage.STORAGE,
    mappedAtCreation: true,
  });
  new Float32Array(bufferB.getMappedRange()).set(matrixB);
  bufferB.unmap();
  const resultSize = M * N * Float32Array.BYTES_PER_ELEMENT;
  const bufferResult = device.createBuffer({
    size: resultSize,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
  });
  // Staging buffer: the result is copied here, then mapped for CPU readback.
  const stagingBuffer = device.createBuffer({
    size: resultSize,
    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
  });
  try {
    // Create shader and pipeline.
    const shaderModule = device.createShaderModule({ code: shaderCode });
    const bindGroupLayout = device.createBindGroupLayout({
      entries: [
        {
          binding: 0,
          visibility: GPUShaderStage.COMPUTE,
          buffer: { type: "read-only-storage" as GPUBufferBindingType },
        },
        {
          binding: 1,
          visibility: GPUShaderStage.COMPUTE,
          buffer: { type: "read-only-storage" as GPUBufferBindingType },
        },
        {
          binding: 2,
          visibility: GPUShaderStage.COMPUTE,
          buffer: { type: "storage" as GPUBufferBindingType },
        },
      ],
    });
    const computePipeline = device.createComputePipeline({
      layout: device.createPipelineLayout({
        bindGroupLayouts: [bindGroupLayout],
      }),
      compute: {
        module: shaderModule,
        entryPoint: "main",
      },
    });
    const bindGroup = device.createBindGroup({
      layout: bindGroupLayout,
      entries: [
        { binding: 0, resource: { buffer: bufferA } },
        { binding: 1, resource: { buffer: bufferB } },
        { binding: 2, resource: { buffer: bufferResult } },
      ],
    });
    // Encode and submit: one workgroup covers a 16×16 tile of the output.
    const commandEncoder = device.createCommandEncoder();
    const passEncoder = commandEncoder.beginComputePass();
    passEncoder.setPipeline(computePipeline);
    passEncoder.setBindGroup(0, bindGroup);
    passEncoder.dispatchWorkgroups(Math.ceil(N / 16), Math.ceil(M / 16));
    passEncoder.end();
    commandEncoder.copyBufferToBuffer(
      bufferResult,
      0,
      stagingBuffer,
      0,
      resultSize
    );
    device.queue.submit([commandEncoder.finish()]);
    // Read results. slice() copies out of the mapped range because the
    // backing ArrayBuffer is detached by unmap()/destroy().
    await stagingBuffer.mapAsync(GPUMapMode.READ);
    const resultArray = new Float32Array(stagingBuffer.getMappedRange()).slice();
    stagingBuffer.unmap();
    return resultArray;
  } finally {
    // Release GPU memory on every path, including thrown errors.
    bufferA.destroy();
    bufferB.destroy();
    bufferResult.destroy();
    stagingBuffer.destroy();
  }
}
/**
 * Renders a row-major matrix as text: each value formatted to two decimals
 * and right-padded to 8 characters, values separated by a space, one row per
 * line (every line newline-terminated).
 */
function formatMatrix(matrix: Float32Array, rows: number, cols: number): string {
  const lines: string[] = [];
  for (let r = 0; r < rows; r++) {
    const cells: string[] = [];
    for (let c = 0; c < cols; c++) {
      cells.push(matrix[r * cols + c].toFixed(2).padStart(8));
    }
    lines.push(cells.join(" "));
  }
  // map-join (rather than join("\n") + "\n") so an empty matrix yields "".
  return lines.map((line) => line + "\n").join("");
}
/**
 * Element-wise approximate equality of two Float32Arrays.
 *
 * @param tolerance Maximum allowed absolute difference per element.
 * @returns false when lengths differ, any pair differs by more than
 *          `tolerance`, or either element is NaN.
 */
function arraysEqual(a: Float32Array, b: Float32Array, tolerance = 0.0001): boolean {
  if (a.length !== b.length) return false;
  for (let i = 0; i < a.length; i++) {
    // Negated <= so NaN in either operand fails the check. The previous
    // `> tolerance` comparison was false for NaN, silently treating any
    // NaN pair as equal.
    if (!(Math.abs(a[i] - b[i]) <= tolerance)) return false;
  }
  return true;
}
/**
 * Benchmarks CPU vs GPU matrix multiplication over a list of sizes, verifies
 * both implementations agree within tolerance, and prints timings/speedup.
 *
 * Requests the high-performance adapter so dual-GPU machines (e.g. a 2015
 * MacBook Pro with Intel Iris Pro + AMD Radeon R9 M370X) run the compute on
 * the discrete GPU instead of the default integrated one.
 */
async function runBenchmark() {
  // Test with different matrix sizes - going MUCH larger
  const sizes = [
    { M: 10, K: 10, N: 10 },
    { M: 100, K: 100, N: 100 },
    // { M: 500, K: 500, N: 500 },
    // { M: 1000, K: 1000, N: 1000 },
    // { M: 2000, K: 2000, N: 2000 },
    //{ M: 3000, K: 3000, N: 3000 },
    //{ M: 4000, K: 4000, N: 4000 },
  ];
  // Initialize GPU. With no options, requestAdapter() typically returns the
  // integrated GPU on dual-GPU machines; "high-performance" asks the
  // implementation for the discrete adapter instead.
  const adapter = await navigator.gpu.requestAdapter({
    powerPreference: "high-performance",
  });
  if (!adapter) {
    throw new Error("No GPU adapter found");
  }
  console.log(adapter);
  const device = await adapter.requestDevice();
  if (!device) {
    throw new Error("Failed to get GPU device");
  }
  console.log("device");
  console.log(device);
  console.log("Running benchmarks...\n");
  for (const { M, K, N } of sizes) {
    console.log(`=== Matrix Size: ${M}×${K} × ${K}×${N} (${M * K + K * N} elements) ===`);
    // Generate random matrices
    const matrixA = new Float32Array(M * K);
    const matrixB = new Float32Array(K * N);
    for (let i = 0; i < matrixA.length; i++) {
      matrixA[i] = Math.random() * 10;
    }
    for (let i = 0; i < matrixB.length; i++) {
      matrixB[i] = Math.random() * 10;
    }
    // Benchmark CPU
    const cpuStart = performance.now();
    const cpuResult = matrixMultiplyCPU(matrixA, matrixB, M, K, N);
    const cpuTime = performance.now() - cpuStart;
    // Benchmark GPU (including data transfer)
    const gpuStart = performance.now();
    const gpuResult = await matrixMultiplyGPU(device, matrixA, matrixB, M, K, N);
    const gpuTime = performance.now() - gpuStart;
    // Verify results match
    const resultsMatch = arraysEqual(cpuResult, gpuResult);
    const speedup = cpuTime / gpuTime;
    const winner = speedup > 1 ? "🏆 GPU wins!" : "CPU wins";
    console.log(`CPU Time: ${cpuTime.toFixed(3)}ms`);
    console.log(`GPU Time: ${gpuTime.toFixed(3)}ms (including data transfer)`);
    console.log(`Speedup: ${speedup.toFixed(2)}x - ${winner}`);
    console.log(`Results match: ${resultsMatch ? '✓' : '✗'}`);
    if (!resultsMatch && M <= 100) {
      // Show a few sample values for debugging
      console.log(`Sample CPU values: [${Array.from(cpuResult.slice(0, 5)).map(v => v.toFixed(2)).join(', ')}...]`);
      console.log(`Sample GPU values: [${Array.from(gpuResult.slice(0, 5)).map(v => v.toFixed(2)).join(', ')}...]`);
    }
    console.log("");
  }
  device.destroy();
  console.log("\nNote: GPU overhead includes buffer creation, data transfer,");
  console.log("shader compilation, and result retrieval. For very large matrices");
  console.log("or repeated operations, GPU will significantly outperform CPU.");
}
runBenchmark().catch(console.error);```
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
enhancement — New feature or request