I made a shader that gives an unexpected result:
@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
  let thread = id.x;
  for (var c = 0u; c < 1000; c++) {
    any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
    if (thread == 0) {
      resultBuffer[c] += 1;
    }
  }
}
Only one thread will write to resultBuffer, so every element should be 1. The function any_compute() has no side effects; I suspect the heavy computation causes this problem. The results are correct most of the time, but sometimes there are unexpected values: some elements of resultBuffer end up as 2 or 3.
The full test program:
const cWGSL = `
@group(0) @binding(0) var<storage, read_write> resultBuffer: array<f32>;

const PI = 3.14159265358979323846;
const inv4PI = 0.25 / PI;
const eps = 1e-6;

fn any_compute(a : vec4<f32>, b : vec4<f32>) -> vec3f {
  let dist = a.xyz - b.xyz;
  let invDist = inverseSqrt(dot(dist, dist) + eps);
  let invDistCube = invDist * invDist * invDist;
  let s = b.w * invDistCube;
  return -s * inv4PI * dist;
}

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
  let thread = id.x;
  for (var c = 0u; c < 1000; c++) {
    any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
    if (thread == 0) {
      resultBuffer[c] += 1;
    }
  }
}`;
async function main() {
  const adapter = await navigator.gpu.requestAdapter();
  const device = await adapter.requestDevice();
  // to-do: check limit
  console.log(adapter);
  const maxGPUThread = 256;

  const resultBufferGPU = device.createBuffer({
    size: 4000,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
  });
  const readBufferGPU = device.createBuffer({
    size: 4000,
    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
  });

  const shaderModule = device.createShaderModule({
    code: cWGSL,
  });
  const computePipeline = device.createComputePipeline({
    layout: 'auto', // infer from shader code.
    compute: {
      module: shaderModule,
      entryPoint: "test"
    }
  });

  const bindGroupLayout = computePipeline.getBindGroupLayout(0);
  const buffers = [resultBufferGPU];
  const entries = buffers.map((b, i) => {
    return { binding: i, resource: { buffer: b } };
  });
  const bindGroup = device.createBindGroup({
    layout: bindGroupLayout,
    entries: entries
  });

  const commandEncoder = device.createCommandEncoder();
  const computePassEncoder = commandEncoder.beginComputePass();
  computePassEncoder.setPipeline(computePipeline);
  computePassEncoder.setBindGroup(0, bindGroup);
  computePassEncoder.dispatchWorkgroups(256, 1);
  computePassEncoder.end();
  commandEncoder.copyBufferToBuffer(resultBufferGPU, 0, readBufferGPU, 0, 4000);
  const gpuCommands = commandEncoder.finish();
  device.queue.submit([gpuCommands]);

  await device.queue.onSubmittedWorkDone();
  await readBufferGPU.mapAsync(GPUMapMode.READ);
  const arrayBuffer = readBufferGPU.getMappedRange();
  const result = new Float32Array(arrayBuffer);
  console.log(result);

  const resultCount = {};
  Array.from(new Set(result)).forEach(v => resultCount[v] = result.filter(x => x == v).length);
  console.log(resultCount);
}
main();
There's also a jsfiddle version. I tested it in Chrome Canary.
The count of each value will be printed to the console. The expected output is {1: 1000}.
The issue is, I think, that you're confusing workgroup size and dispatchWorkgroups.
@workgroup_size defines how many individual processes to run per workgroup. In other words, if you set @workgroup_size(256, 1) in your shader, then dispatchWorkgroups(1) will run 256 processes.
You issue the command computePassEncoder.dispatchWorkgroups(256, 1); which means you're running 256 processes, 256 times (64k processes in total).
Since thread = id.x and id.x comes from workgroup_id, that means when the first workgroup runs, workgroup_id.x will be 0 for all 256 threads (because workgroup_id is the id of the workgroup, not of the individual thread). So there are 256 threads for which thread == 0, which means there's a race between threads trying to update resultBuffer[c]. This is why the result is random.
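As a sketch of one possible fix (my addition, not from the original question, assuming the intent is that exactly one invocation in the whole dispatch does the writes): use global_invocation_id, which is unique per invocation across the dispatch, so the condition holds for exactly one thread:

@compute @workgroup_size(256, 1)
fn test(@builtin(global_invocation_id) id : vec3<u32>) {
  // global_invocation_id.x is 0 for exactly one invocation in the whole dispatch
  let thread = id.x;
  for (var c = 0u; c < 1000; c++) {
    any_compute(vec4f(1, 2, 3, 4), vec4f(1, 2, 3, 4));
    if (thread == 0) {
      resultBuffer[c] += 1;
    }
  }
}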
There are 3 ids:

- local_invocation_id (the ID within a workgroup)
- workgroup_id (the ID of a workgroup)
- global_invocation_id (workgroup_id * workgroup_size + local_invocation_id)

There's also one convenience builtin, local_invocation_index:
rowSize = workgroup_size.x
sliceSize = workgroup_size.x * workgroup_size.y
local_invocation_index =
local_invocation_id.x +
local_invocation_id.y * rowSize +
local_invocation_id.z * sliceSize
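For example, with @workgroup_size(2, 3, 4) (the size used in the code below), the invocation at local_invocation_id = (1, 2, 3) gets local_invocation_index = 1 + 2 * 2 + 3 * (2 * 3) = 23, the last of the 24 invocations in the workgroup.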
Here's some code to hopefully illustrate the point:
const dispatchCount = [4, 3, 2];
const workgroupSize = [2, 3, 4];
const arrayProd = arr => arr.reduce((a, b) => a * b);
const numThreadsPerWorkgroup = arrayProd(workgroupSize);

const code = `
// NOTE!: each vec3u element is padded to 16 bytes in a storage array
@group(0) @binding(0) var<storage, read_write> workgroupResult: array<vec3u>;
@group(0) @binding(1) var<storage, read_write> localResult: array<vec3u>;
@group(0) @binding(2) var<storage, read_write> globalResult: array<vec3u>;

const wg_size = ${numThreadsPerWorkgroup};

@compute @workgroup_size(${workgroupSize})
fn test(
    @builtin(workgroup_id) workgroup_id : vec3<u32>,
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
    @builtin(num_workgroups) num_workgroups: vec3<u32>
) {
  let workgroup_index =
      workgroup_id.x +
      workgroup_id.y * num_workgroups.x +
      workgroup_id.z * num_workgroups.x * num_workgroups.y;
  let global_invocation_index =
      workgroup_index * wg_size +
      local_invocation_index;

  workgroupResult[global_invocation_index] = workgroup_id;
  localResult[global_invocation_index] = local_invocation_id;
  globalResult[global_invocation_index] = global_invocation_id;
}
`;
async function main() {
  const adapter = await navigator.gpu?.requestAdapter();
  const device = await adapter?.requestDevice();
  if (!device) {
    console.error('need WebGPU');
    return;
  }

  const numWorkgroups = dispatchCount[0] * dispatchCount[1] * dispatchCount[2];
  const numResults = numWorkgroups * numThreadsPerWorkgroup;
  const size = numResults * 4 * 4; // one padded vec3u (4 × u32) per result

  let usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC;
  const workgroupBufferGPU = device.createBuffer({size, usage});
  const localBufferGPU = device.createBuffer({size, usage});
  const globalBufferGPU = device.createBuffer({size, usage});

  usage = GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST;
  const workgroupReadBufferGPU = device.createBuffer({size, usage});
  const localReadBufferGPU = device.createBuffer({size, usage});
  const globalReadBufferGPU = device.createBuffer({size, usage});
  const shaderModule = device.createShaderModule({code});
  const computePipeline = device.createComputePipeline({
    layout: 'auto',
    compute: {
      module: shaderModule,
      entryPoint: "test"
    }
  });

  const bindGroup = device.createBindGroup({
    layout: computePipeline.getBindGroupLayout(0),
    entries: [
      { binding: 0, resource: { buffer: workgroupBufferGPU }},
      { binding: 1, resource: { buffer: localBufferGPU }},
      { binding: 2, resource: { buffer: globalBufferGPU }},
    ],
  });

  const commandEncoder = device.createCommandEncoder();
  const computePassEncoder = commandEncoder.beginComputePass();
  computePassEncoder.setPipeline(computePipeline);
  computePassEncoder.setBindGroup(0, bindGroup);
  computePassEncoder.dispatchWorkgroups(...dispatchCount);
  computePassEncoder.end();
  commandEncoder.copyBufferToBuffer(workgroupBufferGPU, 0, workgroupReadBufferGPU, 0, size);
  commandEncoder.copyBufferToBuffer(localBufferGPU, 0, localReadBufferGPU, 0, size);
  commandEncoder.copyBufferToBuffer(globalBufferGPU, 0, globalReadBufferGPU, 0, size);
  device.queue.submit([commandEncoder.finish()]);

  await Promise.all([
    workgroupReadBufferGPU.mapAsync(GPUMapMode.READ),
    localReadBufferGPU.mapAsync(GPUMapMode.READ),
    globalReadBufferGPU.mapAsync(GPUMapMode.READ),
  ]);
  const workgroup = new Uint32Array(workgroupReadBufferGPU.getMappedRange());
  const local = new Uint32Array(localReadBufferGPU.getMappedRange());
  const global = new Uint32Array(globalReadBufferGPU.getMappedRange());

  const get3 = (arr, i) => {
    const off = i * 4;
    return `${arr[off]},${arr[off + 1]},${arr[off + 2]}`;
  };

  for (let i = 0; i < numResults; ++i) {
    if (i % numThreadsPerWorkgroup === 0) {
      log(`g-index workgroup local global dispatch: ${i / numThreadsPerWorkgroup}`);
    }
    log(`${i.toString().padStart(3)}: ${get3(workgroup, i)} ${get3(local, i)} ${get3(global, i)}`);
  }
}

function log(...args) {
  const elem = document.createElement('pre');
  elem.textContent = args.join(' ');
  document.body.appendChild(elem);
}
main();
pre { margin: 0; }
Notice that in the first dispatch, global and local are the same; for the rest of the dispatches they aren't. Also notice that workgroup_id is the same for all threads in the same workgroup. I don't know all the reasons why it's designed this way, but within a single workgroup the threads can share var<workgroup> variables.
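For example (a minimal sketch of mine, not part of the snippet above, assuming a storage buffer named output), each workgroup can accumulate into a shared atomic:

@group(0) @binding(0) var<storage, read_write> output: array<u32>;

// shared by all invocations in one workgroup; zero-initialized at the start
var<workgroup> count: atomic<u32>;

@compute @workgroup_size(256)
fn test(@builtin(workgroup_id) wid : vec3<u32>,
        @builtin(local_invocation_index) lidx : u32) {
  atomicAdd(&count, 1u);
  workgroupBarrier(); // wait until every invocation in the workgroup has added
  if (lidx == 0) {
    output[wid.x] = atomicLoad(&count); // 256 for every workgroup
  }
}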
Note: you do not need to call await device.queue.onSubmittedWorkDone(). From the spec, under step 4 of mapAsync:

after the completion of currently-enqueued operations that use this,
...
Issue the map success steps on the contentTimeline.
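So the first snippet could simply drop that line (a sketch, using the names from the first snippet):

device.queue.submit([gpuCommands]);
// mapAsync itself waits for the enqueued operations that use readBufferGPU
await readBufferGPU.mapAsync(GPUMapMode.READ);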