I'm trying use Tier 2 Metal argument buffers for a compute shader and I'm getting strange behavior. On my CPU side setup, I create two buffers and assign their GPU address to a third argument buffer. The intent is read from one and write to the other. As a sanity check, I make a fourth test the old way binding arguments explicitly. The host side code is here.
#import <Metal/Metal.h>
#include <iostream>
#include <random>
int main(int argc, const char * argv[]) {
std::vector<float> temp = std::vector<float> (1024*100, 5.0);
id<MTLDevice> device = [MTLCopyAllDevices() objectAtIndex:0];
id<MTLCommandQueue> queue = [device newCommandQueue];
id<MTLLibrary> library = [device newDefaultLibrary];
id<MTLFunction> function = [library newFunctionWithName:@"threadgroup_test"];
MTLComputePipelineDescriptor *compute = [MTLComputePipelineDescriptor new];
compute.threadGroupSizeIsMultipleOfThreadExecutionWidth = YES;
compute.computeFunction = function;
compute.maxTotalThreadsPerThreadgroup = 1024;
compute.buffers[0].mutability = MTLMutabilityImmutable;
compute.buffers[1].mutability = MTLMutabilityImmutable;
compute.buffers[2].mutability = MTLMutabilityMutable;
NSError *error;
id<MTLComputePipelineState> state = [device newComputePipelineStateWithDescriptor:compute
options:MTLPipelineOptionNone
reflection:NULL
error:&error];
if (error) {
NSLog(@"%@", error);
}
id<MTLBuffer> buffer1 = [device newBufferWithBytes:temp.data()
length:temp.size()*sizeof(float)
options:MTLResourceStorageModeShared |
MTLResourceCPUCacheModeWriteCombined];
id<MTLBuffer> buffer2 = [device newBufferWithLength:temp.size()*sizeof(float)
options:MTLResourceStorageModeShared];
std::array<uint64_t, 2> args = {
buffer1.gpuAddress,
buffer2.gpuAddress
};
id<MTLBuffer> buffer3 = [device newBufferWithBytes:args.data()
length:args.size()*sizeof(uint64_t)
options:MTLResourceStorageModeShared |
MTLResourceCPUCacheModeWriteCombined];
id<MTLBuffer> buffer4 = [device newBufferWithLength:temp.size()*sizeof(float)
options:MTLResourceStorageModeShared];
NSUInteger threads_per_group = state.maxTotalThreadsPerThreadgroup;
NSUInteger thread_groups = temp.size()/threads_per_group;
id<MTLCommandBuffer> command_buffer = [queue commandBuffer];
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDispatchType:MTLDispatchTypeSerial];
[encoder setComputePipelineState:state];
[encoder setBuffer:buffer3 offset:0 atIndex:0];
[encoder setBuffer:buffer1 offset:0 atIndex:1];
[encoder setBuffer:buffer4 offset:0 atIndex:2];
[encoder dispatchThreadgroups:MTLSizeMake(thread_groups, 1, 1)
threadsPerThreadgroup:MTLSizeMake(threads_per_group, 1, 1)];
[encoder endEncoding];
[command_buffer commit];
[command_buffer waitUntilCompleted];
const float *data1 = static_cast<float *> (buffer2.contents);
const float *data2 = static_cast<float *> (buffer4.contents);
for (size_t i = 0, ie = temp.size(); i < ie; i++) {
std::cout << i << " " << temp[i] << " " << *(data1 + i) << " " << *(data2 + i) << std::endl;
}
return 0;
}
My compute kernels just tries to assign the contents of buffer1 to buffer2 and buffer4. Where one is writing to the argument buffer and the other is reading from it.
#include <metal_stdlib>
using namespace metal;
struct kernel_args {
constant float *input;
device float *output;
};
kernel void threadgroup_test(constant kernel_args &args [[buffer(0)]],
constant float *input [[buffer(1)]],
device float *output [[buffer(2)]],
uint index [[thread_position_in_grid]]) {
args.output[index] = input[index];
output[index] = args.input[index];
}
This is where things are going strange. When I run the above code, the output shows that buffer4 was successfully written to from the contents of buffer 1, but buffer2 does not contain the values.
...
102396 5 0 5
102397 5 0 5
102398 5 0 5
102399 5 0 5
If I comment out the line,
#include <metal_stdlib>
using namespace metal;
struct kernel_args {
constant float *input;
device float *output;
};
kernel void threadgroup_test(constant kernel_args &args [[buffer(0)]],
constant float *input [[buffer(1)]],
device float *output [[buffer(2)]],
uint index [[thread_position_in_grid]]) {
// args.output[index] = input[index];
output[index] = args.input[index];
}
It no longer writes to the buffer4.
...
102396 5 0 0
102397 5 0 0
102398 5 0 0
102399 5 0 0
Changing it from kernel_args
from constant
address space to device
has no effect.
There is no strange behavior here. You are missing a useResource:
on buffer2
. Everything that you use needs to be resident in memory. buffer1
, buffer3
, and buffer4
are used directly, by being bound to MTLComputeCommandEncoder
, but buffer2
is only used as a gpuAddress
in MTLBuffer
. Therefore, you need to mark it as resident, you can do that by calling [encoder useResource:buffer2 usage:MTLResourceUsageWrite]
.