I was wondering how to inspect or dump the contents of NVIDIA GPU global memory at an arbitrary but valid address (not necessarily one returned by CUDA's memory management APIs), for example an address that points to some symbol, through cuda-gdb.
Let us take a small example program to illustrate the situation:
/*********************************post.cu***********************************/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define CUDA_SAFECALL(call)                                                 \
    {                                                                       \
        call;                                                               \
        cudaError err = cudaGetLastError();                                 \
        if (cudaSuccess != err) {                                           \
            fprintf(                                                        \
                stderr,                                                     \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }
// CUDA kernel that adds two vectors, each thread handles one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

// CUDA kernel that doubles the elements of a vector
__global__ void vecDouble(double *a, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) a[id] *= 2;
}

// CUDA kernel that halves the elements of a vector
__global__ void vecHalve(double *a, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) a[id] /= 2;
}

typedef void (*fp)(double *, double *, double *, int);
typedef void (*fp1)(double *, int);

__device__ fp kernelPtrvecAdd = vecAdd;
__device__ fp1 kernelPtrvecDouble = vecDouble;
__device__ fp1 kernelPtrvecHalve = vecHalve;

// Parent kernel that launches vecAdd dynamically
__global__ void parentKernel(fp kernelPtr, double *d_a, double *d_b, double *d_c, int n) {
    int blockSize = 1024;
    int gridSize = (int)ceil((float)n / blockSize);
    printf("Parent kernel:: kernelPtr: %p\n", kernelPtr);
    // Launch the vecAdd kernel dynamically from the device
    kernelPtr<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
}

__global__ void breakpointKernel() {
    printf("Breakpoint Kernel\n");
}

void breakpointFunction() {
    printf("Breakpoint Function\n");
}
int main(int argc, char *argv[]) {
    // Size of vectors
    int n = 10000;
    if (argc > 1) n = atoi(argv[1]);

    // Host input vectors
    double *h_a, *h_b, *h_c;
    // Device input vectors
    double *d_a, *d_b, *d_c;

    // Size in bytes of each vector
    size_t bytes = n * sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double *)malloc(bytes);
    h_b = (double *)malloc(bytes);
    h_c = (double *)malloc(bytes);

    // Initialize vectors on host
    for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
        h_c[i] = 0;
    }

    fp h_kernelPtrvecAdd;
    fp1 h_kernelPtrvecDouble;
    fp1 h_kernelPtrvecHalve;
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecAdd, kernelPtrvecAdd, sizeof(fp)));
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecDouble, kernelPtrvecDouble, sizeof(fp1)));
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecHalve, kernelPtrvecHalve, sizeof(fp1)));

    printf("Device vecAdd Ptr: %p\n", h_kernelPtrvecAdd);
    printf("Host vecAdd Ptr: %p\n", vecAdd);
    printf("Device vecDouble Ptr: %p\n", h_kernelPtrvecDouble);
    printf("Host vecDouble Ptr: %p\n", vecDouble);
    printf("Device vecHalve Ptr: %p\n", h_kernelPtrvecHalve);
    printf("Host vecHalve Ptr: %p\n", vecHalve);

    // Create CUDA stream
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Allocate memory for each vector on GPU using async memory allocation
    cudaMallocAsync(&d_a, bytes, stream);
    cudaMallocAsync(&d_b, bytes, stream);
    cudaMallocAsync(&d_c, bytes, stream);
    cudaStreamSynchronize(stream);

    printf("d_a: %p\n", d_a);
    printf("d_b: %p\n", d_b);
    printf("d_c: %p\n", d_c);

    // Copy host vectors to device using async memory copy
    cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    // Launch parent kernel that launches vecAdd dynamically
    parentKernel<<<1, 1, 0, stream>>>(h_kernelPtrvecAdd, d_a, d_b, d_c, n);

    int blockSize, gridSize;
    // Number of threads in each thread block
    blockSize = 1024;
    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n / blockSize);

    vecDouble<<<gridSize, blockSize, 0, stream>>>(d_a, n);
    vecDouble<<<gridSize, blockSize, 0, stream>>>(d_b, n);
    vecAdd<<<gridSize, blockSize, 0, stream>>>(d_a, d_b, d_c, n);
    vecHalve<<<gridSize, blockSize, 0, stream>>>(d_c, n);

    // Synchronize the stream to ensure everything is done
    cudaStreamSynchronize(stream);
    // Copy result vector back to host using async memory copy
    cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);
    // Wait for the copy to finish before reading h_c on the host
    cudaStreamSynchronize(stream);

    // Sum up vector c and print result divided by n; this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++) sum += h_c[i];
    printf("Final sum = %f; sum/n = %f (should be ~1)\n", sum, sum / n);
    breakpointKernel<<<1, 1, 0, stream>>>();
    breakpointFunction();

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // Release device memory using async memory deallocation
    cudaFreeAsync(d_a, stream);
    cudaFreeAsync(d_b, stream);
    cudaFreeAsync(d_c, stream);
    cudaStreamDestroy(stream);

    return 0;
}
$ nvcc -g -G -o post post.cu -rdc=true
Now let's run the program under cuda-gdb:
$ cuda-gdb ./post
(cuda-gdb) break breakpointKernel()
Breakpoint 1 at 0xcf30: file post.cu, line 53.
(cuda-gdb) break breakpointFunction()
Breakpoint 2 at 0xbe2f: file post.cu, line 58.
(cuda-gdb) run
Starting program: post
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff54c0000 (LWP 22501)]
[Detaching after fork from child process 22502]
[New Thread 0x7ffff4993000 (LWP 22511)]
[New Thread 0x7fffe89ff000 (LWP 22512)]
[New Thread 0x7fffe3fff000 (LWP 22513)]
Device vecAdd Ptr: 0x7fffceaff800
Host vecAdd Ptr: 0x555555560957
Device vecDouble Ptr: 0x7fffceaff300
Host vecDouble Ptr: 0x555555560ac7
Device vecHalve Ptr: 0x7fffceafee00
Host vecHalve Ptr: 0x555555560c29
d_a: 0x302000000
d_b: 0x302013a00
d_c: 0x302027400
Parent kernel:: kernelPtr: 0x7fffceaff800
Final sum = 10000.000000; sum/n = 1.000000 (should be ~1)
[Switching focus to CUDA kernel 0, grid 8, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 1, lane 0]
Thread 1 "post" hit Breakpoint 1, breakpointKernel<<<(1,1,1),(1,1,1)>>> () at post.cu:54
54 printf("Breakpoint Kernel\n");
So, we are inside a kernel.
Trying to dump the memory contents of array a on the device (d_a):
(cuda-gdb) x /10x (@global void*) 0x302000000
0x302000000: 0x00000000 0x00000000 0x95d4dc81 0x3ff6a889
0x302000010: 0x036d9261 0x3ffa7553 0x687dd0a8 0x3fa4648f
0x302000020: 0xd7ec65f2 0x3ff253f7
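As a side note, since d_a holds doubles, the same region can also be dumped directly as 8-byte floating-point values instead of raw hex words. This is just a usage sketch; the printed values depend on the point at which the dump is taken (here, after vecDouble has already modified d_a):
(cuda-gdb) x /4fg (@global double*) 0x302000000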
So we get sensible values back. Now, trying to access the memory contents at the device pointer of vecAdd gives:
(cuda-gdb) x /10x (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 4 bytes of global memory from 0x7fffceaff800, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
(cuda-gdb) x /10bx (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 1 bytes of global memory from 0x7fffceaff800, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
(cuda-gdb) x /10i (@global void*) 0x7fffceaff800
0x7fffceaff800:
0x7fffceaff801:
0x7fffceaff802:
0x7fffceaff803:
0x7fffceaff804:
0x7fffceaff805:
0x7fffceaff806:
0x7fffceaff807:
0x7fffceaff808:
0x7fffceaff809:
Let us continue and hit the breakpoint in the host function:
(cuda-gdb) continue
Continuing.
Breakpoint Kernel
Thread 1 "post" hit Breakpoint 2, breakpointFunction () at post.cu:58
58 printf("Breakpoint Function\n");
(cuda-gdb)
Now let us try to access the content at the host pointer of vecAdd:
(cuda-gdb) x /10i 0x555555560957
0x555555560957 <_Z6vecAddPdS_S_i>: endbr64
0x55555556095b <_Z6vecAddPdS_S_i+4>: push %rbp
0x55555556095c <_Z6vecAddPdS_S_i+5>: mov %rsp,%rbp
0x55555556095f <_Z6vecAddPdS_S_i+8>: sub $0x20,%rsp
0x555555560963 <_Z6vecAddPdS_S_i+12>: mov %rdi,-0x8(%rbp)
0x555555560967 <_Z6vecAddPdS_S_i+16>: mov %rsi,-0x10(%rbp)
0x55555556096b <_Z6vecAddPdS_S_i+20>: mov %rdx,-0x18(%rbp)
0x55555556096f <_Z6vecAddPdS_S_i+24>: mov %ecx,-0x1c(%rbp)
0x555555560972 <_Z6vecAddPdS_S_i+27>: mov -0x1c(%rbp),%ecx
0x555555560975 <_Z6vecAddPdS_S_i+30>: mov -0x18(%rbp),%rdx
The above are valid x86_64 instructions.
Now trying to get memory content of the device pointer from the host function breakpoint:
(cuda-gdb) x /10i (@global void*) 0x7fffceaff800
0x7fffceaff800: add %al,(%rax)
0x7fffceaff802: add %al,(%rax)
0x7fffceaff804: add %al,(%rax)
0x7fffceaff806: add %al,(%rax)
0x7fffceaff808: add %al,(%rax)
0x7fffceaff80a: add %al,(%rax)
0x7fffceaff80c: add %al,(%rax)
0x7fffceaff80e: add %al,(%rax)
0x7fffceaff810: add %al,(%rax)
0x7fffceaff812: add %al,(%rax)
I get some weird stretch of add instructions (which is exactly the same for the other kernels, like vecDouble and vecHalve). Trying it out for vecDouble:
(cuda-gdb) x /10i (@global void*) 0x7fffceaff300
0x7fffceaff300: add %al,(%rax)
0x7fffceaff302: add %al,(%rax)
0x7fffceaff304: add %al,(%rax)
0x7fffceaff306: add %al,(%rax)
0x7fffceaff308: add %al,(%rax)
0x7fffceaff30a: add %al,(%rax)
0x7fffceaff30c: add %al,(%rax)
0x7fffceaff30e: add %al,(%rax)
0x7fffceaff310: add %al,(%rax)
0x7fffceaff312: add %al,(%rax)
These weird add instructions seem erroneous to me anyway.
My question is: how could one dump the memory contents at some arbitrary (but valid) device-side pointer corresponding to some symbol, or the memory around it?
cuda-gdb version:
NVIDIA (R) CUDA Debugger
12.0 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 12.1
Now, trying to access the memory contents at the device pointer of vecAdd gives:
(cuda-gdb) x /10x (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 4 bytes of global memory from 0x7fffceaff800, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
The CUDA driver might inject code after loading user binaries, and debugging tools do not support printing it as of today. However, you can print the user-facing part of the kernel with no issues:
(cuda-gdb) x/3i _Z6vecAddPdS_S_i
0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>: MOV R1, c[0x0][0x28]
0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>: MOV R2, RZ
0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>: LDC.64 R2, c[0x0][R2+0x160]
I get some weird stretch of add instructions (which is exactly the same for the other kernels, like vecDouble and vecHalve).
That is because focus has been switched to the host side, from which you cannot dereference device-side pointers. Illustration below:
CUDA thread hit Breakpoint 1.2, breakpointKernel<<<(1,1,1),(1,1,1)>>> () at test.cu:55
55 printf("Breakpoint Kernel\n");
(cuda-gdb) x/3i _Z6vecAddPdS_S_i
0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>: MOV R1, c[0x0][0x28]
0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>: MOV R2, RZ
0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>: LDC.64 R2, c[0x0][R2+0x160]
(cuda-gdb) x/3i 0x7fffd9c5ed00
0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>: MOV R1, c[0x0][0x28]
0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>: MOV R2, RZ
0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>: LDC.64 R2, c[0x0][R2+0x160]
Now, let's switch to the host:
(cuda-gdb) thread 1
[Switching to thread 1 (Thread 0x7ffff7e91000 (LWP 2002484))]
#0 breakpointFunction () at test.cu:59
59 printf("Breakpoint Function\n");
(cuda-gdb) x/3i 0x7fffd9c5ed00
0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>: add %al,(%rax)
0x7fffd9c5ed02 <_Z6vecAddPdS_S_i+2>: add %al,(%rax)
0x7fffd9c5ed04 <_Z6vecAddPdS_S_i+4>: add %al,(%rax)
If you switch back to the device, you will be able to dereference them:
(cuda-gdb) cuda thread (0)
[Switching focus to CUDA kernel 0, grid 7, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
55 printf("Breakpoint Kernel\n");
(cuda-gdb) x/3i 0x7fffd9c5ed00
0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>: MOV R1, c[0x0][0x28]
0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>: MOV R2, RZ
0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>: LDC.64 R2, c[0x0][R2+0x160]
In x86_64, add %al,(%rax) is the encoding of 0x0000, so you are just dumping a bunch of zeroes. This is because the CUDA driver maps an empty page on the host. You can verify this in the debugger:
(cuda-gdb) thread 1
[Switching to thread 1 (Thread 0x7ffff7e91000 (LWP 2002484))]
#0 breakpointFunction () at test.cu:59
59 printf("Breakpoint Function\n");
(cuda-gdb) info proc mappings
process 2002484
Mapped address spaces:
Start Addr End Addr Size Offset Perms objfile
...
0x7fffd9577000 0x7fffda000000 0xa89000 0x0 ---p
...
Note that the device code address 0x7fffd9c5ed00 falls inside this anonymous, permission-less (---p) mapping, which is why reading it from the host thread simply returns zeroes.
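As a final aside, if the goal is simply to dump the bytes behind an arbitrary but valid device data pointer (such as d_a above) without the debugger, the same can be done from host code by copying the bytes back with cudaMemcpy. The sketch below is only illustrative, and the helper name dumpDeviceBytes is made up for this example; it applies to data allocations, not to device code addresses like the kernel pointers discussed above.
// Illustrative helper (not from the original post): copy `count` bytes from a
// device global-memory address into a host buffer and print them as hex.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

void dumpDeviceBytes(const void *devPtr, size_t count) {
    unsigned char *host = (unsigned char *)malloc(count);
    cudaError_t err = cudaMemcpy(host, devPtr, count, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        // An invalid or inaccessible device address is reported here.
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    } else {
        for (size_t i = 0; i < count; i++) printf("%02x ", host[i]);
        printf("\n");
    }
    free(host);
}
Called as dumpDeviceBytes(d_a, 40) just before breakpointFunction(), this should print the same 40 bytes that the x /10x command above shows from inside the kernel.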