Tags: cuda, nvidia, memory-dump, cuda-gdb

Dump/inspect NVIDIA GPU global memory contents at arbitrary (but valid) addresses


I would like to inspect or dump the contents of NVIDIA GPU global memory at an arbitrary (but valid) address, not necessarily one returned by CUDA's memory management APIs; for example, an address that points to some symbol, as seen through cuda-gdb.

Let us take a small example program to illustrate the situation:

/*********************************post.cu***********************************/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define CUDA_SAFECALL(call)                                                 \
    {                                                                       \
        call;                                                               \
        cudaError err = cudaGetLastError();                                 \
        if (cudaSuccess != err) {                                           \
            fprintf(                                                        \
                stderr,                                                     \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }

// CUDA kernel that adds two vectors, each thread handles one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

// CUDA kernel that doubles the elements of a vector
__global__ void vecDouble(double *a, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) a[id] *= 2;
}

// CUDA kernel that halves the elements of a vector
__global__ void vecHalve(double *a, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) a[id] /= 2;
}

typedef void (*fp)(double *, double *, double *, int);
typedef void (*fp1)(double *, int);

__device__ fp kernelPtrvecAdd = vecAdd;
__device__ fp1 kernelPtrvecDouble = vecDouble;
__device__ fp1 kernelPtrvecHalve = vecHalve;

// Parent kernel that launches vecAdd dynamically
__global__ void parentKernel(fp kernelPtr, double *d_a, double *d_b, double *d_c, int n) {
    int blockSize = 1024;
    int gridSize = (int)ceil((float)n / blockSize);
    printf("Parent kernel:: kernelPtr: %p\n", kernelPtr);
    // Launch the vecAdd kernel dynamically from the device
    kernelPtr<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
}

__global__ void breakpointKernel(){
    printf("Breakpoint Kernel\n");
}

void breakpointFunction(){
    printf("Breakpoint Function\n");
}

int main(int argc, char *argv[]) {
    // Size of vectors
    int n = 10000;
    if (argc > 1) n = atoi(argv[1]);
    
    // Host input vectors
    double *h_a, *h_b, *h_c;

    // Device input vectors
    double *d_a, *d_b, *d_c;

    // Size in bytes of each vector
    size_t bytes = n * sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double *)malloc(bytes);
    h_b = (double *)malloc(bytes);
    h_c = (double *)malloc(bytes);

    // Initialize vectors on host
    for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
        h_c[i] = 0;
    }

    fp h_kernelPtrvecAdd;
    fp1 h_kernelPtrvecDouble;
    fp1 h_kernelPtrvecHalve;
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecAdd, kernelPtrvecAdd, sizeof(fp)));
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecDouble, kernelPtrvecDouble, sizeof(fp1)));
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecHalve, kernelPtrvecHalve, sizeof(fp1)));

    printf("Device vecAdd Ptr: %p\n", h_kernelPtrvecAdd);
    printf("Host   vecAdd Ptr: %p\n", vecAdd);

    printf("Device vecDouble Ptr: %p\n", h_kernelPtrvecDouble);
    printf("Host   vecDouble Ptr: %p\n", vecDouble);

    printf("Device vecHalve Ptr: %p\n", h_kernelPtrvecHalve);
    printf("Host   vecHalve Ptr: %p\n", vecHalve);

    // Create CUDA stream
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Allocate memory for each vector on GPU using async memory allocation
    cudaMallocAsync(&d_a, bytes, stream);
    cudaMallocAsync(&d_b, bytes, stream);
    cudaMallocAsync(&d_c, bytes, stream);

    cudaStreamSynchronize(stream);

    printf("d_a: %p\n", d_a);
    printf("d_b: %p\n", d_b);
    printf("d_c: %p\n", d_c);

    // Copy host vectors to device using async memory copy
    cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);
    
    cudaStreamSynchronize(stream);

    // Launch parent kernel that launches vecAdd dynamically
    (parentKernel<<<1, 1, 0, stream>>>(h_kernelPtrvecAdd, d_a, d_b, d_c, n));

    int blockSize, gridSize;
    
    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n / blockSize);

    vecDouble<<<gridSize, blockSize, 0, stream>>>(d_a, n);
    vecDouble<<<gridSize, blockSize, 0, stream>>>(d_b, n);
    vecAdd<<<gridSize, blockSize, 0, stream>>>(d_a, d_b, d_c, n);
    vecHalve<<<gridSize, blockSize, 0, stream>>>(d_c, n);
    
    // Synchronize the stream to ensure everything is done
    cudaStreamSynchronize(stream);
    
    // Copy array back to host using async memory copy
    cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);

    // Wait for the copy to finish before reading h_c on the host
    cudaStreamSynchronize(stream);

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++) sum += h_c[i];
    printf("Final sum = %f; sum/n = %f (should be ~1)\n", sum, sum / n);

    breakpointKernel<<<1, 1, 0, stream>>>();
    breakpointFunction();
    
    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // Release device memory using async memory deallocation
    cudaFreeAsync(d_a, stream);
    cudaFreeAsync(d_b, stream);
    cudaFreeAsync(d_c, stream);


    cudaStreamDestroy(stream);
    return 0;
}



$ nvcc -g -G -o post post.cu -rdc=true

The -G flag embeds device-side debug information (and disables device code optimization), and -rdc=true is required for the device-side kernel launch in parentKernel. Now let's run the program under cuda-gdb:

$ cuda-gdb ./post
(cuda-gdb) break breakpointKernel() 
Breakpoint 1 at 0xcf30: file post.cu, line 53.
(cuda-gdb) break breakpointFunction() 
Breakpoint 2 at 0xbe2f: file post.cu, line 58.
(cuda-gdb) run                        
Starting program: post 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff54c0000 (LWP 22501)]
[Detaching after fork from child process 22502]
[New Thread 0x7ffff4993000 (LWP 22511)]
[New Thread 0x7fffe89ff000 (LWP 22512)]
[New Thread 0x7fffe3fff000 (LWP 22513)]
Device vecAdd Ptr: 0x7fffceaff800
Host   vecAdd Ptr: 0x555555560957
Device vecDouble Ptr: 0x7fffceaff300
Host   vecDouble Ptr: 0x555555560ac7
Device vecHalve Ptr: 0x7fffceafee00
Host   vecHalve Ptr: 0x555555560c29
d_a: 0x302000000
d_b: 0x302013a00
d_c: 0x302027400
Parent kernel:: kernelPtr: 0x7fffceaff800
Final sum = 10000.000000; sum/n = 1.000000 (should be ~1)
[Switching focus to CUDA kernel 0, grid 8, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 1, lane 0]

Thread 1 "post" hit Breakpoint 1, breakpointKernel<<<(1,1,1),(1,1,1)>>> () at post.cu:54
54          printf("Breakpoint Kernel\n");

So, we are inside a kernel.

Trying to dump the memory contents of array a on the device (d_a):

(cuda-gdb) x /10x (@global void*) 0x302000000
0x302000000:    0x00000000      0x00000000      0x95d4dc81      0x3ff6a889
0x302000010:    0x036d9261      0x3ffa7553      0x687dd0a8      0x3fa4648f
0x302000020:    0xd7ec65f2      0x3ff253f7
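
The words shown are the raw little-endian bit patterns of the doubles in d_a; at this point vecDouble has already run on it, so for example the pair 0x95d4dc81 0x3ff6a889 is the 64-bit value 0x3ff6a88995d4dc81, i.e. roughly 1.4161 = 2*sin^2(1). As an aside (not part of the original session), cuda-gdb should also be able to print the same region directly as doubles with something like:

(cuda-gdb) x /4gf (@global double*) 0x302000000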

We get sensible values back. Now, trying to access the memory contents at the device pointer of vecAdd gives:

(cuda-gdb) x /10x (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 4 bytes of global memory from 0x7fffceaff800
, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).

(cuda-gdb) x /10bx (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 1 bytes of global memory from 0x7fffceaff800
, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).

(cuda-gdb) x /10i (@global void*) 0x7fffceaff800 
   0x7fffceaff800:
   0x7fffceaff801:
   0x7fffceaff802:
   0x7fffceaff803:
   0x7fffceaff804:
   0x7fffceaff805:
   0x7fffceaff806:
   0x7fffceaff807:
   0x7fffceaff808:
   0x7fffceaff809:

Let us continue and hit the breakpoint in the host function:

(cuda-gdb) continue
Continuing.
Breakpoint Kernel

Thread 1 "post" hit Breakpoint 2, breakpointFunction () at post.cu:58
58          printf("Breakpoint Function\n");
(cuda-gdb) 

Now let us try to access the contents at the host pointer of vecAdd:


(cuda-gdb) x /10i 0x555555560957                
   0x555555560957 <_Z6vecAddPdS_S_i>:   endbr64 
   0x55555556095b <_Z6vecAddPdS_S_i+4>: push   %rbp
   0x55555556095c <_Z6vecAddPdS_S_i+5>: mov    %rsp,%rbp
   0x55555556095f <_Z6vecAddPdS_S_i+8>: sub    $0x20,%rsp
   0x555555560963 <_Z6vecAddPdS_S_i+12>:        mov    %rdi,-0x8(%rbp)
   0x555555560967 <_Z6vecAddPdS_S_i+16>:        mov    %rsi,-0x10(%rbp)
   0x55555556096b <_Z6vecAddPdS_S_i+20>:        mov    %rdx,-0x18(%rbp)
   0x55555556096f <_Z6vecAddPdS_S_i+24>:        mov    %ecx,-0x1c(%rbp)
   0x555555560972 <_Z6vecAddPdS_S_i+27>:        mov    -0x1c(%rbp),%ecx
   0x555555560975 <_Z6vecAddPdS_S_i+30>:        mov    -0x18(%rbp),%rdx

The above are valid x86_64 instructions.

Now, trying to read the memory contents at the device pointer of vecAdd from the host-function breakpoint:

(cuda-gdb) x /10i (@global void*) 0x7fffceaff800
   0x7fffceaff800:      add    %al,(%rax)
   0x7fffceaff802:      add    %al,(%rax)
   0x7fffceaff804:      add    %al,(%rax)
   0x7fffceaff806:      add    %al,(%rax)
   0x7fffceaff808:      add    %al,(%rax)
   0x7fffceaff80a:      add    %al,(%rax)
   0x7fffceaff80c:      add    %al,(%rax)
   0x7fffceaff80e:      add    %al,(%rax)
   0x7fffceaff810:      add    %al,(%rax)
   0x7fffceaff812:      add    %al,(%rax)

I get a weird stretch of add instructions, which is exactly the same for the other kernels like vecDouble and vecHalve.

Trying out for vecDouble:


(cuda-gdb) x /10i (@global void*)  0x7fffceaff300        
   0x7fffceaff300:      add    %al,(%rax)
   0x7fffceaff302:      add    %al,(%rax)
   0x7fffceaff304:      add    %al,(%rax)
   0x7fffceaff306:      add    %al,(%rax)
   0x7fffceaff308:      add    %al,(%rax)
   0x7fffceaff30a:      add    %al,(%rax)
   0x7fffceaff30c:      add    %al,(%rax)
   0x7fffceaff30e:      add    %al,(%rax)
   0x7fffceaff310:      add    %al,(%rax)
   0x7fffceaff312:      add    %al,(%rax)

In any case, these repeated add instructions look bogus to me.

My question is: how can one dump the memory contents at an arbitrary (but valid) device-side pointer, such as one corresponding to a symbol, or the memory around it?


cuda-gdb version:

NVIDIA (R) CUDA Debugger
12.0 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 12.1

Solution

  • Now trying to access the memory content at the device pointer of vecAdd gives:

    (cuda-gdb) x /10x (@global void*) 0x7fffceaff800
    0x7fffceaff800: Error: Failed to read 4 bytes of global memory from 0x7fffceaff800
    , error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
    

    The CUDA driver might inject code after loading user binaries, and debugging tools do not support printing it as of today. However, you can print the user-facing part of the kernel (here via its mangled name _Z6vecAddPdS_S_i, i.e. vecAdd(double*, double*, double*, int)) with no issues:

    (cuda-gdb) x/3i _Z6vecAddPdS_S_i
       0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>:   MOV R1, c[0x0][0x28]
       0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>:        MOV R2, RZ
       0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>:        LDC.64 R2, c[0x0][R2+0x160]
    
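    In short: with the focus on a CUDA thread you can disassemble the kernel through its symbol (or through the address the debugger resolves that symbol to), whereas the raw value stored in the __device__ function pointer (0x7fffceaff800 in the question's run) points at driver-injected code that the tools cannot display today.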

    I get a weird stretch of add instructions, which is exactly the same for the other kernels like vecDouble and vecHalve.

    That is because the focus has been switched to the host side, from which you cannot dereference device-side pointers. Illustration below:

    CUDA thread hit Breakpoint 1.2, breakpointKernel<<<(1,1,1),(1,1,1)>>> () at test.cu:55
    55          printf("Breakpoint Kernel\n");
    (cuda-gdb) x/3i _Z6vecAddPdS_S_i
       0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>:   MOV R1, c[0x0][0x28]
       0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>:        MOV R2, RZ
       0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>:        LDC.64 R2, c[0x0][R2+0x160]
    (cuda-gdb) x/3i 0x7fffd9c5ed00
       0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>:   MOV R1, c[0x0][0x28]
       0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>:        MOV R2, RZ
       0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>:        LDC.64 R2, c[0x0][R2+0x160]
    

    Now, let's switch to the host:

    (cuda-gdb) thread 1
    [Switching to thread 1 (Thread 0x7ffff7e91000 (LWP 2002484))]
    #0  breakpointFunction () at test.cu:59
    59          printf("Breakpoint Function\n");
    (cuda-gdb) x/3i 0x7fffd9c5ed00
       0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>:   add    %al,(%rax)
       0x7fffd9c5ed02 <_Z6vecAddPdS_S_i+2>: add    %al,(%rax)
       0x7fffd9c5ed04 <_Z6vecAddPdS_S_i+4>: add    %al,(%rax)
    

    If we switch the focus back to the device, we can dereference them again:

    (cuda-gdb) cuda thread (0)
    [Switching focus to CUDA kernel 0, grid 7, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
    55          printf("Breakpoint Kernel\n");
    (cuda-gdb) x/3i 0x7fffd9c5ed00
       0x7fffd9c5ed00 <_Z6vecAddPdS_S_i>:   MOV R1, c[0x0][0x28]
       0x7fffd9c5ed10 <_Z6vecAddPdS_S_i+16>:        MOV R2, RZ
       0x7fffd9c5ed20 <_Z6vecAddPdS_S_i+32>:        LDC.64 R2, c[0x0][R2+0x160]
    

    In x86_64, add %al,(%rax) is encoded as the two zero bytes 0x00 0x00, so you are just dumping a bunch of zeroes. This is because the CUDA driver maps an empty page on the host. You can verify this in the debugger:

    (cuda-gdb) thread 1
    [Switching to thread 1 (Thread 0x7ffff7e91000 (LWP 2002484))]
    #0  breakpointFunction () at test.cu:59
    59          printf("Breakpoint Function\n");
    (cuda-gdb) info proc mappings
    process 2002484
    Mapped address spaces:
    
              Start Addr           End Addr       Size     Offset  Perms  objfile
    ...
          0x7fffd9577000     0x7fffda000000   0xa89000        0x0  ---p
    ...
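
    Note that the device code address from this session, 0x7fffd9c5ed00, falls inside the 0x7fffd9577000-0x7fffda000000 mapping listed above; its ---p permissions show that the host process itself has no read, write, or execute access to that range.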