Tags: optimization, fortran, openacc

Unexpected Data Transfer in Fortran OpenACC: Debugging transfer$r Copies


I have the following Fortran OpenACC code for batched DGEMM, where I was trying to make only a single up-front allocation for the workspace:

! nvfortran -acc -Minfo=all  -cuda -gpu=lineinfo  -cpp  -g -O0 -cudalib=cublas main.f90
module bcast_types
  use iso_c_binding
  use cudafor
  use cublas
  implicit none

  type DGEMM_BATCHED_STATE
    type(cublasHandle)                        :: handle
    integer                                   :: batch_count
    type(c_devptr), allocatable, dimension(:) :: devptr_A, devptr_B, devptr_C
  end type DGEMM_BATCHED_STATE
end module

module bcast_module
  use iso_c_binding, ONLY: c_devptr
  use cudafor
  use cublas
  use bcast_types
  IMPLICIT NONE

  interface broadcast_array
    MODULE PROCEDURE broadcast_array_2D
    MODULE PROCEDURE broadcast_array_3D
  end interface

  contains
    subroutine broadcast_array_2D(state, array, devptr)
      TYPE(c_devptr), dimension(:), intent(in) :: devptr
      type(DGEMM_BATCHED_STATE), intent(in) :: state
      REAL(8), DIMENSION(:,:), INTENT(IN) :: array
      INTEGER :: i, batch_size
      !$acc declare deviceptr(array,devptr)
      #if defined(__CUDA)
      attributes(DEVICE) :: array, devptr
      #endif
      batch_size = state%batch_count
      !$acc parallel loop default(present) vector_length(1) num_gangs(1)
      DO i = 1, batch_size
        devptr(i)  = transfer(loc(array(1, 1)), devptr(i))
      END DO
      !$acc end parallel loop
    end subroutine broadcast_array_2D

    subroutine broadcast_array_3D(state, array, devptr)
        TYPE(c_devptr), dimension(:), intent(in):: devptr
        type(DGEMM_BATCHED_STATE), intent(in) :: state
        REAL(8), DIMENSION(:,:,:), INTENT(IN) :: array
        INTEGER :: i
        !$acc declare deviceptr(array, devptr)
        #if defined(__CUDA)
        attributes(DEVICE) :: array, devptr
        #endif

        !$acc parallel loop default(present) vector_length(1) num_gangs(1)
        do i = 1, state%batch_count
          devptr(i) = transfer(loc(array(1, 1, i)), devptr(i))
        end do
        !$acc end parallel loop
    end subroutine broadcast_array_3D
end module bcast_module

module bcast_mul
  use cudafor
  use cublas
  use openacc
  use bcast_module
  use bcast_types
  implicit none

  INTERFACE DGEMM_BATCHED_BROADCAST
    module PROCEDURE bcast_dgemm_2D_3D
    module PROCEDURE bcast_dgemm_3D_2D
    module PROCEDURE bcast_dgemm_2D_2D
    module PROCEDURE bcast_dgemm_3D_3D
  END INTERFACE

  contains
  subroutine init_dgemm_batched_broadcast(state, batch_count)
    implicit none
    type(DGEMM_BATCHED_STATE), intent(inout) :: state
    integer :: stat, batch_count, test_bcnt, i
    state%batch_count = batch_count
    
    stat = cublasCreate(state%handle)
    if (stat /= CUBLAS_STATUS_SUCCESS) then
      print *, "CUBLAS initialization error: ", stat
      stop
    end if
    !$acc enter data create(state            )
    !$acc enter data create(state%batch_count)

    allocate(state%devptr_A(batch_count))
    allocate(state%devptr_B(batch_count))
    allocate(state%devptr_C(batch_count))

    !$acc enter data create(state%devptr_A)
    !$acc enter data create(state%devptr_B)
    !$acc enter data create(state%devptr_C)
    !$acc update device(state%batch_count)
  end subroutine init_dgemm_batched_broadcast

  subroutine destroy_dgemm_batched_state(state)
    use cudafor
    use cublas
    use openacc
    implicit none
    type(DGEMM_BATCHED_STATE), intent(in) :: state
    integer:: stat

    stat = cublasDestroy(state%handle)
    if (stat /= CUBLAS_STATUS_SUCCESS) then
      print *, "CUBLAS shutdown error: ", stat
    end if
    !$acc exit data delete(state%devptr_A)
    !$acc exit data delete(state%devptr_B)
    !$acc exit data delete(state%devptr_C)
    !$acc exit data delete(state%batch_count)
    !$acc exit data delete(state)
    deallocate(state%devptr_A)
    deallocate(state%devptr_B)
    deallocate(state%devptr_C)
  end subroutine

  subroutine bcast_dgemm_3D_2D(state,transA, transB, m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC, batch_count)
    use cudafor
    use cublas
    use iso_c_binding
    implicit none
    
    integer, intent(in) :: ldA,ldB, ldC, m,n,k, batch_count
    real(8), dimension(:,:,:) :: A
    real(8), dimension(:,:) :: B 
    real(8), dimension(:,:,:) :: C
    !$acc declare deviceptr(A,B,C)
    type(c_devptr), dimension(batch_count), device :: devptr_A, devptr_B, devptr_C


    real(8), intent(in) :: alpha, beta
    character(len=1), intent(in) :: transA, transB
    #if defined(__CUDA)
    attributes(DEVICE) :: A,B,C
    #endif

    integer :: stat, i
    integer :: cublasTransA, cublasTransB
    type(DGEMM_BATCHED_STATE), intent(in) :: state


    if (transA == 'N') then
      cublasTransA = CUBLAS_OP_N
    else if (transA == 'T') then
      cublasTransA = CUBLAS_OP_T
    else
      print *, "Invalid transA value"
      stop
    end if
  
    if (transB == 'N') then
      cublasTransB = CUBLAS_OP_N
    else if (transB == 'T') then
      cublasTransB = CUBLAS_OP_T
    else
      print *, "Invalid transB value"
      stop
    end if

    !$acc host_data use_device(state%devptr_A, state%devptr_B, state%devptr_C)
    CALL broadcast_array(state, A, state%devptr_A)
    CALL broadcast_array(state, B, state%devptr_B)
    CALL broadcast_array(state, C, state%devptr_C)

    devptr_A = state%devptr_A
    devptr_B = state%devptr_B
    devptr_C = state%devptr_C
    !$acc end host_data

    !$acc host_data use_device(state%devptr_A, state%devptr_B, state%devptr_C)
    stat = cublasDgemmBatched(        &
          state%handle,               &
          cublasTransA, cublasTransB, &
          m, n, k,                    &
          alpha,                      &
          devptr_A, ldA,              &
          devptr_B, ldB,              &
          beta,                       &
          devptr_C, ldC,              &
          batch_count)
    !$acc end host_data

    if (stat /= CUBLAS_STATUS_SUCCESS) then
      print *, "CUBLAS error: ", stat
    end if
  
  end subroutine bcast_dgemm_3D_2D

end module bcast_mul

program main
  use cudafor
  use cublas
  use iso_c_binding
  use bcast_mul
  implicit none
    integer, parameter :: mdim = 8, batch_count = 1024, iter_count = 10
    real(8), dimension(mdim,mdim,batch_count) :: A, C
    real(8), dimension(mdim,mdim) :: B
    type(c_devptr), dimension(batch_count) :: devptr_A, devptr_B, devptr_C
    real, dimension(iter_count) :: times
    real, dimension(iter_count) :: times_2

    type(DGEMM_BATCHED_STATE) :: state
  
    integer :: stat, i, j, k, iter
    real(8) :: alpha, beta, idx, mysum
    type(cublasHandle) :: handle
    character(len=128) :: argv
    real :: clock_start, clock_end
    integer, parameter :: dp = kind(0.d0)

    write (*,'(A,I15)'),    'Matrix dim:         ', mdim
    write (*,'(A,I15)'),    'Batch count:        ', batch_count
    
    call init_dgemm_batched_broadcast(state,batch_count)

    do iter=1,iter_count
  
      ! Fill A,B diagonals with sin(i) data, C diagonal with cos(i)^2
      ! Matrices are arranged column major
      do k=1,batch_count
        do j=1,mdim
          do i=1,mdim
            if (i==j) then
              idx = real(j*mdim + i)
              A(i,j,k) = k*sin(idx)
              B(i,j)   = sin(idx)
              C(i,j,k) = k*cos(idx) * cos(idx)
            else
              A(i,j,k) = 0.0
              B(i,j)   = 0.0
              C(i,j,k) = 0.0
            endif
          enddo ! i
        enddo ! j
      enddo ! k
  
    alpha = 1.0
    beta = 1.0
    !$acc data copy (A,B,C)

    call cpu_time(clock_start)

    !$acc host_data use_device(A,B,C) 
    call dgemm_batched_broadcast(state,'N','N', mdim, mdim, mdim, alpha, A, mdim, B, mdim, beta, C, mdim, batch_count)
    !$acc end host_data 
    !$acc end data

    stat = cudaDeviceSynchronize()
    call cpu_time(clock_end)
    times(iter) = clock_end - clock_start
  
      ! Simple sanity test, mysum up all elements
      mysum = 0.0
      do k=1,batch_count
        do j=1,mdim
          do i=1,mdim
            mysum = mysum + C(i,j,k)
          enddo
        enddo
      enddo
      print *, ''
      write (*,'(A,I15)'),    'Iter:               ', iter
      write (*,'(A,F15.3)'),  'Sum is:             ', mysum
      write (*,'(A,F15.3)'),  'Should be:          ', float(mdim*(batch_count)*(batch_count+1)/2)
  
    enddo
  
    print *, ''
    write (*,'(A,ES15.3)'), 'Expect FLOP count:      ', real(batch_count * (2*mdim**3 + 3*mdim**2))
    write (*,'(A,ES15.3,ES11.3,ES11.3)'), 'Avg/min/max time   (s): ', &
        SUM(times)/SIZE(times), MINVAL(times), MAXVAL(times)
  
    stat = cublasDestroy(handle)
    call destroy_dgemm_batched_state(state)
  end program

When I run the code with export NV_ACC_NOTIFY=2, I see the following trace:

upload CUDA data  file=main.f90 function=main line=257 device=0 threadid=1 variable=descriptor bytes=224
upload CUDA data  file=main.f90 function=main line=257 device=0 threadid=1 variable=a(:,:,:) bytes=524288
upload CUDA data  file=main.f90 function=main line=257 device=0 threadid=1 variable=descriptor bytes=176
upload CUDA data  file=main.f90 function=main line=257 device=0 threadid=1 variable=b(:,:) bytes=512
upload CUDA data  file=main.f90 function=main line=257 device=0 threadid=1 variable=descriptor bytes=224
upload CUDA data  file=main.f90 function=main line=257 device=0 threadid=1 variable=c(:,:,:) bytes=524288
upload CUDA data  file=main.f90 function=broadcast_array_3d line=55 device=0 threadid=1 variable=transfer$r bytes=8
download CUDA data  file=main.f90 function=broadcast_array_3d line=59 device=0 threadid=1 variable=transfer$r bytes=8
upload CUDA data  file=main.f90 function=broadcast_array_2d line=38 device=0 threadid=1 variable=transfer$r bytes=8
download CUDA data  file=main.f90 function=broadcast_array_2d line=42 device=0 threadid=1 variable=transfer$r bytes=8
upload CUDA data  file=main.f90 function=broadcast_array_3d line=55 device=0 threadid=1 variable=transfer$r bytes=8
download CUDA data  file=main.f90 function=broadcast_array_3d line=59 device=0 threadid=1 variable=transfer$r bytes=8
download CUDA data  file=main.f90 function=main line=264 device=0 threadid=1 variable=c(:,:,:) bytes=524288
download CUDA data  file=main.f90 function=main line=264 device=0 threadid=1 variable=b(:,:) bytes=512
download CUDA data  file=main.f90 function=main line=264 device=0 threadid=1 variable=a(:,:,:) bytes=524288

I am confused by the copies of transfer$r, since I cannot pinpoint where they are coming from. One other peculiar behavior: when I remove vector_length(1) num_gangs(1), the code no longer works. I am not sure why; I do not really care about this part, but if someone could provide some insight, that would be nice.

P.S. There are other overloads for the bcast_dgemm_* routines, but I removed them for brevity since they are not used in this snippet.

I tried setting everything to be explicitly present with present(state, state%batch_count, devptr, array), but of course that did not help. I also tried using acc_attach and acc_detach in the init_dgemm_batched_broadcast and destroy_dgemm_batched_state routines, but that did not help either. Furthermore, the copies do not show up in the compiler output:

NVFORTRAN-W-0194-INTENT(IN) argument cannot be defined - devptr (main.f90: 28)
  0 inform,   1 warnings,   0 severes, 0 fatal for broadcast_array_2d
NVFORTRAN-W-0194-INTENT(IN) argument cannot be defined - devptr (main.f90: 45)
  0 inform,   1 warnings,   0 severes, 0 fatal for broadcast_array_3d
NVFORTRAN-W-0155-MODULE PROCEDURE not defined: bcast_dgemm_2d_3d (main.f90: 197)
NVFORTRAN-W-0155-MODULE PROCEDURE not defined: bcast_dgemm_2d_2d (main.f90: 197)
NVFORTRAN-W-0155-MODULE PROCEDURE not defined: bcast_dgemm_3d_3d (main.f90: 197)
  0 inform,   3 warnings,   0 severes, 0 fatal for init_dgemm_batched_broadcast
broadcast_array_2d:
     38, Generating present(state%batch_count,state)
         Generating implicit firstprivate(batch_size)
         Generating NVIDIA GPU code
         39, !$acc loop seq
broadcast_array_3d:
     55, Generating NVIDIA GPU code
         56, !$acc loop seq
     55, Generating default present(state)
init_dgemm_batched_broadcast:
     90, Generating enter data create(state)
     91, Generating enter data create(state%batch_count)
     97, Generating enter data create(state%devptr_a(:))
     98, Generating enter data create(state%devptr_b(:))
     99, Generating enter data create(state%devptr_c(:))
    100, Generating update device(state%batch_count)
destroy_dgemm_batched_state:
    115, Generating exit data delete(state%devptr_a(:))
    116, Generating exit data delete(state%devptr_b(:))
    117, Generating exit data delete(state%devptr_c(:))
    118, Generating exit data delete(state%batch_count)
    119, Generating exit data delete(state)
main:
    249, Generating copy(a(:,:,:),c(:,:,:),b(:,:)) [if not already present]
    280, sum reduction inlined
         minval reduction inlined
         maxval reduction inlined

I am using nvfortran version:

nvfortran 23.11-0 64-bit target on x86-64 Linux -tp znver3
NVIDIA Compilers and Tools
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

Solution

  • "transfer" needs to store the result in a temp before it can be assigned and "transfer$r" is the base pointer to this temp result. Ideally the compiler could make this implicitly private, but can't here since it's a pointer being passed to a device subroutine. Hence it gets passed in as a shared variable thus causing the data transfers and prevents parallelism.

    I've not seen anyone use "transfer" this way. Typically, folks copy the device pointers as illustrated in Greg Ruetsch's post on calling batched cuBLAS routines from CUDA Fortran: https://developer.nvidia.com/blog/cuda-pro-tip-how-call-batched-cublas-routines-cuda-fortran/

    Greg's post is a bit old and you no longer need to define your own interfaces to the cuBLAS routines, but you might find the core algorithm helpful.
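
    For reference, here is a minimal, untested sketch of that approach with hypothetical names (it assumes CUDA Fortran built with -cuda -cudalib=cublas). The per-matrix device pointers are assembled on the host with c_devloc and copied to the device in one assignment, so no device-side transfer loop, and hence no transfer$r temporary, is needed:

    program batched_ptr_sketch
      use cudafor
      use cublas
      implicit none
      integer, parameter :: mdim = 8, batch_count = 1024
      real(8), device, allocatable :: A_d(:,:,:), B_d(:,:), C_d(:,:,:)
      type(c_devptr)         :: h_ptrA(batch_count), h_ptrB(batch_count), h_ptrC(batch_count)
      type(c_devptr), device :: d_ptrA(batch_count), d_ptrB(batch_count), d_ptrC(batch_count)
      type(cublasHandle) :: handle
      integer :: i, stat

      allocate(A_d(mdim,mdim,batch_count), B_d(mdim,mdim), C_d(mdim,mdim,batch_count))
      stat = cublasCreate(handle)

      ! Build the per-matrix pointer arrays on the HOST; c_devloc returns the
      ! device address of each slice. B is "broadcast" by repeating its pointer.
      do i = 1, batch_count
        h_ptrA(i) = c_devloc(A_d(1,1,i))
        h_ptrB(i) = c_devloc(B_d(1,1))
        h_ptrC(i) = c_devloc(C_d(1,1,i))
      end do

      ! One host-to-device copy per pointer array replaces the device-side
      ! transfer() loop from the question.
      d_ptrA = h_ptrA
      d_ptrB = h_ptrB
      d_ptrC = h_ptrC

      stat = cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,  &
                                mdim, mdim, mdim, 1.0_8,           &
                                d_ptrA, mdim, d_ptrB, mdim, 0.0_8, &
                                d_ptrC, mdim, batch_count)
      if (stat /= CUBLAS_STATUS_SUCCESS) print *, "CUBLAS error: ", stat
      stat = cublasDestroy(handle)
    end program batched_ptr_sketch

    Note how B is broadcast simply by repeating its device pointer batch_count times, which is the same effect the broadcast_array_2D routine in the question is after.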

    On a side note, CUDA Fortran implicitly defines "_CUDA", not "__CUDA", so the "device" attribute is not actually being applied.
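
    That is, under nvfortran the guards in the question would only take effect if spelled like this (corrected macro name, same body as in the question):

    #if defined(_CUDA)
    attributes(DEVICE) :: array, devptr
    #endif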