I've boiled a long-running function down to a "simple" series of matrix-vector multiplications. The matrix does not change, but there are a lot of vectors. I have put together a test program with the current state of the algorithm.
I've chased a few options for performance, but what is below is the best I have, and it seems to work pretty well.
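(Spelled out: for each right-hand side jj and each used column yCol of Y, the inner loop adds

sum over zz of P(zz,2*jj-1)*Y(zz,yCol) into t1(tRow,jj)
sum over zz of P(zz,2*jj  )*Y(zz,yCol) into t2(tRow,jj)

so the whole job is really two big matrix products against the odd and even columns of P.)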
module maths
contains
subroutine lots_of_MVM(Y,P,t1,t2,startRow)
implicit none
! args
complex, intent(in), contiguous :: Y(:,:),P(:,:)
complex, intent(inout), contiguous :: t1(:,:),t2(:,:)
integer, intent(in) :: startRow
! locals
integer :: jj,zz,nrhs,n,pCol,tRow,yCol
! indexing
nrhs = size(P,2)/2
n = size(Y,1)
! Do lots of maths
!$OMP PARALLEL PRIVATE(jj,pCol,tRow,yCol,zz)
!$OMP DO
do jj=1,nrhs
pCol = jj*2-1
tRow = startRow
do yCol=1,size(Y,2)
! This is faster than doing sum(P(:,pCol)*Y(:,yCol))
do zz=1,n
t1(tRow,jj) = t1(tRow,jj) + P(zz,pCol )*Y(zz,yCol)
t2(tRow,jj) = t2(tRow,jj) + P(zz,pCol+1)*Y(zz,yCol)
end do
tRow = tRow + 1
end do
end do
!$OMP END DO
!$OMP END PARALLEL
end subroutine
end module
program test
use maths
use omp_lib
implicit none
! variables
complex, allocatable,dimension(:,:) :: Y,P,t1,t2
integer :: n,nrhs,nY,yStart,yStop,mult
double precision :: startTime
! setup (change mult to make problem larger)
! real problem size mult = 1000 to 2000
mult = 300
n = 10*mult
nY = 30*mult
nrhs = 20*mult
yStart = 5
yStop = yStart + nrhs - 1
! allocate
allocate(Y(n,nY),P(n,nrhs*2))
allocate(t1(nrhs,nrhs),t2(nrhs,nrhs))
! make some data
call random_number(Y%re)
call random_number(Y%im)
call random_number(P%re)
call random_number(P%im)
t1 = 0
t2 = 0
! do maths
startTime = omp_get_wtime()
call lots_of_MVM(Y(:,yStart:yStop),P,t1,t2,1)
write(*,*) omp_get_wtime()-startTime
end program
In addition to performance I would like better accuracy; more accuracy for similar performance would be acceptable. I tried a few things for this, but they ended up significantly slower.
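One obvious candidate is to accumulate each dot product in double precision and round only at the final store. A sketch of that idea (the name lots_of_MVM_dp and the locals dp, acc1, acc2 are illustrative, not part of the code above):

subroutine lots_of_MVM_dp(Y,P,t1,t2,startRow)
implicit none
! double-precision kind for the accumulators
integer, parameter :: dp = kind(1.0d0)
! args
complex, intent(in), contiguous :: Y(:,:),P(:,:)
complex, intent(inout), contiguous :: t1(:,:),t2(:,:)
integer, intent(in) :: startRow
! locals
integer :: jj,zz,nrhs,n,pCol,tRow,yCol
complex(dp) :: acc1,acc2
nrhs = size(P,2)/2
n = size(Y,1)
!$OMP PARALLEL DO PRIVATE(pCol,tRow,yCol,zz,acc1,acc2)
do jj=1,nrhs
  pCol = jj*2-1
  tRow = startRow
  do yCol=1,size(Y,2)
    acc1 = (0.0_dp,0.0_dp)
    acc2 = (0.0_dp,0.0_dp)
    do zz=1,n
      ! promoting one factor makes the product and the sum double precision
      acc1 = acc1 + cmplx(P(zz,pCol  ),kind=dp)*Y(zz,yCol)
      acc2 = acc2 + cmplx(P(zz,pCol+1),kind=dp)*Y(zz,yCol)
    end do
    ! round back to single once per entry
    t1(tRow,jj) = t1(tRow,jj) + cmplx(acc1)
    t2(tRow,jj) = t2(tRow,jj) + cmplx(acc2)
    tRow = tRow + 1
  end do
end do
!$OMP END PARALLEL DO
end subroutine lots_of_MVM_dp

This keeps the rounding error per entry at roughly single-precision epsilon instead of letting it grow with n, but the per-element conversions cost enough speed to be consistent with the slowdown mentioned above.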
Rewriting it using cgemm/zgemm increases the speed by a factor of about 4-10 using either ifort or gfortran with OpenBLAS. Here's the code I knocked together:
Module Precision_module
Use, Intrinsic :: iso_fortran_env, Only : wp => real64
! Use, Intrinsic :: iso_fortran_env, Only : wp => real32
Implicit None
Private
Public :: wp
End Module Precision_module
Module maths
! Use, Intrinsic :: iso_fortran_env, Only : wp => real64
Use Precision_module, Only : wp
Contains
Subroutine lots_of_MVM(Y,P,t1,t2,startRow)
Implicit None
! args
Complex( wp ), Intent(in), Contiguous :: Y(:,:),P(:,:)
Complex( wp ), Intent(inout), Contiguous :: t1(:,:),t2(:,:)
Integer, Intent(in) :: startRow
! locals
Integer :: jj,zz,nrhs,n,pCol,tRow,yCol
! indexing
nrhs = Size(P,2)/2
n = Size(Y,1)
! Do lots of maths
!$OMP PARALLEL PRIVATE(jj,pCol,tRow,yCol,zz)
!$OMP DO
Do jj=1,nrhs
pCol = jj*2-1
tRow = startRow
Do yCol=1,Size(Y,2)
! This is faster than doing sum(P(:,pCol)*Y(:,yCol))
Do zz=1,n
t1(tRow,jj) = t1(tRow,jj) + P(zz,pCol )*Y(zz,yCol)
t2(tRow,jj) = t2(tRow,jj) + P(zz,pCol+1)*Y(zz,yCol)
End Do
tRow = tRow + 1
End Do
End Do
!$OMP END DO
!$OMP END PARALLEL
End Subroutine lots_of_MVM
End Module maths
Program test
Use, Intrinsic :: iso_fortran_env, Only : real32
Use Precision_module, Only : wp
Use maths
Use omp_lib, Only : omp_get_wtime, omp_get_max_threads
Implicit None
! variables
Complex( wp ), Allocatable,Dimension(:,:) :: Y,P,t1,t2
Integer :: n,nrhs,nY,yStart,yStop,mult
Real( wp ) :: startTime
Complex( wp ), Allocatable, Dimension( :, : ) :: t3, t4
Real( wp ) :: mem_reqd, mem_reqd_Gelements
Real( wp ) :: tloop, tblas
! setup (change mult to make problem larger)
! real problem size mult = 1000 to 2000
mult = 300
!mult = 50 ! for debug
n = 10*mult
nY = 30*mult
nrhs = 20*mult
yStart = 5
yStop = yStart + nrhs - 1
! allocate
Allocate(Y(n,nY),P(n,nrhs*2))
Allocate(t1(nrhs,nrhs),t2(nrhs,nrhs))
mem_reqd = Size( Y ) + Size( P ) + Size( t1 ) + Size( t2 )
mem_reqd_Gelements = mem_reqd / ( 1024.0_wp * 1024.0_wp * 1024.0_wp )
Write( *, * ) 'Mem reqd: ', mem_reqd_Gelements, ' Gelements'
! make some data
Call Random_number(Y%re)
Call Random_number(Y%im)
Call Random_number(P%re)
Call Random_number(P%im)
t1 = 0
t2 = 0
! do maths
Write( *, * ) 'Using ', omp_get_max_threads(), ' threads'
Write( *, * ) 'Using ', Merge( 'single', 'double', Kind( y ) == real32 ), ' precision'
startTime = Real( omp_get_wtime(), wp )
Call lots_of_MVM(Y(:,yStart:yStop),P,t1,t2,1)
tloop = Real( omp_get_wtime(), wp ) - startTime
Write(*,*) 'TLoop: ', tloop
Allocate( t3, mold = t1 )
Allocate( t4, mold = t2 )
t3 = 0.0_wp
t4 = 0.0_wp
startTime = Real( omp_get_wtime(), wp )
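! P holds the two sets of vectors in interleaved columns. Passing ldb = 2*n
! makes gemm stride over every second column of P, so the first call sees
! the odd columns (t1's data) and the second the even columns (t2's data).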
Call zgemm( 'T', 'N', nrhs, nrhs, n, ( 1.0_wp, 0.0_wp ), Y ( 1, ystart ), n , &
P ( 1, 1 ), 2 * n, &
( 1.0_wp, 0.0_wp ), t3 , nrhs )
Call zgemm( 'T', 'N', nrhs, nrhs, n, ( 1.0_wp, 0.0_wp ), Y ( 1, ystart ), n , &
P ( 1, 2 ), 2 * n, &
( 1.0_wp, 0.0_wp ), t4 , nrhs )
tblas = Real( omp_get_wtime(), wp ) - startTime
Write(*,*) 'TBlas: ', tblas
Write( *, * ) 'Time ratio ', tloop / tblas, ' ( big means blas better )'
Write( *, * ) "Max diff in t1 ", Maxval( Abs( t3 - t1 ) )
Write( *, * ) "Max diff in t2 ", Maxval( Abs( t4 - t2 ) )
End Program test
Note I have used double precision throughout, as I know what to expect in terms of errors here. Results for ifort on 2 threads:
ijb@ijb-Latitude-5410:~/work/stack$ ifort -O3 -qopenmp mm.f90 -lopenblas
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Mem reqd: 0.125728547573090 Gelements
Using 2 threads
Using double precision
TLoop: 71.4670290946960
TBlas: 17.8680319786072
Time ratio 3.99971464010481 ( big means blas better )
Max diff in t1 1.296029998720414E-011
Max diff in t2 1.273302296896508E-011
Results for gfortran:
ijb@ijb-Latitude-5410:~/work/stack$ gfortran-12 -fopenmp -O3 -Wall -Wextra -pedantic -Werror -std=f2018 mm.f90 -lopenblas
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Mem reqd: 0.12572854757308960 Gelements
Using 2 threads
Using double precision
TLoop: 185.08875890000490
TBlas: 16.093782140000258
Time ratio 11.500637779852656 ( big means blas better )
Max diff in t1 1.2732928769591198E-011
Max diff in t2 1.3642443193628513E-011
Those differences are about what I would expect for double precision: each entry is a dot product of n = 3000 terms with sums of order 10^3, so accumulated rounding of order sqrt(n) times machine epsilon times that, i.e. around 10^-11, is right in line.
If the arguments to zgemm are confusing, take a look at BLAS LDB using DGEMM.
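For this particular case, the arguments of the first call map as follows (annotation only; this is the same call as in the code above, per the reference BLAS documentation):

Call zgemm( 'T', 'N',        & ! compute C = alpha * A**T * B + beta * C
            nrhs,            & ! M: rows of C = columns of Y actually used
            nrhs,            & ! N: columns of C = number of odd columns of P
            n,               & ! K: length of each dot product
            (1.0_wp,0.0_wp), & ! alpha
            Y(1,ystart), n,  & ! A, lda: start at column ystart, columns n apart
            P(1,1), 2*n,     & ! B, ldb: 2*n so gemm sees every second column
            (1.0_wp,0.0_wp), & ! beta: accumulate into the existing C
            t3, nrhs )         ! C, ldc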
Running in single precision (and changing the calls to cgemm) tells a comparable story, obviously with bigger differences, around 10^-3 to 10^-4, at least for gfortran. As I don't use single precision in my own work I have less of a feel for what to expect here, but this doesn't seem unreasonable:
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Mem reqd: 0.125728548 Gelements
Using 2 threads
Using single precision
TLoop: 147.786453
TBlas: 8.18331814
Time ratio 18.0594788 ( big means blas better )
Max diff in t1 7.32427742E-03
Max diff in t2 6.83614612E-03
As for what precision you want and what you consider accurate, well, you don't say, so I can't really address that, save to say that the simplest route is to move to double precision if you can take the memory hit: the speed-up from zgemm will easily outweigh any performance cost of the wider type. But for performance it's the same story as for any code: if you can rewrite it in terms of a matrix-matrix multiply, you will win.
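If you do want to flip between the two precisions easily, a generic interface keeps the choice of cgemm versus zgemm out of the calling code. A sketch (the module name gemm_module and the generic name gemm are mine; the interface bodies follow the reference BLAS signatures):

Module gemm_module
  ! Sketch: one generic name resolving to cgemm or zgemm by argument kind,
  ! so changing wp in Precision_module is the only edit needed.
  Use, Intrinsic :: iso_fortran_env, Only : real32, real64
  Implicit None
  Interface gemm
     Subroutine cgemm( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc )
       Import :: real32
       Character        , Intent( In    ) :: transa, transb
       Integer          , Intent( In    ) :: m, n, k, lda, ldb, ldc
       Complex( real32 ), Intent( In    ) :: alpha, beta
       Complex( real32 ), Intent( In    ) :: a( lda, * ), b( ldb, * )
       Complex( real32 ), Intent( InOut ) :: c( ldc, * )
     End Subroutine cgemm
     Subroutine zgemm( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc )
       Import :: real64
       Character        , Intent( In    ) :: transa, transb
       Integer          , Intent( In    ) :: m, n, k, lda, ldb, ldc
       Complex( real64 ), Intent( In    ) :: alpha, beta
       Complex( real64 ), Intent( In    ) :: a( lda, * ), b( ldb, * )
       Complex( real64 ), Intent( InOut ) :: c( ldc, * )
     End Subroutine zgemm
  End Interface gemm
End Module gemm_module

With that module used in the main program, the calls above become Call gemm( 'T', 'N', ... ) and compile unchanged whichever kind wp is bound to.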