I'm learning to use the ScaLAPACK least squares routine pdgels, but I cannot get it to work. I would appreciate your insight.
The code below is supposed to distribute the 8x8 matrix A into four local 4x4 matrices A_loc, and similarly for the vector B. The BLACS grid is 2x2, and the code is intended to be run with 4 MPI tasks.
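To show what I expect the layout to be, here is a small standalone sketch (separate from the program below) that prints which process in the 2x2 grid should own each element of the 8x8 global matrix under a block-cyclic distribution with 4x4 blocks. It only uses the standard owner formula (block index modulo grid dimension) with zero-based indices and the first block assigned to process (0,0):
#include <cstdio>

// Standalone illustration, not part of the program in question:
// 8x8 global matrix, 4x4 blocks, 2x2 process grid, first block on (0,0).
// Prints the (process row, process column) that owns each global element.
int main() {
    const int m = 8, n = 8;         // global matrix dimensions
    const int mb = 4, nb = 4;       // block dimensions
    const int nprow = 2, npcol = 2; // process grid dimensions
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            int prow = (i / mb) % nprow; // process row owning global row i
            int pcol = (j / nb) % npcol; // process column owning global column j
            std::printf("(%d,%d) ", prow, pcol);
        }
        std::printf("\n");
    }
    return 0;
}
With this setup the top-left 4x4 block should land on process (0,0), the top-right on (0,1), the bottom-left on (1,0), and the bottom-right on (1,1).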
The code compiles, and with one MPI task it runs and produces the following output:
BEFORE: Local B size 8 on rank: 0 values: 1 2 3 4 5 6 7 8
AFTER: Local B size 8 on rank: 0 values: 0.960918 0.0252256 0.0222086 0.0319953 -0.0471391 0.0205009 -0.0163878 0.00267855
This looks about right: B_loc has been updated with the least squares solution.
However, when four MPI tasks are used, I get the following result:
BEFORE: Local B size 4 on rank: 0 values: 1 2 3 4
BEFORE: Local B size 4 on rank: 1 values: 4.68603e-310 0 2.122e-314 2.122e-314
BEFORE: Local B size 4 on rank: 2 values: 5 6 7 8
BEFORE: Local B size 4 on rank: 3 values: 4.67566e-310 0 2.122e-314 2.122e-314
AFTER: Local B size 4 on rank: 1 values: 4.68603e-310 0 2.122e-314 2.122e-314
AFTER: Local B size 4 on rank: 2 values: 5 6 7 8
AFTER: Local B size 4 on rank: 3 values: 4.67566e-310 0 2.122e-314 2.122e-314
AFTER: Local B size 4 on rank: 0 values: 0.321957 1.12458 -0.21503 -0.231507
What surprises me is that only the values of B_loc on rank 0 have been updated, presumably with part of the least squares solution. What am I doing wrong?
Here is the code:
#include <stdio.h>
#include <mpi.h>
#include <math.h>
#include <iostream> // needed for std::cout / std::endl
extern "C" void blacs_get_(int*, int*, int*);
extern "C" void blacs_pinfo_(int*, int*);
extern "C" void blacs_gridinit_(int*, char*, int*, int*);
extern "C" void blacs_gridinfo_(int*, int*, int*, int*, int*);
extern "C" void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*);
extern "C" void blacs_gridexit_(int*);
extern "C" int numroc_(int*, int*, int*, int*, int*);
extern "C" void pdgels_(char* trans, int* m, int* n, int* nrhs, double* a, int* ia, int* ja, int* desca, double* b, int* ib, int* jb, int* descb, double* work, int* lwork, int* info);
extern "C" void pdgemr2d_(int *m, int *n, double *a, int *ia, int *ja, int *desca,
double *b, int *ib, int *jb, int *descb, int *context);
int main() {
MPI_Init(NULL, NULL);
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int b_nrows, b_ncols, b_row, b_col, myid;
blacs_pinfo_( &myid, &size );
char layout = 'R';
b_nrows = sqrt(size);
b_ncols = sqrt(size);
int context;
int izero=0;
int ione=1;
blacs_get_( &izero, &izero, &context );
blacs_gridinit_( &context, &layout, &b_nrows, &b_ncols );
blacs_gridinfo_( &context, &b_nrows, &b_ncols, &b_row, &b_col );
int m=8; // rows of the global array
int n=8; // cols
int mb=m;
int nb=n;
int mb_loc=m/b_nrows;
int nb_loc=n/b_ncols;
// size of local arrays
int m_loc = numroc_( &m, &mb_loc, &b_row, &izero, &b_nrows );
int n_loc = numroc_( &n, &nb_loc, &b_col, &izero, &b_ncols );
// global
double *A = new double[m*n] ;
double *B = new double[m] ;
// local
double *A_loc = new double[m_loc*n_loc] ;
double *B_loc = new double[m_loc] ;
int descA[9], descB[9], descA_loc[9], descB_loc[9];
int info;
descinit_( descA, &m, &n, &mb, &nb, &izero, &izero, &context, &m, &info);
descinit_( descB, &m, &ione, &mb, &ione, &izero, &izero, &context, &m, &info);
descinit_( descA_loc, &m, &n, &mb_loc, &nb_loc, &izero, &izero, &context, &m_loc, &info);
descinit_( descB_loc, &m, &ione, &mb_loc, &ione, &izero, &izero, &context, &m_loc, &info);
// generate column major order matrix
for (int i = 0; i < m*n; i++) {
A[i] = i+1;
}
// and the vector of targets
for (int i = 0; i < m; i++) {
B[i] = (i+1);
}
// distribute matrices
pdgemr2d_(&m, &n, A, &ione, &ione, descA,
A_loc, &ione, &ione, descA_loc, &context);
pdgemr2d_(&m, &ione, B, &ione, &ione, descB,
B_loc, &ione, &ione, descB_loc, &context);
std::cout << "BEFORE: Local B size " << m_loc << " on rank: " << rank << " values: ";
for (int r = 0; r < m_loc; r++) {
std::cout << B_loc[r] << " ";
}
std::cout << std::endl;
char trans='N';
int nrhs = 1;
int ia=1;
int ja=1;
int ib=1;
int jb=1;
int lwork=-1; // workspace query: optimal lwork is returned in wkopt
double wkopt;
pdgels_(&trans, &m_loc, &n_loc, &nrhs, A_loc, &ia, &ja, descA_loc,
B_loc, &ib, &jb, descB_loc, &wkopt, &lwork, &info);
lwork = (int)wkopt;
double *work= new double[lwork];
pdgels_(&trans, &m_loc, &n_loc, &nrhs, A_loc, &ia, &ja, descA_loc,
B_loc, &ib, &jb, descB_loc, work, &lwork, &info);
std::cout << "AFTER: Local B size " << m_loc << " on rank: " << rank << " values: ";
for (int r = 0; r < m_loc; r++) {
std::cout << B_loc[r] << " ";
}
std::cout << std::endl;
delete [] work; delete [] A; delete [] B; delete [] A_loc; delete [] B_loc;
blacs_gridexit_(&context);
MPI_Finalize();
return 0;
}
After a little bit of digging (in fact, a lot) I managed to identify the issue.
The documentation for pdgels states that the 2nd and 3rd arguments are the dimensions of the GLOBAL matrix A:
https://netlib.org/scalapack/explore-html/dc/d96/pdgels_8f_source.html
However, I had been using the IBM documentation under the assumption that it is pretty much the same. It is not: IBM's pdgels takes the dimensions of the local matrix, not the global one.
Therefore the simple solution is to change the following code (the second pdgels call needs the same change, as shown below)
pdgels_(&trans, &m_loc, &n_loc, &nrhs, A_loc, &ia, &ja, descA_loc,
B_loc, &ib, &jb, descB_loc, &wkopt, &lwork, &info);
to
pdgels_(&trans, &m, &n, &nrhs, A_loc, &ia, &ja, descA_loc,
B_loc, &ib, &jb, descB_loc, &wkopt, &lwork, &info);
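For completeness, here are both pdgels calls (the workspace query and the actual solve) with the global dimensions, reusing the variables from the code above:
// Workspace query: pass the GLOBAL dimensions m and n, not m_loc/n_loc
pdgels_(&trans, &m, &n, &nrhs, A_loc, &ia, &ja, descA_loc,
        B_loc, &ib, &jb, descB_loc, &wkopt, &lwork, &info);
lwork = (int)wkopt;
double *work = new double[lwork];
// Actual solve: again with the GLOBAL dimensions
pdgels_(&trans, &m, &n, &nrhs, A_loc, &ia, &ja, descA_loc,
        B_loc, &ib, &jb, descB_loc, work, &lwork, &info);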
Anyway, I hope someone finds this useful.