I have a very small piece of MPI code in Fortran 90 that I found online to test my cluster with heterogeneous nodes. The code likes like this program MPI
include "mpif.h"
integer :: myrank,size,ierr
integer :: resultlen
character (len=8) :: name
call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD,myrank,ierr)
call MPI_Comm_size(MPI_COMM_WORLD,size,ierr)
call mpi_get_processor_name(name, resultlen, ierr)
write(*,*) "Processor ",myrank," of ",size,"on ",name,": Hello World!"
call MPI_Finalize(ierr)
end program
It was successfully compiled with the default mpif90 from the mpich package on CentOS 7.
> which mpif90
/usr/lib64/mpich/bin/mpif90
> mpif90 hello_mpi.f90 -o hello_mpi.exe
This code ran perfectly fine on one node.
> mpirun -host node1 -np 2 ./hello_mpi.exe
Processor 0 of 2 on node1: Hello World!
Processor 1 of 2 on node1: Hello World!
>
But on the other node, it crashed every time. These two nodes are very similar in hardware (both AMD Opteron processors, 64-core vs 48-core, and memory size are different), but identical on the software side (both CentOS7 with exactly the same packages). Here is what I got when running the code on node2
> mpirun -host node2 -np 2 ./hello_mpi.exe
Processor 0 of 2 on node2: Hello World!
Processor 1 of 2 on node2: Hello World!
*** stack smashing detected ***: ./hello_mpi.exe terminated
======= Backtrace: =========
*** stack smashing detected ***: ./hello_mpi.exe terminated
/lib64/libc.so.6(__fortify_fail+0x37)[0x7fc61aee3597]
/lib64/libc.so.6(__fortify_fail+0x0)[0x7fc61aee3560]
./hello_mpi.exe[0x400e66]
./hello_mpi.exe[0x400bff]
======= Backtrace: =========
/lib64/libc.so.6(__libc_start_main+0xf5)[0x7fc61adf6b15]
./hello_mpi.exe[0x400c31]
======= Memory map: ========
/lib64/libc.so.6(__fortify_fail+0x37)[0x7f394c8c3597]
00400000-00402000 r-xp 00000000 00:2e 493967 /home/user/tmp/hello_mpi.exe
00601000-00602000 r--p 00001000 00:2e 493967 /home/user/tmp/hello_mpi.exe
00602000-00603000 rw-p 00002000 00:2e 493967 /home/user/tmp/hello_mpi.exe
01e96000-01eb7000 rw-p 00000000 00:00 0 [heap]
7fc61a395000-7fc61a3a1000 r-xp 00000000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7fc61a3a1000-7fc61a5a0000 ---p 0000c000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7fc61a5a0000-7fc61a5a1000 r--p 0000b000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7fc61a5a1000-7fc61a5a2000 rw-p 0000c000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7fc61a5a2000-7fc61a5a8000 rw-p 00000000 00:00 0
7fc61add5000-7fc61af8c000 r-xp 00000000 fd:00 201382897 /usr/lib64/libc-2.17.so
7fc61af8c000-7fc61b18c000 ---p 001b7000 fd:00 201382897 /u/lib64/libc.so.6(__fortify_fail+0x0)[0x7f394c8c3560]
./hello_mpi.exe[0x400e66]
./hello_mpi.exe[0x400bff]
sr/lib64/libc-2.17.so
7fc61b18c000-7fc61b190000 r--p 001b7000 fd:00 201382897 /usr/lib64/libc-2.17.so
7fc61b190000-7fc61b192000 rw-p 001bb000 fd:00 201382897 /usr/lib64/libc-2.17.so
7fc61b192000-7fc61b197000 rw-p 00000000 00:00 0
7fc61b19d000-7fc61b1d8000 r-xp 00000000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7fc61b1d8000-7fc61b3d7000 ---p 0003b000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7fc61b3d7000-7fc61b3d8000 r--p 0003a000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7fc61b3d8000-7fc61b3d9000 rw-p 0003b000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7fc61b3dd000-7fc61b3f2000 r-xp 00000000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7fc61b3f2000-7fc61b5f1000 ---p 00015000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7fc61b5f1000-7fc61b5f2000 r--p 00014000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7fc61b5f2000-7fc61b5f3000 rw-p 00015000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7fc61b5f5000-7fc61b6f6000 r-xp 00000000 fd:00 201382905 /usr/lib64/libm-2.17.so
7fc61b6f6000-7fc61b8f5000 ---p 00101000 fd:00 201382905 /usr/lib64/libm-2.17.so
7fc61b8f5000-7fc61b8f6000 r--p 00100000 fd:00 201382905 /usr/lib64/libm-2.17.so
7fc61b8f6000-7fc61b8f7000 rw-p 00101000 fd:00 201382905 /usr/lib64/libm-2.17.so
7fc61b8fd000-7fc61ba1c000 r-xp 00000000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7fc61ba1c000-7fc61bc1c000 ---p 0011f000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7fc61bc1c000-7fc61bc1d000 r--p 0011f000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7fc61bc1d000-7fc61bc1f000 rw-p 00120000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7fc61bc25000-7fc61bc3b000 r-xp 00000000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
/lib64/libc.so.6(__libc_start_main+0xf5)[0x7f394c7d6b15]
./hello_mpi.exe[0x400c31]
======= Memory map: ========
7fc61bc3b000-7fc61be3b000 ---p 00016000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
7fc61be3b000-7fc61be3c000 r--p 00016000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
7fc61be3c000-7fc61be3d000 rw-p 00017000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
7fc61be3d000-7fc61be41000 rw-p 00000000 00:00 0
7fc61be45000-7fc61be4c000 r-xp 00000000 fd:00 201382510 /usr/lib64/librt-2.17.so
7fc61be4c000-7fc61c04b000 ---p 00007000 fd:00 201382510 /usr/lib64/librt-2.17.so
7fc61c04b000-7fc61c04c000 r--p 00006000 fd:00 201382510 /usr/lib64/librt-2.17.so
7fc61c04c000-7fc61c04d000 rw-p 00007000 fd:00 201382510 /usr/lib64/librt-2.17.so
7fc61c04d000-7fc61c052000 r-xp 00000000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7fc61c052000-7fc61c251000 ---p 00005000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7fc61c251000-7fc61c252000 r--p 00004000 fd:00 1362059600400000-00402000 r-xp 00000000 00:2e 493967 /home/user/tmp/hello_mpi.exe
00601000-00602000 r--p 00001000 00:2e 493967 /home/user/tmp/hello_mpi.exe
00602000-00603000 rw-p 00002000 00:2e 493967 /home/user/tmp/hello_mpi.exe
01ca0000-01cc1000 rw-p 00000000 00:00 0 [heap]
7f394bd75000-7f394bd81000 r-xp 00000000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7f394bd81000-7f394bf80000 ---p 0000c000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7f394bf80000-7f394bf81000 r--p 0000b000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7f394bf81000-7f394bf82000 rw-p 0000c000 fd:00 201382493 /usr/lib64/libnss_files-2.17.so
7f394bf82000-7f394bf88000 rw-p 00000000 00:00 0
7f394c7b5000-7f394c96c000 r-xp 00000000 fd:00 201382897 /usr/lib64/libc-2.17.so
7f394c96c000-7f394cb6c000 ---p 001b7000 fd:00 201382897 /u0 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7fc61c252000-7fc61c253000 rw-p 00005000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7fc61c255000-7fc61c256000 r-xp 00000000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7fc61c256000-7fc61c455000 ---p 00001000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7fc61c455000-7fc61c456000 r--p 00000000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7fc61c456000-7fc61c457000 rw-p 00001000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7fc61c45d000-7fc61c67e000 r-xp 00000000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7fc61c67e000-7fc61c87d000 ---p 00221000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7fc61c87d000-7fc61c88a000 r--p 00220000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7fc61c88a000-7fc61c88f000 rw-p 0022d000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7fc61c88f000-7fc61c8c8000 rw-p 00000000 00:00 0
7fc61c8cd000-7fc61c8cf000 r-xp 00000000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7fc61c8cf000-7fc61cace000 ---p 00002000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7fc61cace000-7fc61cacf000 r--p 00001000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7fc61cacf000-7fc61cad0000 rw-p 00002000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7fc61cad5000-7fc61caf6000 r-xp 00000000 fd:00 201382890 /usr/lib64/ld-2.17.so
7fc61cce1000-7fc61cce5000 rw-p 00000000 00:00 0
7fc61ccf5000-7fc61ccf6000 rw-p 00000000 00:00 0
7fc61ccf6000-7fc61ccf7000 r--p 00021000 fd:00 201382890 /usr/lib64/ld-2.17.so
7fc61ccf7000-7fc61ccf8000 rw-p 00022000 fd:00 201382890 /usr/lib64/ld-2.17.so
7fc61ccf8000-7fc61ccfa000 rw-p 00000000 00:00 0
7fc61ccfa000-7fc61ccfd000 rw-p 00000000 00:00sr/lib64/libc-2.17.so
7f394cb6c000-7f394cb70000 r--p 001b7000 fd:00 201382897 /usr/lib64/libc-2.17.so
7f394cb70000-7f394cb72000 rw-p 001bb000 fd:00 201382897 /usr/lib64/libc-2.17.so
7f394cb72000-7f394cb77000 rw-p 00000000 00:00 0
7f394cb7d000-7f394cbb8000 r-xp 00000000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7f394cbb8000-7f394cdb7000 ---p 0003b000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7f394cdb7000-7f394cdb8000 r--p 0003a000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7f394cdb8000-7f394cdb9000 rw-p 0003b000 fd:00 202328431 /usr/lib64/libquadmath.so.0.0.0
7f394cdbd000-7f394cdd2000 r-xp 00000000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7f394cdd2000-7f394cfd1000 ---p 00015000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7f394cfd1000-7f394cfd2000 r--p 00014000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
0
7ffcdf655000-7ffcdf676000 rw-p 00000000 00:00 0 [stack]
7ffcdf70d000-7ffcdf70f000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
7f394cfd2000-7f394cfd3000 rw-p 00015000 fd:00 201326729 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7f394cfd5000-7f394d0d6000 r-xp 00000000 fd:00 201382905 /usr/lib64/libm-2.17.so
7f394d0d6000-7f394d2d5000 ---p 00101000 fd:00 201382905 /usr/lib64/libm-2.17.so
7f394d2d5000-7f394d2d6000 r--p 00100000 fd:00 201382905 /usr/lib64/libm-2.17.so
7f394d2d6000-7f394d2d7000 rw-p 00101000 fd:00 201382905 /usr/lib64/libm-2.17.so
7f394d2dd000-7f394d3fc000 r-xp 00000000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7f394d3fc000-7f394d5fc000 ---p 0011f000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7f394d5fc000-7f394d5fd000 r--p 0011f000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7f394d5fd000-7f394d5ff000 rw-p 00120000 fd:00 202328443 /usr/lib64/libgfortran.so.3.0.0
7f394d605000-7f394d61b000 r-xp 00000000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
Program received signal SIGABRT: Process abort signal.
Backtrace for this error:
7f394d61b000-7f394d81b000 ---p 00016000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
7f394d81b000-7f394d81c000 r--p 00016000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
7f394d81c000-7f394d81d000 rw-p 00017000 fd:00 201382505 /usr/lib64/libpthread-2.17.so
7f394d81d000-7f394d821000 rw-p 00000000 00:00 0
7f394d825000-7f394d82c000 r-xp 00000000 fd:00 201382510 /usr/lib64/librt-2.17.so
7f394d82c000-7f394da2b000 ---p 00007000 fd:00 201382510 /usr/lib64/librt-2.17.so
7f394da2b000-7f394da2c000 r--p 00006000 fd:00 201382510 /usr/lib64/librt-2.17.so
7f394da2c000-7f394da2d000 rw-p 00007000 fd:00 201382510 /usr/lib64/librt-2.17.so
7f394da2d000-7f394da32000 r-xp 00000000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7f394da32000-7f394dc31000 ---p 00005000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7f394dc31000-7f394dc32000 r--p 00004000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7f394dc32000-7f394dc33000 rw-p 00005000 fd:00 136205960 /usr/lib64/mpich/lib/libmpl.so.1.0.0
7f394dc35000-7f394dc36000 r-xp 00000000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7f394dc36000-7f394de35000 ---p 00001000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7f394de35000-7f394de36000 r--p 00000000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7f394de36000-7f394de37000 rw-p 00001000 fd:00 136205962 /usr/lib64/mpich/lib/libopa.so.1.0.0
7f394de3d000-7f394e05e000 r-xp 00000000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7f394e05e000-7f394e25d000 ---p 00221000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7f394e25d000-7f394e26a000 r--p 00220000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7f394e26a000-7f394e26f000 rw-p 0022d000 fd:00 136205954 /usr/lib64/mpich/lib/libmpich.so.10.0.4
7f394e26f000-7f394e2a8000 rw-p 00000000 00:00 0
7f394e2ad000-7f394e2af000 r-xp 00000000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7f394e2af000-7f394e4ae000 ---p 00002000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7f394e4ae000-7f394e4af000 r--p 00001000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7f394e4af000-7f394e4b0000 rw-p 00002000 fd:00 136205958 /usr/lib64/mpich/lib/libmpichf90.so.10.0.4
7f394e4b5000-7f394e4d6000 r-xp 00000000 fd:00 201382890 /usr/lib64/ld-2.17.so
7f394e6ce000-7f394e6d6000 rw-p 00000000 00:00 0
7f394e6d6000-7f394e6d7000 r--p 00021000 fd:00 201382890 /usr/lib64/ld-2.17.so
7f394e6d7000-7f394e6d8000 rw-p 00022000 fd:00 201382890 /usr/lib64/ld-2.17.so
7f394e6d8000-7f394e6da000 rw-p 00000000 00:00 0
7fffabb37000-7fffabb58000 rw-p 00000000 00:00 0 [stack]
7fffabbed000-7fffabbef000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Program received signal SIGABRT: Process abort signal.
Backtrace for this error:
#0 0x7FC61B916467
#1 0x7FC61B916AAE
#2 0x7FC61AE0A66F
#3 0x7FC61AE0A5F7
#4 0x7FC61AE0BCE7
#0 0x7F394D2F6467
#1 0x#5 0x7FC61AE4A326
#6 0x7FC61AEE3596
#7 0x7FC61AEE355F
7F394D2F6AAE
#2 0x7F394C7EA66F
#3 0x7F394C7EA5F7
#4 0x7F394C7EBCE7
#5 0x7F394C82A326
#6 0x7F394C8C3596
#7 0x7F394C8C355F
#8 0x400E65 in mpi at hello_mpi.f90:16
#8 0x400E65 in mpi at hello_mpi.f90:16
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= EXIT CODE: 6
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Aborted (signal 6)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions
So my question is, what could be causing this problem? The interesting thing is a similar piece of MPI code in C (shown below, obtained from the internet) worked just fine on both nodes without the above problem.
#include <mpi.h>
#include <stdio.h>
int main(int argc, char** argv) {
// Initialize the MPI environment
MPI_Init(NULL, NULL);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
// Get the name of the processor
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
// Print off a hello world message
printf("Hello world from processor %s, rank %d"
" out of %d processors\n",
processor_name, world_rank, world_size);
// Finalize the MPI environment.
MPI_Finalize();
}
Could this be hardware related? Or something else?
name
is too short. The MPI specification states:
The argument
name
must represent storage that is at leastMPI_MAX_PROCESSOR_NAME
characters long.MPI_GET_PROCESSOR_NAME
may write up to this many characters into name.
If that does not fix the problem, then your MPI implementation might be broken.