
ValueError: mismatch in send count 2 and receive count 8


I have this MPI_example.py file that uses power iteration to estimate the largest eigenvalue of a matrix:

from mpi4py import MPI
import numpy as np
from scipy.stats import ortho_group
from scipy.sparse import spdiags


def generate_matrix(dim):
    a = ortho_group.rvs(dim, random_state=0)
    b = np.linspace(1., 10., dim)
    return a @ spdiags(b, 0, dim, dim) @ a.T


def power_iteration(A, b, num_iters):
    for _ in range(num_iters):
        b_new = A @ b
        b_new_norm = np.linalg.norm(b_new)
        b = b_new / b_new_norm
    return b


comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()


n = 8  
num_iters = 100


if rank == 0:
    A = generate_matrix(n)
else:
    A = None


rows_per_proc = n // size
local_A = np.zeros((rows_per_proc, n))
comm.Scatter(A, local_A, root=0)


if rank == 0:
    b = np.ones(n)
else:
    b = np.zeros(n)

comm.Bcast(b, root=0)


import time
start_time = time.time()  # start timing the parallel power iteration

for _ in range(num_iters):
    local_b_new = local_A @ b
    global_b_new = np.zeros(n)
    comm.Allreduce(local_b_new, global_b_new)
    norm = np.linalg.norm(global_b_new)
    b = global_b_new / norm

end_time = time.time()  # stop the timer right after the loop

if rank == 0:
    estimated_eigenvalue = np.dot(b.T, A @ b) / np.dot(b.T, b)
    print(f"Eigen vector estimated: {estimated_eigenvalue}")
    print(f"Error: {abs(10 - estimated_eigenvalue)}")

if rank == 0:
    print(f"Time: {end_time - start_time} [s]")

I'm running it in Google Colab, so I set it up like this:

from google.colab import files
uploaded = files.upload()  # select MPI_example.py
!ls  # check the .py file
!pip install mpi4py
import mpi4py
print(mpi4py.__version__)

Then I run the script with:

!mpirun --allow-run-as-root --oversubscribe -n 4 python MPI_example.py

But I'm getting this error:

ValueError: mismatch in send count 2 and receive count 8
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[21624,1],2]
  Exit code:    1

I want to try different values of n when running !mpirun --allow-run-as-root --oversubscribe -n 4 python MPI_example.py. How can I fix this error, and what is wrong?


Solution

  • Replacing Allreduce with Allgather, I get a reproducible result that is independent of the number of parallel processes (after increasing n so that it can still be divided evenly among a higher number of MPI processes):

        # comm.Allreduce(local_b_new, global_b_new)
        comm.Allgather(local_b_new, global_b_new)
    

    Allreduce applies the default op=SUM element-wise across the buffers contributed by all processes and distributes the result to everyone, which requires the send and receive buffers to have the same length. Here local_b_new holds only the local partition of A @ b (n // size = 2 elements), while global_b_new has n = 8 elements, which is exactly the "mismatch in send count 2 and receive count 8" from the traceback. An element-wise sum is also the wrong operation for row-partitioned results: Allgather collects the local vectors from all processes, concatenates them in rank order, and distributes the concatenated vector to every process.
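
    For reference, here is a minimal self-contained sketch of the corrected iteration loop. Like the original script, it assumes n is divisible by the number of processes; the matrix construction is simplified to a random symmetric matrix purely to keep the example short, since the Scatter/Allgather pattern is the point:

        from mpi4py import MPI
        import numpy as np

        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        n = 8            # must be divisible by the number of processes
        num_iters = 100

        if rank == 0:
            rng = np.random.default_rng(0)
            A = rng.standard_normal((n, n))
            A = A + A.T  # symmetric test matrix (stand-in for generate_matrix)
        else:
            A = None

        # each rank receives a contiguous block of n // size rows of A
        rows_per_proc = n // size
        local_A = np.empty((rows_per_proc, n))
        comm.Scatter(A, local_A, root=0)

        b = np.ones(n)   # identical start vector on every rank

        for _ in range(num_iters):
            local_b_new = local_A @ b                  # this rank's rows of A @ b
            global_b_new = np.empty(n)
            comm.Allgather(local_b_new, global_b_new)  # concatenate partitions in rank order
            b = global_b_new / np.linalg.norm(global_b_new)

        if rank == 0:
            print("Rayleigh quotient:", (b @ (A @ b)) / (b @ b))

    If you also want values of n that are not divisible by the number of processes, the per-rank counts differ, so you would need the variable-count variants Scatterv and Allgatherv instead of Scatter and Allgather.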