pythonperformancetime-complexityspace-complexity

Improve computational time and memory usage of the calculation of a large Matrix that has four loops


I want to calculate a Matrix G that its elements is a scalar and are calculated as:

enter image description here

I want to calculated this matrix for a large n > 10000, d>30. My code is below but it has a huge overhead and it still takes very long time.

How can I make this computation at the fastest possible way? Without using GPU and Minimize the memory usage.

import numpy as np
from sklearn.gaussian_process.kernels import Matern
from tqdm import tqdm
from joblib import Parallel, delayed

# Pre-flattened computation to minimize data transfer overhead
def precompute_differences(R, Z):
    n, d        = R.shape
    R_diff_flat = (R[:, None, :] - R[None, :, :]).reshape(n * n, d)
    Z_diff      = Z[:, None, :] - Z[None, :, :]
    return R_diff_flat, Z_diff

def compute_G_row(i, R_diff_flat, Z_diff, W, gamma_val, kernel, n, d):
    """
    Compute the i-th row for j >= i and store them in a temporary array.
    """
    row_values = np.zeros(n)
    for j in range(i, n):
        Z_ij   = gamma_val * Z_diff[i, j].reshape(1, d)
        K_flat = kernel(R_diff_flat, Z_ij)
        K_ij   = K_flat.reshape(n, n)
        row_values[j] = np.sum(W * K_ij)
    return i, row_values

def compute_G(M, gamma, R, Z, nu=1.5, length_scale=1.0, use_parallel=True):
    """
    Compute the G matrix with fewer kernel evaluations by exploiting symmetry:
    G[i,j] = G[j,i]. We only compute for j >= i, then mirror the result.
    """
    R = np.asarray(R)
    Z = np.asarray(Z)
    M = np.asarray(M).reshape(-1, 1)  # ensure (n,1)
    n, d = R.shape

    # Precompute data
    R_diff_flat, Z_diff = precompute_differences(R, Z)
    W = M @ M.T  # Weight matrix

    G = np.zeros((n, n))

    kernel = Matern(length_scale=length_scale, nu=nu)

    if use_parallel and n > 1:
        # Parallel computation
        results = Parallel(n_jobs=-1)(
            delayed(compute_G_row)(i, R_diff_flat, Z_diff, W, gamma, kernel, n, d)
            for i in tqdm(range(n), desc="Computing G matrix")
        )
    else:
        # Single-threaded computation
        results = []
        for i in tqdm(range(n), desc="Computing G matrix"):
            row_values = np.zeros(n)
            for j in range(i, n):
                Z_ij   = gamma * Z_diff[i, j].reshape(1, d)
                K_flat = kernel(R_diff_flat, Z_ij)
                K_ij   = K_flat.reshape(n, n)
                row_values[j] = np.sum(W * K_ij)
            results.append((i, row_values))

    # Sort and fill final G by symmetry
    results.sort(key=lambda x: x[0])
    for i, row_vals in results:
        for j in range(i, n):
            G[i, j] = row_vals[j]
            G[j, i] = row_vals[j]  # mirror for symmetry

    # Delete auxiliary variables to save memory
    del R_diff_flat, Z_diff, W, kernel, results

    # Optional checks
    is_symmetric = np.allclose(G, G.T, atol=1e-8)
    eigenvalues = np.linalg.eigvalsh(G)
    is_semi_positive_definite = np.all(eigenvalues >= -1e-8)
    print(f"G is semi-positive definite: {is_semi_positive_definite}")
    print(f"G is symmetric: {is_symmetric}")

    # Delete all local auxiliary variables except G to save memory
    local_vars = list(locals().keys())
    for var_name in local_vars:
        if var_name not in ["G"]:
            del locals()[var_name]

    return G

Toy Example

# Example usage:
if __name__ == "__main__":
    __spec__ = None
    n = 20
    d = 10
    gamma = 0.9
    R = np.random.rand(n, d)
    Z = np.random.rand(n, d)
    M = np.random.rand(n, 1)

    G = compute_G(M, gamma, R, Z, nu=1.5, length_scale=1.0, use_parallel=True)
    print("G computed with shape:", G.shape)

Solution

  • A convenient way is to note that each entry could also be written as : enter image description here

    with above notation the computation could be much easier and:

    import numpy as np
    from tqdm import tqdm
    from sklearn.gaussian_process.kernels import Matern
    from yaspin import yaspin
    import time
    from memory_profiler import profile
    ##-----------------
    @profile
    def G_einsum_block(M, gamma, R, Z, nu=1.5, length_scale=1.0, block_size=100):
        n, d = R.shape
        G = np.zeros((n, n))
        Gamma = M.ravel()  # Ensure shape is (n,)
    
        # Initialize the Matern kernel
        kernel = Matern(length_scale=length_scale, nu=nu)
    
        # with yaspin(text="Computing Matrix G", spinner="dots") as spinner:
            # Iterate over chunks of ell
        for ell_start in tqdm(range(0, n, block_size), desc="Computing G by ell-Chunks"):
            ell_end = min(ell_start + block_size, n)
            ell_indices = np.arange(ell_start, ell_end)
            Gamma_ell = Gamma[ell_indices]
    
            # Compute shifted points for current ell chunk
            # Shape: (n, block_size, d)
            X_ell = gamma * Z[:, np.newaxis, :] + R[ell_indices]
    
            # Iterate over chunks of m
            for m_start in range(0, n, block_size):
                m_end = min(m_start + block_size, n)
                m_indices = np.arange(m_start, m_end)
                Gamma_m = Gamma[m_indices]
    
                # Compute shifted points for current m chunk
                # Shape: (n, block_size, d)
                X_m = gamma * Z[:, np.newaxis, :] + R[m_indices]
    
                # Reshape for kernel computation
                # Each pair (i, ell) and (j, m) needs to be compared
                # We compute pairwise distances between X_ell and X_m
                # To vectorize, reshape to (n * block_size, d)
                X_i_ell = X_ell.reshape(n * (ell_end - ell_start), d)
                X_j_m = X_m.reshape(n * (m_end - m_start), d)
    
                # Compute the kernel matrix for the current chunks
                # Shape: (n * block_size, n * block_size)
                K_chunk = kernel(X_i_ell, X_j_m)
    
                # Reshape K_chunk to (n, ell_chunk, n, m_chunk)
                K_chunk = K_chunk.reshape(n, ell_end - ell_start, n, m_end - m_start)
    
                # Multiply by M for current chunks
                # Shape: (ell_chunk,) and (m_chunk,)
                # Use broadcasting in einsum
                # 'iljm,l,m->ij' corresponds to:
                # i: row index of G
                # j: column index of G
                # l: current ell chunk
                # m: current m chunk
                G += np.einsum('iljm,l,m->ij', K_chunk, Gamma_ell, Gamma_m)
    
            # spinner.ok("✔")
    
        print("")
    
        # Optional checks
        is_symmetric = np.allclose(G, G.T, atol=1e-8)
        eigenvalues = np.linalg.eigvalsh(G)
        is_semi_positive_definite = np.all(eigenvalues >= -1e-8)
        print(f"G is semi-positive definite: {is_semi_positive_definite}")
        print(f"G is symmetric: {is_symmetric}")
    
        return G
    
    #%%
    ##--------------------------------------------------------------
    ### --- Example usage --- ####
    if __name__ == "__main__":
        # Example dimensions
        n = 20
        d = 10
        gamma = 0.9
    
        # Generate dummy data
        R = np.random.rand(n, d)
        Z = np.random.rand(n, d)
        M = np.random.rand(n, 1)
    
        # Compute G with a progress bar
        G = G_einsum_block(M, gamma, R, Z, nu=1.5, length_scale=1.0)
        print("Shape of G:", G.shape)