I have been playing around with Creel's video on optimising matrix multiplication and I don't get the speedups he demonstrates. What is the reason for this? Below is the program I used to benchmark. There are three functions: naive multiplication, multiplication after an in-place transpose of B, and the transposed version with blocking. I ran this with n = 4000 and block sizes 1, 10, 20, 50, 100 and 200. My caches are 32 KB L1D, 256 KB L2, and 4 MB shared L3, so for block size 20, for example, two 20 x 20 blocks of doubles take 20 * 20 * 8 * 2 = 6.4 KB and should fit comfortably in L1. No matter the block size, the blocked version takes about 50 s, the same as the transpose-only version. I compiled with gcc -O3 -mavx2.
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

void matmul(size_t n, double A[n][n], double B[n][n], double result[n][n])
{
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            double acc = 0;
            for (size_t k = 0; k < n; k++) {
                acc += A[i][k] * B[k][j];
            }
            result[i][j] = acc;
        }
    }
}

void transpose(size_t n, double matrix[n][n])
{
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < i; j++) {
            double temp = matrix[i][j];
            matrix[i][j] = matrix[j][i];
            matrix[j][i] = temp;
        }
    }
}

void matmulTrans(size_t n, double A[n][n], double B[n][n], double result[n][n])
{
    transpose(n, B);
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            double acc = 0;
            for (size_t k = 0; k < n; k++) {
                acc += A[i][k] * B[j][k];
            }
            result[i][j] = acc;
        }
    }
}

void matmulBlock(size_t n, double A[n][n], double B[n][n],
                 double result[n][n], size_t blockSize)
{
    transpose(n, B);
    for (size_t i = 0; i < n; i += blockSize) {
        for (size_t j = 0; j < n; j += blockSize) {
            for (size_t iBlock = i; iBlock < i + blockSize; iBlock++) {
                for (size_t jBlock = j; jBlock < j + blockSize; jBlock++) {
                    double acc = 0;
                    for (size_t k = 0; k < n; k++) {
                        acc += A[iBlock][k] * B[jBlock][k];
                    }
                    result[iBlock][jBlock] = acc;
                }
            }
        }
    }
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        printf("Provide two arguments!\n");
        return 1;
    }
    int n = atoi(argv[1]);
    int blockSize = atoi(argv[2]);
    double (*A)[n] = malloc(n * n * sizeof(double));
    double (*B)[n] = malloc(n * n * sizeof(double));
    double (*result)[n] = malloc(n * n * sizeof(double));
    clock_t time1 = clock();
    matmulBlock(n, A, B, result, blockSize);
    clock_t time2 = clock();
    // matmul(n, A, B, result);
    clock_t time3 = clock();
    matmulTrans(n, A, B, result);
    clock_t time4 = clock();
    printf("Blocked version: %lfs.\nNaive version: %lfs.\n"
           "Transposed version: %lfs.\n",
           (double) (time2 - time1) / CLOCKS_PER_SEC,
           (double) (time3 - time2) / CLOCKS_PER_SEC,
           (double) (time4 - time3) / CLOCKS_PER_SEC);
    free(A);
    free(B);
    free(result);
    return 0;
}
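A side note on the listing: A and B are never filled, so the kernels run on whatever malloc returns. For the timing question this hardly matters, but a minimal fill like the following sketch, placed before the first clock() call, makes the benchmark well defined. The fill values are arbitrary; any deterministic pattern works.

/* Arbitrary deterministic fill so the kernels read initialized values.
   (The casts avoid unsigned wraparound when j > i.) */
for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
        A[i][j] = (double) i + (double) j;
        B[i][j] = (double) i - (double) j;
    }
}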
The problem is that I had only blocked the i and the j loops. This essentially partitions A into an (n / blockSize) x 1 grid of blockSize x n blocks and B into a 1 x (n / blockSize) grid of n x blockSize blocks. These blocks are far too large to fit in cache.
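To put numbers on this, take my n = 4000 and blockSize = 20: each (i, j) block pair still runs every inner product over the full k range, so it touches blockSize rows of A plus blockSize rows of the transposed B, i.e. 2 * 20 * 4000 * 8 = 1.28 MB. That fits only in L3, not in the 32 KB L1 or the 256 KB L2. Worse, for each i block the j loop streams through all of B (4000 * 4000 * 8 = 128 MB), evicting anything that might have been reused. So the blocking adds nothing over the plain transposed version, which is exactly what the identical 50 s timings show. The fix is to block the k loop as well: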
void matmulBlock(size_t n, double A[n][n], double B[n][n],
                 double result[__restrict__ n][n], size_t blockSize)
{
    /* This kernel accumulates with +=, so result must start zeroed
       (malloc does not guarantee zeroed memory). */
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < n; j++)
            result[i][j] = 0.0;
    /* Assumes n is a multiple of blockSize, as in all the runs above. */
    for (size_t i = 0; i < n; i += blockSize) {
        for (size_t j = 0; j < n; j += blockSize) {
            for (size_t k = 0; k < n; k += blockSize) {
                /* Work on one blockSize x blockSize tile of each matrix. */
                for (size_t iBlock = i; iBlock < i + blockSize; iBlock++) {
                    for (size_t kBlock = k; kBlock < k + blockSize; kBlock++) {
                        for (size_t jBlock = j; jBlock < j + blockSize; jBlock++) {
                            result[iBlock][jBlock] += A[iBlock][kBlock] * B[kBlock][jBlock];
                        }
                    }
                }
            }
        }
    }
}
This version does lead to speedups. Each iteration of the three outer loops now works on one blockSize x blockSize tile of A, B and result; for blockSize = 20 that is 3 * 20 * 20 * 8 = 9.6 KB, which fits easily in L1. Note that B is no longer transposed: with jBlock innermost, the accesses to B[kBlock][jBlock] and result[iBlock][jBlock] are sequential anyway, which also makes the inner loop easy for the compiler to vectorise.
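As a sanity check that the blocked kernel still computes the right product, a standalone test along these lines compares it against the naive version on a small size. This is a minimal sketch: it assumes matmul and the fixed matmulBlock above are compiled into the same program, maxDiff is a helper name I made up, and the sizes are arbitrary as long as n is a multiple of blockSize.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Largest absolute difference between two n x n matrices. */
static double maxDiff(size_t n, double X[n][n], double Y[n][n])
{
    double m = 0.0;
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < n; j++) {
            double d = fabs(X[i][j] - Y[i][j]);
            if (d > m)
                m = d;
        }
    return m;
}

int main(void)
{
    size_t n = 40, blockSize = 10;
    double (*A)[n] = malloc(n * n * sizeof(double));
    double (*B)[n] = malloc(n * n * sizeof(double));
    double (*R1)[n] = malloc(n * n * sizeof(double));
    double (*R2)[n] = malloc(n * n * sizeof(double));
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < n; j++) {
            A[i][j] = (double) i + (double) j;
            B[i][j] = (double) i - (double) j;
        }
    matmul(n, A, B, R1);                  /* reference result */
    matmulBlock(n, A, B, R2, blockSize);  /* blocked result   */
    printf("max abs difference: %g\n", maxDiff(n, R1, R2));
    free(A);
    free(B);
    free(R1);
    free(R2);
    return 0;
}

With small integer-valued inputs like these, every product and partial sum is exactly representable in a double, so both versions should agree exactly and any nonzero difference points at an indexing bug.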