c++ performance file-read

C++ File reading speed unsatisfactory


This code takes 1.2 seconds on average to execute on a 10.1 GB file. This is unacceptable; IMO it should take about 200 ms. I have stuff to do, and I'm not waiting a day for this to execute:

#include <immintrin.h>

#include <chrono>  // chrono related code removed
#include <climits>
#include <cstdio>
#include <iostream>
#include <string>

#define CHUNK_SIZE (384 * 1024)

using namespace std;

// Counts how many bytes in [begin, end) are equal to `target`.
//
// The range is scanned 64 bytes per iteration using two 32-byte AVX2
// compares; the remaining tail (fewer than 64 bytes) is counted with a plain
// byte loop so that every byte of the range contributes to the result.
//
// begin  - first byte to examine; no particular alignment is required.
// end    - one past the last byte to examine.
// target - byte value to count.
//
// Returns the number of matching bytes. The total is kept in an int, so a
// single call must not be given a range with more than INT_MAX matches.
#if defined(__GNUC__) || defined(__clang__)
// Allows GCC/Clang to compile the intrinsics below even when the translation
// unit is not built with -mavx2. The CPU running the code must still support
// AVX2 and POPCNT.
__attribute__((target("avx2,popcnt")))
#endif
inline int count_avx2(const char* begin, const char* end, char target) {
    const __m256i needle = _mm256_set1_epi8(target);
    const char* ptr = begin;
    int result = 0;

    // Test the remaining length instead of forming `ptr + 63`, which would
    // point beyond the buffer when fewer than 63 bytes are left.
    for (; end - ptr >= 64; ptr += 64) {
        // Unaligned loads: nothing guarantees that `begin` is 32-byte aligned,
        // which the aligned load instruction would require.
        const __m256i lo = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
        const __m256i hi = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr + 32));

        // One mask bit per byte that matched `target`.
        const int mask_lo = _mm256_movemask_epi8(_mm256_cmpeq_epi8(lo, needle));
        const int mask_hi = _mm256_movemask_epi8(_mm256_cmpeq_epi8(hi, needle));

        result += _mm_popcnt_u32(static_cast<unsigned>(mask_lo));
        result += _mm_popcnt_u32(static_cast<unsigned>(mask_hi));
    }

    // Scalar tail: without it the last (end - begin) % 64 bytes are skipped.
    for (; ptr != end; ++ptr) {
        result += (*ptr == target) ? 1 : 0;
    }

    return result;
}

// Counts the '\n' bytes in the file named `filename`.
//
// The file is opened in binary mode and read in pieces of CHUNK_SIZE bytes;
// each piece is passed to count_avx2().
//
// filename - path of the file to scan.
//
// Returns the number of newlines, clamped to INT_MAX if the file contains
// more than that. Returns -1 (after printing a message to std::cerr) if the
// file cannot be opened or a read fails.
int counter(const std::string& filename) {
    std::FILE* file = std::fopen(filename.c_str(), "rb");
    if (!file) {
        std::cerr << "Error: Couldn't open file " << filename << '\n';
        return -1;
    }

    char buffer[CHUNK_SIZE];
    // 64-bit accumulator: a multi-gigabyte file can contain more newlines
    // than an int can represent, and signed overflow is undefined behavior.
    long long totalNewlines = 0;
    std::size_t bytesRead = 0;

    // Loop on fread's return value rather than feof(): after a read error
    // feof() stays false, so a `while (!feof(file))` loop would never end.
    while ((bytesRead = std::fread(buffer, 1, sizeof(buffer), file)) > 0) {
        totalNewlines += count_avx2(buffer, buffer + bytesRead, '\n');
    }

    // A zero-length read means either end-of-file or an error; tell them apart
    // before the stream is closed.
    const bool readFailed = std::ferror(file) != 0;
    std::fclose(file);

    if (readFailed) {
        std::cerr << "Error: Couldn't read file " << filename << '\n';
        return -1;
    }

    return totalNewlines > INT_MAX ? INT_MAX : static_cast<int>(totalNewlines);
}

int main() {
    string filename = "output.txt";
    int newlineCount = counter(filename);

    if (newlineCount >= 0) {
        cout << "Number of newlines in " << filename << ": " << newlineCount << endl;
    } else {
        cerr << "Error counting newlines in file." << endl;
        return 1;
    }

    return 0;
}

Any opinions on how to make this faster?

I tried using fstream, memory mapping, and Windows's built-in io.h; none of these was faster than 1 second or satisfied my requirements. The count algorithm is fine: it takes ~100 ms cumulative.

https://i.sstatic.net/OlzNj.png


Solution

  • By optimizing count_avx2 (bit-shifting the two masks together and using popcnt_u64), I was able to get it down to around ~1 s. It's not optimal, but it's better, I guess.