This code takes on average 1.2 seconds to execute on a 10.1 GB file. This is unacceptable; in my opinion it should take about 200 ms. I have things to do and I'm not waiting a day for this to execute:
#include <iostream>
#include <cstdio>
#include <chrono> // chrono related code removed
#include <immintrin.h>
#define CHUNK_SIZE (384 * 1024)
using namespace std;
/// Counts the bytes equal to `target` in the half-open range [begin, end).
///
/// @param begin  Pointer to the first byte to examine. No alignment is required.
/// @param end    Pointer one past the last byte to examine.
/// @param target Byte value to count.
/// @return Number of bytes in [begin, end) that compare equal to `target`.
///
/// Whole 64-byte blocks are processed with AVX2 when the translation unit is
/// built with AVX2 enabled (or with MSVC on x86/x64); any remaining bytes, and
/// the whole range when AVX2 is not enabled, are counted one byte at a time.
inline int count_avx2(const char* begin, const char* end, char target) {
    const char* ptr = begin;
    int result = 0;
#if defined(__AVX2__) || \
    (defined(_MSC_VER) && !defined(__clang__) && (defined(_M_X64) || defined(_M_IX86)))
    const __m256i avx2_Target = _mm256_set1_epi8(target);
    // Run the vector loop only while at least 64 bytes remain.
    for (; end - ptr >= 64; ptr += 64) {
        _mm_prefetch(ptr + 128, _MM_HINT_T0);
        // Unaligned loads: callers pass arbitrary char pointers, so the aligned
        // load form would fault on any address that is not 32-byte aligned.
        const __m256i chunk1 =
            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
        const __m256i chunk2 =
            _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr + 32));
        // cmpeq yields 0xFF in each matching byte lane; movemask packs one bit
        // per lane, so the popcount of the mask is the number of matches.
        const int mask1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk1, avx2_Target));
        const int mask2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk2, avx2_Target));
        result += _mm_popcnt_u32(static_cast<unsigned int>(mask1));
        result += _mm_popcnt_u32(static_cast<unsigned int>(mask2));
    }
#endif
    // Scalar tail: the bytes that do not fill a whole 64-byte block were
    // previously skipped, which undercounted matches near the end of the range.
    for (; ptr != end; ++ptr) {
        if (*ptr == target) {
            ++result;
        }
    }
    return result;
}
/// Counts the '\n' bytes in the file named `filename`.
///
/// The file is opened in binary mode and read in CHUNK_SIZE-byte pieces; each
/// piece is handed to count_avx2.
///
/// @param filename Path of the file to scan.
/// @return The number of newline bytes, or -1 if the file cannot be opened or
///         a read error occurs (a message is written to stderr in both cases).
///
/// NOTE(review): the total is accumulated in an `int`, so a file containing
/// more than INT_MAX newlines would overflow it.
int counter(const std::string& filename) {
    FILE* file = fopen(filename.c_str(), "rb");
    if (!file) {
        std::cerr << "Error: Couldn't open file " << filename << std::endl;
        return -1;
    }
    // fread always fills from the start of the buffer, so aligning the buffer
    // keeps the start of every chunk aligned for 32-byte vector loads.
    alignas(64) char buffer[CHUNK_SIZE];
    int totalNewlines = 0;
    size_t bytesRead;
    // Drive the loop from fread's result rather than feof(): a read error sets
    // the error indicator, not EOF, so `while (!feof(file))` would spin forever.
    while ((bytesRead = fread(buffer, 1, sizeof(buffer), file)) > 0) {
        totalNewlines += count_avx2(buffer, buffer + bytesRead, '\n');
    }
    // Query the error indicator before fclose invalidates the stream.
    const bool readFailed = ferror(file) != 0;
    fclose(file);
    if (readFailed) {
        std::cerr << "Error: Couldn't read file " << filename << std::endl;
        return -1;
    }
    return totalNewlines;
}
/// Program entry point: counts the newlines in "output.txt" and prints the
/// total to stdout. Returns 1 (after writing a message to stderr) when
/// counter reports a negative result, 0 otherwise.
int main() {
    const std::string path = "output.txt";
    const int lines = counter(path);
    // Handle the failure case first so the success path reads straight through.
    if (lines < 0) {
        std::cerr << "Error counting newlines in file." << std::endl;
        return 1;
    }
    std::cout << "Number of newlines in " << path << ": " << lines << std::endl;
    return 0;
}
Any opinions on how to make this faster?
I tried using fstream, memory mapping, and Windows's built-in io.h; none of these was faster than 1 second or satisfied my requirements. The counting algorithm is fine: it takes ~100 ms cumulatively.
By optimizing count_avx2 (bit-shifting the masks together and using popcnt_u64), I was able to get it down to ~1 s. It's not optimal, but it's better, I guess.