Given an array of <key, value> pairs, what is the state-of-the-art approach to grouping together the keys that share the same value?
#include <vector>
#include <random>
#include <execution>
#include <unordered_map>
#include <iostream>
#include <tbb/concurrent_unordered_map.h>
#include <tbb/concurrent_vector.h>
#include <chrono>

// A small stack-based stopwatch: tik() pushes a start time, tok() pops it
// and returns the elapsed time in timetype units (microseconds by default).
template<typename timetype = std::chrono::microseconds>
struct tiktok
{
  std::vector<std::chrono::time_point<std::chrono::steady_clock> > starts;
  void reset() { starts.reserve(50); starts.resize(0); }
  tiktok() { reset(); }
  std::size_t tik() {
    starts.emplace_back(std::chrono::steady_clock::now());
    return 0;
  }
  std::size_t tok() {
    std::size_t rst = std::chrono::duration_cast<timetype> (
      std::chrono::steady_clock::now() - starts.back()).count();
    starts.pop_back();
    return rst;
  }
};

int main()
{
  // Generate NuniqueString random strings of 1 to 1024 random bytes each.
  int NuniqueString = 3e5;
  std::vector<std::string> x(NuniqueString);
  std::mt19937 rng(123);
  for (auto& u: x) {
    u = std::string(rng() % 1024 + 1, ' ');
    char* c = &u[0];
    for (int i = 0, iend = u.size(); i < iend; ++i)
      c[i] = rng() % 256;
  }

  // The key-value pair definition.
  struct Item { int key; std::string s; };
  std::vector<Item> items(x.size() * 10);
  for (int i = 0, iend = items.size(); i < iend; ++i) {
    items[i].key = i;
    items[i].s = x[rng() % NuniqueString];
  }
  auto itemsReserve = items; // Backup, since the loops below move items' strings.

  // Measure time cost for grouping items' keys, using STL unordered_map.
  tiktok timer;
  if constexpr (true) {
    std::unordered_map<
      std::string, std::vector<int> > H(items.size() * 1.3);
    timer.tik();
    std::for_each(items.begin(), items.end(), [&](auto& i)->void {
      H[std::move(i.s)].push_back(i.key);
    });
    std::cout << "Sequential, use STL unordered_map time cost (us) = " <<
      timer.tok() << "\n\n";
  }

  // Measure time cost using tbb concurrent unordered_map and concurrent vector.
  if constexpr (true) {
    items = itemsReserve;
    tbb::concurrent_unordered_map<
      std::string, tbb::concurrent_vector<int>,
      std::hash<std::string> > H(items.size() * 1.3);
    timer.tik();
    std::for_each( std::execution::par_unseq, items.begin(),
                   items.end(), [&](auto& i)->void {
      auto it = H.insert(std::pair(
        std::move(i.s),
        tbb::concurrent_vector<int>()));
      it.first->second.push_back(i.key);
    });
    std::cout << "Parallel, use tbb concurrent unordered_map and"
      " concurrent vector time cost (us) = " << timer.tok() << "\n\n";
  }
}
Compiling with

g++ -std=c++20 groupStrings.cpp -Ofast -march=native -o test -ltbb

and running on a 16-core machine produces the following:
Sequential, use STL unordered_map time cost (us) = 1700035
Parallel, use tbb concurrent unordered_map and concurrent vector time cost (us) = 1575196
I need to perform such groupings frequently on massive arrays of key-value pairs. Before I start rolling my own solution, is there a state-of-the-art approach to this problem?
The problem is that the strings should not be moved into the tbb::concurrent_unordered_map:
The code is identical to the above, except that the parallel lambda now copies the string instead of moving it:

std::for_each( std::execution::par_unseq, items.begin(),
               items.end(), [&](auto& i)->void {
  auto it = H.insert(std::pair(
    i.s, // was: std::move(i.s)
    tbb::concurrent_vector<int>()));
  it.first->second.push_back(i.key);
});
This prints:
Sequential, use STL unordered_map time cost (us) = 1691195
Parallel, use tbb concurrent unordered_map and concurrent vector time cost (us) = 423323
A 4x speedup is not bad. But now the question is: why is moving the resource so much slower than copying it? My intuition is the following. std::move itself is only a cast, but the move construction it enables has to write to the source string's header (a 24-byte block, if I am right) to null out its data pointer. Because these headers sit contiguously in memory, writes from different threads constantly hit the same cache lines (64-byte blocks), the cache-coherence mechanism kicks in, and everything slows down. In short, false sharing: https://en.wikipedia.org/wiki/False_sharing. I would have wasted far more time if I had not omitted the std::move by accident. Tricky tricky..
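Here is a minimal, untested sketch of how one could probe that hypothesis: pad each item to a full cache line so neighboring string headers can no longer share one. PaddedItem is a hypothetical name, and I am assuming std::hardware_destructive_interference_size is available (C++17, <new>) and matches the cache-line size on the target machine. If false sharing is the culprit, the std::move variant running over a std::vector<PaddedItem> should no longer collapse:

#include <new>    // std::hardware_destructive_interference_size
#include <string>

// Hypothetical: align every item to its own cache line, so the 24-byte
// string headers of adjacent items never land on the same 64-byte line.
struct alignas(std::hardware_destructive_interference_size) PaddedItem {
  int key;
  std::string s;
};
static_assert(alignof(PaddedItem) >= 64, "expected a 64-byte cache line");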
@David Eisenstat's suggestion is great. parlay::group_by_key() is very easy to use and is about 1.5x faster than the tbb::concurrent_unordered_map approach. Highly recommend it.
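For reference, a minimal sketch of the usage, going by ParlayLib's documented primitives (the exact return type is my assumption from the docs: a sequence of <key, sequence-of-values> pairs):

#include <string>
#include <utility>
#include <parlay/primitives.h> // parlay::group_by_key
#include <parlay/sequence.h>

int main()
{
  // Key first, value second: group_by_key collects, for every distinct
  // string, all the ints paired with it, and does so in parallel.
  parlay::sequence<std::pair<std::string, int>> pairs = {
    {"apple", 0}, {"pear", 1}, {"apple", 2}, {"pear", 3}};
  auto groups = parlay::group_by_key(pairs);
  // groups now holds something like {"apple", {0, 2}} and {"pear", {1, 3}};
  // the order of the groups themselves is not guaranteed.
}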