Boost MultiIndex Container, when defined to have hashed_non_unique
keys, can group equivalent keys together and return them all against an equal_range
query, as mentioned here. But I see no way of querying the largest range (or n largest ranges) in a set. Without comparing between the range sizes of distinct hashes, which can become computationally very expensive, is there a way to query the largest equal ranges?
If we consider a simple example, such as this one, I would like to query by frequency and get Tom
as the first result, and then Jack
and Leo
in no particular order.
Ok, if you're using non-unique hashed indices, turns out equal_range
does not invoke equality comparison for all the elements in the returned range (unlike common implementations of std::unordered_multimap
, BTW), so the following can be very efficient:
template<typename HashIndex>
std::multimap<
std::size_t,
std::reference_wrapper<const typename HashIndex::value_type>,
std::greater<std::size_t>
> group_sizes(const HashIndex& i)
{
decltype(group_sizes(i)) res;
for(auto it=i.begin(),end=i.end();it!=end;){
auto next=i.equal_range(*it).second;
res.emplace((std::size_t)std::distance(it,next),*it);
it=next;
}
return res;
}
To check how efficient this actually is, let's try instrumenting the element type:
#include <boost/multi_index_container.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/identity.hpp>
#include <cstring>
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <map>
template<typename HashIndex>
std::multimap<
std::size_t,
std::reference_wrapper<const typename HashIndex::value_type>,
std::greater<std::size_t>
> group_sizes(const HashIndex& i)
{
decltype(group_sizes(i)) res;
for(auto it=i.begin(),end=i.end();it!=end;){
auto next=i.equal_range(*it).second;
res.emplace((std::size_t)std::distance(it,next),*it);
it=next;
}
return res;
}
struct instrumented_string:std::string
{
using std::string::string;
static void reset_nums()
{
num_hashes=0;
num_eqs=0;
}
static std::size_t num_hashes,num_eqs;
};
std::size_t instrumented_string::num_hashes=0;
std::size_t instrumented_string::num_eqs=0;
bool operator==(const instrumented_string& x,const instrumented_string& y)
{
++instrumented_string::num_eqs;
return static_cast<std::string>(x)==y;
}
std::size_t hash_value(const instrumented_string& x)
{
++instrumented_string::num_hashes;
return boost::hash<std::string>{}(x);
}
using namespace boost::multi_index;
using container=multi_index_container<
instrumented_string,
indexed_by<
hashed_non_unique<identity<instrumented_string>>
>
>;
int main()
{
auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
container c;
for(auto& v:values){
for(auto i=100*std::strlen(v);i--;)c.insert(v);
}
instrumented_string::reset_nums();
auto gs=group_sizes(c);
for(const auto& g:gs){
std::cout<<g.first<<": "<<g.second.get()<<"\n";
}
std::cout<<"# hashes: "<<instrumented_string::num_hashes<<"\n";
std::cout<<"# eqs: "<<instrumented_string::num_eqs<<"\n";
}
Output
800: Subhamoy
600: Bjarne
400: Jack
300: Tom
300: Leo
# hashes: 5
# eqs: 5
So, for a container with 2,400 elements, invoking group_sizes
has resulted in just 5 hash calculations and 5 equality comparisons (plus ~2,400 iterator increments, of course).
If you really want to get rid of hashes, the following can do:
#include <boost/multi_index_container.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/identity.hpp>
#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <map>
template<typename HashIndex>
struct internal_reference
{
const HashIndex& i;
const typename HashIndex::value_type& r;
std::size_t buc;
};
template<typename HashIndex>
struct internal_reference_equal_to
{
bool operator()(
const typename HashIndex::value_type& x,
const internal_reference<HashIndex>& y)const
{
return
std::addressof(x)==std::addressof(y.r)||
y.i.key_eq()(y.i.key_extractor()(x),y.i.key_extractor()(y.r));
}
bool operator()(
const internal_reference<HashIndex>& x,
const typename HashIndex::value_type& y)const
{
return (*this)(y,x);
}
};
template<typename HashIndex>
struct internal_reference_hash
{
std::size_t operator()(const internal_reference<HashIndex>& x)const
{
return x.buc;
}
};
template<typename HashIndex>
std::multimap<
std::size_t,
std::reference_wrapper<const typename HashIndex::value_type>,
std::greater<std::size_t>
> group_sizes(const HashIndex& i)
{
decltype(group_sizes(i)) res;
for(std::size_t buc=0,buc_count=i.bucket_count();buc<buc_count;++buc){
for(auto it=i.begin(buc),end=i.end(buc);it!=end;){
auto p=i.equal_range(
internal_reference<HashIndex>{i,*it,buc},
internal_reference_hash<HashIndex>{},
internal_reference_equal_to<HashIndex>{});
std::size_t dist=0;
auto next=it;
while(p.first!=p.second){
++p.first;
++dist;
++next;
}
res.emplace(dist,*it);
it=next;
}
}
return res;
}
struct instrumented_string:std::string
{
using std::string::string;
static void reset_nums()
{
num_hashes=0;
num_eqs=0;
}
static std::size_t num_hashes,num_eqs;
};
std::size_t instrumented_string::num_hashes=0;
std::size_t instrumented_string::num_eqs=0;
bool operator==(const instrumented_string& x,const instrumented_string& y)
{
++instrumented_string::num_eqs;
return static_cast<std::string>(x)==y;
}
std::size_t hash_value(const instrumented_string& x)
{
++instrumented_string::num_hashes;
return boost::hash<std::string>{}(x);
}
using namespace boost::multi_index;
using container=multi_index_container<
instrumented_string,
indexed_by<
hashed_non_unique<identity<instrumented_string>>
>
>;
int main()
{
auto values={"Tom","Jack","Leo","Bjarne","Subhamoy"};
container c;
for(auto& v:values){
for(auto i=100*std::strlen(v);i--;)c.insert(v);
}
instrumented_string::reset_nums();
auto gs=group_sizes(c);
for(const auto& g:gs){
std::cout<<g.first<<": "<<g.second.get()<<"\n";
}
std::cout<<"# hashes: "<<instrumented_string::num_hashes<<"\n";
std::cout<<"# eqs: "<<instrumented_string::num_eqs<<"\n";
}
Output
800: Subhamoy
600: Bjarne
400: Jack
300: Tom
300: Leo
# hashes: 0
# eqs: 0
But please bear in mind this version of group_sizes
exploits the undocumented fact that elements with hash value h
get placed in the bucket h%bucket_count()
(or, put another way, internal_reference<HashIndex>
hashing is technically not a conformant compatible extension of the index hash function).