I am trying to insert boost::dynamic_bitset<>
into boost::bimap
. However, it is very slow as compared to inserting integers or strings. Minimal Example is given, the code of which is shown below
// Example program
#include <iostream>
#include <string>
#include <boost/bimap.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
namespace std {
template <typename Block, typename Alloc> struct hash<boost::dynamic_bitset<Block, Alloc> > {
size_t operator()(boost::dynamic_bitset<Block, Alloc> const& bs) const {
size_t seed = boost::hash_value(bs.size());
std::vector<Block> blocks(bs.num_blocks());
boost::hash_range(seed, blocks.begin(), blocks.end());
return seed;
}
};
}
namespace bimaps = boost::bimaps;
typedef boost::dynamic_bitset<> Bitset;
typedef boost::bimap<
bimaps::unordered_set_of<Bitset, std::hash<Bitset>>,
bimaps::unordered_multiset_of<Bitset, std::hash<Bitset> > > bimap_reference;
typedef bimap_reference::value_type position;
bimap_reference reference_index_vector;
class bitsets{
public:
void encode_string(
std::string &sequence_content,
std::string &binary_sequence);
void encode_positions(
std::string &pos,
std::string &pos_binary_sequence);
};
int main()
{
bitsets b;
std::string binary_sequence, decoded_string ;
std::string pos_binary_sequence, decoded_positions;
std::string sequence_content = "ADGFGGFFAAACGFGCAAFGCAAFGCAFGCAAFGCAFGCAAGGGCAGDDDCGGAFFGCA";
for(size_t i = 0; i < sequence_content.size(); i++){
std::string substr = sequence_content.substr(i,15);
b.encode_string(substr, binary_sequence);
boost::dynamic_bitset<> bits = boost::dynamic_bitset<> (binary_sequence);
std::string pos = std::to_string(i);
b.encode_positions(pos, pos_binary_sequence);
boost::dynamic_bitset<> pos_bits = boost::dynamic_bitset<> (pos_binary_sequence);
reference_index_vector.insert(position(pos_bits, bits));
binary_sequence.clear();
pos_binary_sequence.clear();
i += 14;
}
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
}
void bitsets::encode_string(std::string &substr, std::string &binary_sequence){
for (size_t i = 0; i < substr.size(); ++i){
switch (substr[i]){
case 'A':
case 'a':
binary_sequence += "00";
break;
case 'C':
case 'c':
binary_sequence += "01";
break;
case 'D':
case 'd':
binary_sequence += "10";
break;
case 'G':
case 'g':
binary_sequence += "110";
break;
case 'F':
case 'f':
binary_sequence += "110";
break;
}
}
}
void bitsets::encode_positions(std::string &pos, std::string &pos_binary_sequence){
for(size_t i = 0; i < pos.size(); ++i){
switch (pos[i]){
case '0':
pos_binary_sequence += "1101";
break;
case '1':
pos_binary_sequence += "100";
break;
case '2':
pos_binary_sequence += "1110";
break;
case '3':
pos_binary_sequence += "1100";
break;
case '4':
pos_binary_sequence += "101";
break;
case '5':
pos_binary_sequence += "000";
break;
case '6':
pos_binary_sequence += "001";
break;
case '7':
pos_binary_sequence += "011";
break;
case '8':
pos_binary_sequence += "1111";
break;
case '9':
pos_binary_sequence += "010";
break;
}
}
}
For a string of 5 million characters long it takes a lot of time (approx 5 mins) intead of a few seconds for integers/strings to insert into the bimap
. Why boost::dynamic_bitset<>
is slow and how can I improve it.
I refactored a little, fixed the hash function (I think - pls check it) and ran the test for a sequence of 5,000,000 random thingumyjigs (aleles?)
With a release build, the test ran in 900 milliseconds - less than one second.
Here's the code. I make no claims about the correctness or efficiency of the algorithm. I suspect there is room for improvement.
// Example program
#include <iostream>
#include <string>
#include <boost/bimap.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
#include <random>
#include <chrono>
#include <thread>
namespace std {
template <typename Block, typename Alloc>
struct hash<boost::dynamic_bitset<Block, Alloc> > {
using bitset_type = boost::dynamic_bitset<Block, Alloc>;
using block_type = typename bitset_type::block_type ;
size_t operator()(boost::dynamic_bitset<Block, Alloc> const& bs) const
{
thread_local static std::vector<block_type> block_data;
auto blocks = bs.num_blocks();
block_data.assign(blocks, 0);
to_block_range(bs, block_data.begin());
return boost::hash<std::vector<block_type>>()(block_data);
}
};
}
namespace bimaps = boost::bimaps;
typedef boost::dynamic_bitset<> Bitset;
typedef boost::bimap<
bimaps::unordered_set_of<Bitset, std::hash<Bitset>>,
bimaps::unordered_multiset_of<Bitset, std::hash<Bitset> > > bimap_reference;
typedef bimap_reference::value_type position;
class bitsets{
public:
void encode_string(
std::string &sequence_content,
std::string &binary_sequence);
void encode_positions(
std::string &pos,
std::string &pos_binary_sequence);
};
bimap_reference test(const std::string& sequence_content)
{
bimap_reference reference_index_vector;
bitsets b;
std::string binary_sequence, decoded_string ;
std::string pos_binary_sequence, decoded_positions;
for(size_t i = 0; i < sequence_content.size(); i++){
std::string substr = sequence_content.substr(i,15);
b.encode_string(substr, binary_sequence);
boost::dynamic_bitset<> bits = boost::dynamic_bitset<> (binary_sequence);
std::string pos = std::to_string(i);
b.encode_positions(pos, pos_binary_sequence);
boost::dynamic_bitset<> pos_bits = boost::dynamic_bitset<> (pos_binary_sequence);
reference_index_vector.insert(position(pos_bits, bits));
binary_sequence.clear();
pos_binary_sequence.clear();
i += 14;
}
return reference_index_vector;
}
int main()
{
static const std::string valid_chars = "ADGF";
std::random_device rd;
auto eng = std::default_random_engine(rd());
auto dist = std::uniform_int_distribution<int>(0, valid_chars.size() - 1);
static const auto sequence_size = std::size_t(5'000'000);
auto sequence_content = std::string(sequence_size, ' ');
std::generate(sequence_content.begin(), sequence_content.end(), [&]{ return valid_chars[dist(eng)]; });
auto start_time = std::chrono::system_clock::now();
auto reference_index_vector = test(sequence_content);
auto end_time = std::chrono::system_clock::now();
std::cout
<< "test took: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count()
<< " milliseconds\n";
std::this_thread::sleep_for(std::chrono::seconds(5));
for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
iter != iend; ++iter ) {
std::cout << iter->left << " <--> "<< iter->right <<std::endl;
}
}
void bitsets::encode_string(std::string &substr, std::string &binary_sequence){
for (size_t i = 0; i < substr.size(); ++i){
switch (substr[i]){
case 'A':
case 'a':
binary_sequence += "00";
break;
case 'C':
case 'c':
binary_sequence += "01";
break;
case 'D':
case 'd':
binary_sequence += "10";
break;
case 'G':
case 'g':
binary_sequence += "110";
break;
case 'F':
case 'f':
binary_sequence += "110";
break;
}
}
}
void bitsets::encode_positions(std::string &pos, std::string &pos_binary_sequence){
for(size_t i = 0; i < pos.size(); ++i){
switch (pos[i]){
case '0':
pos_binary_sequence += "1101";
break;
case '1':
pos_binary_sequence += "100";
break;
case '2':
pos_binary_sequence += "1110";
break;
case '3':
pos_binary_sequence += "1100";
break;
case '4':
pos_binary_sequence += "101";
break;
case '5':
pos_binary_sequence += "000";
break;
case '6':
pos_binary_sequence += "001";
break;
case '7':
pos_binary_sequence += "011";
break;
case '8':
pos_binary_sequence += "1111";
break;
case '9':
pos_binary_sequence += "010";
break;
}
}
}