c++boost-dynamic-bitsetboost-bimap

Insert boost::dynamic_bitset<> into boost::bimap


I am trying to insert boost::dynamic_bitset<> into boost::bimap. However, it is very slow as compared to inserting integers or strings. Minimal Example is given, the code of which is shown below

// Example program
#include <iostream>
#include <string>
#include <boost/bimap.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>


namespace std {
    template <typename Block, typename Alloc> struct hash<boost::dynamic_bitset<Block, Alloc> > {
        size_t operator()(boost::dynamic_bitset<Block, Alloc> const& bs) const {
            size_t seed = boost::hash_value(bs.size());

            std::vector<Block> blocks(bs.num_blocks());
            boost::hash_range(seed, blocks.begin(), blocks.end());

            return seed;
        }
    };
}


namespace bimaps = boost::bimaps;
    typedef boost::dynamic_bitset<> Bitset;
    typedef boost::bimap<
    bimaps::unordered_set_of<Bitset, std::hash<Bitset>>,
    bimaps::unordered_multiset_of<Bitset, std::hash<Bitset> > > bimap_reference;
    typedef bimap_reference::value_type position;
    bimap_reference reference_index_vector;


class bitsets{
public:
    void encode_string(
            std::string &sequence_content,
            std::string &binary_sequence);

    void encode_positions(
                std::string &pos,
                std::string &pos_binary_sequence);

};




int main()
{
    bitsets b;
    std::string binary_sequence, decoded_string ;
    std::string pos_binary_sequence, decoded_positions;
    std::string sequence_content = "ADGFGGFFAAACGFGCAAFGCAAFGCAFGCAAFGCAFGCAAGGGCAGDDDCGGAFFGCA";

    for(size_t i = 0; i < sequence_content.size(); i++){
        std::string substr = sequence_content.substr(i,15);
        b.encode_string(substr, binary_sequence);
        boost::dynamic_bitset<> bits = boost::dynamic_bitset<> (binary_sequence);
        std::string pos = std::to_string(i);
        b.encode_positions(pos, pos_binary_sequence);
        boost::dynamic_bitset<> pos_bits = boost::dynamic_bitset<> (pos_binary_sequence);
        reference_index_vector.insert(position(pos_bits, bits));
        binary_sequence.clear();
        pos_binary_sequence.clear();
        i += 14;
}
    for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
            iter != iend; ++iter ) {
        std::cout << iter->left << " <--> "<< iter->right <<std::endl;
    }
}



void bitsets::encode_string(std::string &substr, std::string &binary_sequence){
    for (size_t i = 0; i < substr.size(); ++i){
        switch (substr[i]){
        case 'A':
        case 'a':
            binary_sequence += "00";
            break;
        case 'C':
        case 'c':
            binary_sequence += "01";
            break;
        case 'D':
        case 'd':
            binary_sequence += "10";
            break;
        case 'G':
        case 'g':
            binary_sequence += "110";
            break;
        case 'F':
        case 'f':
            binary_sequence += "110";
            break;
        }
    }
}

void bitsets::encode_positions(std::string &pos, std::string &pos_binary_sequence){
    for(size_t i = 0; i < pos.size(); ++i){
        switch (pos[i]){
        case '0':
            pos_binary_sequence += "1101";
            break;
        case '1':
            pos_binary_sequence += "100";
            break;
        case '2':
            pos_binary_sequence += "1110";
            break;
        case '3':
            pos_binary_sequence += "1100";
            break;
        case '4':
            pos_binary_sequence += "101";
            break;
        case '5':
            pos_binary_sequence += "000";
            break;
        case '6':
            pos_binary_sequence += "001";
            break;
        case '7':
            pos_binary_sequence += "011";
            break;
        case '8':
            pos_binary_sequence += "1111";
            break;
        case '9':
            pos_binary_sequence += "010";
            break;
        }
    }
}

For a string of 5 million characters long it takes a lot of time (approx 5 mins) intead of a few seconds for integers/strings to insert into the bimap. Why boost::dynamic_bitset<> is slow and how can I improve it.


Solution

  • I refactored a little, fixed the hash function (I think - pls check it) and ran the test for a sequence of 5,000,000 random thingumyjigs (aleles?)

    With a release build, the test ran in 900 milliseconds - less than one second.

    Here's the code. I make no claims about the correctness or efficiency of the algorithm. I suspect there is room for improvement.

    // Example program
    #include <iostream>
    #include <string>
    #include <boost/bimap.hpp>
    #include <boost/dynamic_bitset.hpp>
    #include <boost/bimap/unordered_set_of.hpp>
    #include <boost/bimap/unordered_multiset_of.hpp>
    #include <random>
    #include <chrono>
    #include <thread>
    
    namespace std {
        template <typename Block, typename Alloc>
        struct hash<boost::dynamic_bitset<Block, Alloc> > {
    
            using bitset_type = boost::dynamic_bitset<Block, Alloc>;
            using block_type = typename bitset_type::block_type ;
    
            size_t operator()(boost::dynamic_bitset<Block, Alloc> const& bs) const
            {
                thread_local static std::vector<block_type> block_data;
                auto blocks = bs.num_blocks();
                block_data.assign(blocks, 0);
                to_block_range(bs, block_data.begin());
                return boost::hash<std::vector<block_type>>()(block_data);
            }
        };
    }
    
    
    
    namespace bimaps = boost::bimaps;
    typedef boost::dynamic_bitset<> Bitset;
    typedef boost::bimap<
            bimaps::unordered_set_of<Bitset, std::hash<Bitset>>,
            bimaps::unordered_multiset_of<Bitset, std::hash<Bitset> > > bimap_reference;
    typedef bimap_reference::value_type position;
    
    
    class bitsets{
    public:
        void encode_string(
                std::string &sequence_content,
                std::string &binary_sequence);
    
        void encode_positions(
                std::string &pos,
                std::string &pos_binary_sequence);
    
    };
    
    bimap_reference test(const std::string& sequence_content)
    {
        bimap_reference reference_index_vector;
    
        bitsets b;
        std::string binary_sequence, decoded_string ;
        std::string pos_binary_sequence, decoded_positions;
    
        for(size_t i = 0; i < sequence_content.size(); i++){
            std::string substr = sequence_content.substr(i,15);
            b.encode_string(substr, binary_sequence);
            boost::dynamic_bitset<> bits = boost::dynamic_bitset<> (binary_sequence);
            std::string pos = std::to_string(i);
            b.encode_positions(pos, pos_binary_sequence);
            boost::dynamic_bitset<> pos_bits = boost::dynamic_bitset<> (pos_binary_sequence);
            reference_index_vector.insert(position(pos_bits, bits));
            binary_sequence.clear();
            pos_binary_sequence.clear();
            i += 14;
        }
    
        return reference_index_vector;
    }
    
    
    int main()
    {
        static const std::string valid_chars = "ADGF";
        std::random_device rd;
        auto eng = std::default_random_engine(rd());
        auto dist = std::uniform_int_distribution<int>(0, valid_chars.size() - 1);
    
        static const auto sequence_size = std::size_t(5'000'000);
    
        auto sequence_content = std::string(sequence_size, ' ');
        std::generate(sequence_content.begin(), sequence_content.end(), [&]{ return valid_chars[dist(eng)]; });
    
        auto start_time = std::chrono::system_clock::now();
        auto reference_index_vector = test(sequence_content);
    
        auto end_time = std::chrono::system_clock::now();
    
        std::cout
                << "test took: "
                << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count()
                << " milliseconds\n";
        std::this_thread::sleep_for(std::chrono::seconds(5));
    
    
        for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
             iter != iend; ++iter ) {
            std::cout << iter->left << " <--> "<< iter->right <<std::endl;
        }
    }
    
    
    
    void bitsets::encode_string(std::string &substr, std::string &binary_sequence){
        for (size_t i = 0; i < substr.size(); ++i){
            switch (substr[i]){
                case 'A':
                case 'a':
                    binary_sequence += "00";
                    break;
                case 'C':
                case 'c':
                    binary_sequence += "01";
                    break;
                case 'D':
                case 'd':
                    binary_sequence += "10";
                    break;
                case 'G':
                case 'g':
                    binary_sequence += "110";
                    break;
                case 'F':
                case 'f':
                    binary_sequence += "110";
                    break;
            }
        }
    }
    
    void bitsets::encode_positions(std::string &pos, std::string &pos_binary_sequence){
        for(size_t i = 0; i < pos.size(); ++i){
            switch (pos[i]){
                case '0':
                    pos_binary_sequence += "1101";
                    break;
                case '1':
                    pos_binary_sequence += "100";
                    break;
                case '2':
                    pos_binary_sequence += "1110";
                    break;
                case '3':
                    pos_binary_sequence += "1100";
                    break;
                case '4':
                    pos_binary_sequence += "101";
                    break;
                case '5':
                    pos_binary_sequence += "000";
                    break;
                case '6':
                    pos_binary_sequence += "001";
                    break;
                case '7':
                    pos_binary_sequence += "011";
                    break;
                case '8':
                    pos_binary_sequence += "1111";
                    break;
                case '9':
                    pos_binary_sequence += "010";
                    break;
            }
        }
    }