c++group-byboost-multi-index

Equivalent of GROUP BY COUNT using boost multi-index


I'm using boost::multi_index_container to perform some operations on satellite data.

I have a data structure that looks like this:

struct SatData {
  std::string sat_system;
  std::string band;
  int index;
  uint64_t time;
  double data;

  SatData(const std::string& ss,
          const std::string& b,
          const int& i,
          const uint64_t& t,
          const double& d) : sat_system(ss), band(b), index(i), time(t), data(d) {}
};


using SatDataset =
multi_index_container <SatData, indexed_by
                        <                          
                          ordered_unique // given a specific satellite and a time, the measurement is unique
                          <
                            composite_key
                            <
                              SatData,
                              member<SatData, std::string, &SatData::sat_system>,
                              member<SatData, std::string, &SatData::band>,
                              member<SatData, int, &SatData::index>,
                              member<SatData, uint64_t, &SatData::time>
                            > // composite key
                          > // ordered_not_unique
                      
                        > // indexed_by
                      >; // multi_index_container

I would like to retrieve, for each sat_system+band+index, the number of measurements, so for example, if I had a table like this:

GPS     | L1  | 1 | 10
GPS     | L1  | 1 | 11
GPS     | L1  | 1 | 12
GPS     | L2  | 1 | 10
GPS     | L2  | 1 | 11
GPS     | L2  | 1 | 12
GPS     | L2  | 4 | 11
GPS     | L2  | 4 | 12
GALILEO | E5b | 2 | 10
GLONASS | G1  | 1 | 10
GLONASS | G1  | 1 | 11
GLONASS | G1  | 2 | 10

the answer would be:

GPS     | L1  | 1 | 3
GPS     | L2  | 1 | 2
GPS     | L2  | 1 | 1
GPS     | L2  | 4 | 2
GALILEO | E5b | 2 | 1
GLONASS | G1  | 1 | 3

I'm a bit rusty but I think a pseudo-SQL equivalent would be something like:

SELECT sat_stystem, band, index, COUNT(time)
GROUP_BY sat_stystem, band, index

I have found this related question: How to get the distinct count of first key in boost::multi_index_container with composite key

but it is specific for one search query, while I would like to have a result for the whole table instead.


Solution

  • There's no prefab functionality for that, but you can basically do it yourself the same way a SQL planner would do internally:

    Live Coliru Demo

    #include <boost/multi_index_container.hpp>
    #include <boost/multi_index/composite_key.hpp>
    #include <boost/multi_index/member.hpp>
    #include <boost/multi_index/ordered_index.hpp>
    #include <string>
    
    using namespace boost::multi_index;
    
    struct SatData {
        std::string sat_system;
        std::string band;
        int index;
        uint64_t time;
        double data;
        
        SatData(
            const std::string& ss,
            const std::string& b,
            const int& i,
            const uint64_t& t,
            const double& d) : 
            sat_system(ss), band(b), index(i), time(t), data(d) {}
    };
    
    
    using SatDataset =
    multi_index_container<
        SatData, indexed_by<                          
            ordered_unique< // given a specific satellite and a time, the measurement is unique
                composite_key<
                    SatData,
                    member<SatData, std::string, &SatData::sat_system>,
                    member<SatData, std::string, &SatData::band>,
                    member<SatData, int, &SatData::index>,
                    member<SatData, uint64_t, &SatData::time>
                > // composite key
            > // ordered_not_unique
        > // indexed_by
    >; // multi_index_container
    
    #include <iostream>
                          
    int main()
    {
      SatDataset s = {
        {"GPS"     , "L1"  , 1 , 10, 0.112},
        {"GPS"     , "L1"  , 1 , 11, 0.201},
        {"GPS"     , "L1"  , 1 , 12, 0.100},
        {"GPS"     , "L2"  , 1 , 10, 0.098},
        {"GPS"     , "L2"  , 1 , 11, 0.134},
        {"GPS"     , "L2"  , 1 , 12, 0.167},
        {"GPS"     , "L2"  , 4 , 11, 0.199},
        {"GPS"     , "L2"  , 4 , 12, 0.204},
        {"GALILEO" , "E5b" , 2 , 10, 0.056},
        {"GLONASS" , "G1"  , 1 , 10, 0.123},
        {"GLONASS" , "G1"  , 1 , 11, 0.222},
        {"GLONASS" , "G1"  , 2 , 10, 0.115},
      };
      
      for(auto first = s.begin(), last = s.end(); first != last; ) {
        auto next = s.upper_bound(std::make_tuple(
            first->sat_system,
            first->band,
            first->index));
        std::cout
            << first->sat_system << "\t|"
            << first->band << "\t|"
            << first->index << "\t|"
            << std::distance(first, next) << "\n";
        first = next;
      }
    }
    

    Output

    GALILEO |E5b |2 |1
    GLONASS |G1  |1 |2
    GLONASS |G1  |2 |1
    GPS     |L1  |1 |3
    GPS     |L2  |1 |3
    GPS     |L2  |4 |2
    

    Also, if you're going to do many of these operations, considering using ranked indices:

    Live Coliru Demo

    #include <boost/multi_index_container.hpp>
    #include <boost/multi_index/composite_key.hpp>
    #include <boost/multi_index/member.hpp>
    #include <boost/multi_index/ranked_index.hpp>
    #include <string>
    
    using namespace boost::multi_index;
    
    struct SatData {
        std::string sat_system;
        std::string band;
        int index;
        uint64_t time;
        double data;
        
        SatData(
            const std::string& ss,
            const std::string& b,
            const int& i,
            const uint64_t& t,
            const double& d) : 
            sat_system(ss), band(b), index(i), time(t), data(d) {}
    };
    
    
    using SatDataset =
    multi_index_container<
        SatData, indexed_by<                          
            ranked_unique< // given a specific satellite and a time, the measurement is unique
                composite_key<
                    SatData,
                    member<SatData, std::string, &SatData::sat_system>,
                    member<SatData, std::string, &SatData::band>,
                    member<SatData, int, &SatData::index>,
                    member<SatData, uint64_t, &SatData::time>
                > // composite key
            > // ordered_not_unique
        > // indexed_by
    >; // multi_index_container
    
    #include <iostream>
                          
    int main()
    {
      SatDataset s = {
        {"GPS"     , "L1"  , 1 , 10, 0.112},
        {"GPS"     , "L1"  , 1 , 11, 0.201},
        {"GPS"     , "L1"  , 1 , 12, 0.100},
        {"GPS"     , "L2"  , 1 , 10, 0.098},
        {"GPS"     , "L2"  , 1 , 11, 0.134},
        {"GPS"     , "L2"  , 1 , 12, 0.167},
        {"GPS"     , "L2"  , 4 , 11, 0.199},
        {"GPS"     , "L2"  , 4 , 12, 0.204},
        {"GALILEO" , "E5b" , 2 , 10, 0.056},
        {"GLONASS" , "G1"  , 1 , 10, 0.123},
        {"GLONASS" , "G1"  , 1 , 11, 0.222},
        {"GLONASS" , "G1"  , 2 , 10, 0.115},
      };
      
      std::size_t rank_first = 0;  
      for(auto first = s.begin(), last = s.end(); first != last; ) {
        auto rank_next = s.upper_bound_rank(std::make_tuple(
            first->sat_system,
            first->band,
            first->index));
        std::cout
            << first->sat_system << "\t|"
            << first->band << "\t|"
            << first->index << "\t|"
            << rank_next - rank_first << "\n";
        rank_first = rank_next;
        first = s.nth(rank_first);
      }
    }
    

    You need to measure in order to know which approach is faster, but in principle ranked indices will get the group sizes (rank_next - rank_first) in logarithmic time, whereas ordered indices are linear on the number of elements in each group.

    Updated Aug 30

    As per the OP's additional question, getting other calculated values, such as measurement averaging, is straightworfward once we have the range on which to do the calculation:

    Live Coliru Demo

    #include <boost/multi_index_container.hpp>
    #include <boost/multi_index/composite_key.hpp>
    #include <boost/multi_index/member.hpp>
    #include <boost/multi_index/ordered_index.hpp>
    #include <string>
    
    using namespace boost::multi_index;
    
    struct SatData {
        std::string sat_system;
        std::string band;
        int index;
        uint64_t time;
        double data;
        
        SatData(
            const std::string& ss,
            const std::string& b,
            const int& i,
            const uint64_t& t,
            const double& d) : 
            sat_system(ss), band(b), index(i), time(t), data(d) {}
    };
    
    
    using SatDataset =
    multi_index_container<
        SatData, indexed_by<                          
            ordered_unique< // given a specific satellite and a time, the measurement is unique
                composite_key<
                    SatData,
                    member<SatData, std::string, &SatData::sat_system>,
                    member<SatData, std::string, &SatData::band>,
                    member<SatData, int, &SatData::index>,
                    member<SatData, uint64_t, &SatData::time>
                > // composite key
            > // ordered_not_unique
        > // indexed_by
    >; // multi_index_container
    
    #include <iostream>
    #include <numeric>
                          
    int main()
    {
      SatDataset s = {
        {"GPS"     , "L1"  , 1 , 10, 0.112},
        {"GPS"     , "L1"  , 1 , 11, 0.201},
        {"GPS"     , "L1"  , 1 , 12, 0.100},
        {"GPS"     , "L2"  , 1 , 10, 0.098},
        {"GPS"     , "L2"  , 1 , 11, 0.134},
        {"GPS"     , "L2"  , 1 , 12, 0.167},
        {"GPS"     , "L2"  , 4 , 11, 0.199},
        {"GPS"     , "L2"  , 4 , 12, 0.204},
        {"GALILEO" , "E5b" , 2 , 10, 0.056},
        {"GLONASS" , "G1"  , 1 , 10, 0.123},
        {"GLONASS" , "G1"  , 1 , 11, 0.222},
        {"GLONASS" , "G1"  , 2 , 10, 0.115},
      };
      
      for(auto first = s.begin(), last = s.end(); first != last; ) {
        auto next = s.upper_bound(std::make_tuple(
            first->sat_system,
            first->band,
            first->index));
        auto dist = std::distance(first, next);
        auto mean = std::accumulate(
            first, next, 0.0,
            [](double acc, const SatData& x) { return acc + x.data; }) / dist;
        std::cout
            << first->sat_system << "\t|"
            << first->band << "\t|"
            << first->index << "\t|"
            << dist << "\t|"
            << mean << "\n";
        first = next;
      }
    }
    

    Output

    GALILEO |E5b    |2  |1  |0.056
    GLONASS |G1     |1  |2  |0.1725
    GLONASS |G1     |2  |1  |0.115
    GPS     |L1     |1  |3  |0.137667
    GPS     |L2     |1  |3  |0.133
    GPS     |L2     |4  |2  |0.2015