c++unordered-multiset

C++ vector of strings into associative vector of ints


I'm having trouble converting a string vector with ~ 1.0000.0000 elements into an associative vector of integers.

Input:

// Example input: note that "a" occurs several times.
std::vector<std::string> s = { "a", "b", "a", "a", "c", "d", "a" };

Desired output:

// Expected result: equal strings share one id; ids are handed out in order of first appearance.
std::vector<int> i = { 0, 1, 0, 0, 2, 3, 0 };

I was thinking of an std::unordered_multiset as mentioned in Associative Array with Vector in C++ but I can't get it running.

The goal is to reduce the time it takes to convert C++ strings to R strings; the conversion is so much faster if I just use numbers.

Thank you for your help!

Edit:

That's how I tried to populate the set:

// Copy every element of s into the set, front to back.
for (const auto& element : s)
{
    set.insert(element);
}

Solution

  • This code will output your desired output for your given input. And it will process 1.000.000 strings of length 3 in 0.4s. So I think unordered_map is a viable choice.

    #include <chrono>
    #include <cstddef>
    #include <iostream>
    #include <random>
    #include <string>
    #include <unordered_map>
    #include <vector>
    
    // Generator function for creating a large number of random strings.
    //
    // Returns a vector holding `size` strings, each exactly `string_length`
    // characters long. Every character is produced by a uniform integer
    // distribution over the range 'a'..'z'.
    //
    // The random device, engine and distribution are function-local statics,
    // so they are created once and shared by all calls.
    std::vector<std::string> generate_strings(const std::size_t size, const std::size_t string_length)
    {
        static std::random_device rd{};
        static std::default_random_engine generator{ rd() };
        static std::uniform_int_distribution<int> distribution{ 'a', 'z' };

        std::vector<std::string> strings;
        // The final element count is known up front; reserving avoids the
        // repeated reallocations the vector would otherwise do while growing.
        strings.reserve(size);

        // One scratch buffer is overwritten in place for every generated string.
        std::string s(string_length, ' ');

        for (std::size_t n = 0; n < size; ++n)
        {
            for (char& ch : s)
            {
                ch = static_cast<char>(distribution(generator));
            }

            // Stores a copy; the buffer itself is reused in the next iteration.
            strings.push_back(s);
        }

        return strings;
    }
    
    // Demo / benchmark: assigns every distinct string an integer id in order
    // of first appearance, writes the id of each input string to `output`,
    // and prints how long the conversion loop took.
    int main()
    {
        std::vector<std::string> strings = generate_strings(1000000, 3);
        //std::vector<std::string> strings{ "a","b","a","a","c","d","a" };

        // Distinct string -> id assigned when the string was first seen.
        std::unordered_map<std::string, int> map;
        std::vector<int> output;

        // speed optimization, allocate enough room for answer
        // so output doesn't have to reallocate when growing.
        output.reserve(strings.size());

        auto start = std::chrono::high_resolution_clock::now();

        int id = 0;
        for (const auto& string : strings)
        {
            // One lookup serves both cases: on a hit the iterator already
            // points at the stored id, so no second hash lookup is needed.
            auto it = map.find(string);
            if (it == map.end())
            {
                // First occurrence: register the string under the next free id.
                it = map.emplace(string, id).first;
                ++id;
            }

            output.push_back(it->second);
        }

        // Convert the elapsed time to floating-point seconds and report it;
        // without this the measurement would be computed and thrown away.
        const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
        std::cout << "elapsed: " << elapsed.count() << " s\n";

        /*
        for (const auto& value : output)
        {
            std::cout << value << " ";
        }
        */

    }