c++ · unicode · character · codecvt

Index and histogram of Unicode characters in input using C++


Count the occurrences of each symbol and the locations where it appears in the text, word, or line. I have a list of words like the following, in many languages.

What I am trying to do is count the occurrences of each character and record the positions in the text where each one appears (or most commonly appears). If it is also possible to count the typical number of syllables, that would be helpful too.

sommige
disa
بَعْض - ba'th
mi qani - մի քանի
bəzi
batzuk
nyeykі/nyeykaya/nyeykaye/nyeykіya - нейкі/нейкая/нейкае/нейкія
kisu - কিসু
afouhe - بعض
neki
alguns
njakoj - някой
一些
algú/alguns/alguna/algunes
neki
někteří
nogle
berekhey āz - برخی از
een paar
kam - كام
some
iuj
mõned
berekhey āz - برخی از
ilan
joitakin
sommige
certains
algúns
ramdenime - რამდენიმე
einige
peripou - περίπου
keṭelāk - કેટલાક 
wasu
kèk
khemeh - כמה
kuch - कुछ
néhány
sumir
beberapa
roinnt
alcuni
ikutsu ka no - いくつかの
kelavu
មួយចំនួន
조금 - jo geum
هەندێک
aliquis
daži
keli
nekoi - некои
misy
beberapa
ഏതാനും
xi
yī xiē  - 一些
kaahi - कांही
neki
shwiya - بعض
kehi - केही
enkelte
gari
berekhey āz - برخی از
b'eda - بعضی
kilka
ਕਈ
alguns
câţiva/câteva
некоторые - nekotorыe

some
neki - неки
samahara - සමහර
niektorí
nekaj
algunos
baadhi
några
ilan
yakchand - якчанд
konjam - கொஞ்சம்
yan
konni - కొన్ని
บาง - baang
bazı
dejakі - деякі
chened - چند
ba'zi, qandaydir
một số
rhai
עטלעכע
die
okumbalwa

This is the current code; sehe made it work with Unicode:

//#define PREFER_BOOST
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <istream>
#include <algorithm>
#include <iterator>
#include <clocale>   // std::setlocale
#include <cwctype>   // std::iswspace, std::towlower
#include <locale>    // std::locale
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif

using namespace std;

// Histogram shared by every Counter: maps a (lower-cased) wide character to
// the number of times it has been seen so far.
std::map<wchar_t, int> letterCount;

// Function object for std::for_each: tallies one wide character per call
// into the global letterCount, skipping whitespace.
struct Counter
{
    // item: the next character read from the wide input stream.
    void operator()(wchar_t  item) 
    { 
        // Use the wide-character classification functions. The narrow
        // std::isspace(int)/std::tolower(int) are only defined for values
        // representable as unsigned char (or EOF), so passing arbitrary
        // Unicode code points to them is undefined behaviour.
        const std::wint_t ch = static_cast<std::wint_t>(item);
        if ( !std::iswspace(ch) )
            ++letterCount[static_cast<wchar_t>(std::towlower(ch))]; //remove towlower if you want case-sensitive solution!
    }
};

int main()
{
    std::setlocale(LC_ALL, "en_US.UTF-8");
    wifstream input("input.txt");

#ifdef PREFER_BOOST 
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8"); 
#else
    std::locale loc("en_US.UTF-8");
#endif
    input.imbue(loc);
    wcout.imbue(loc);

    istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        wcout << it->first <<" : "<< it->second << endl;
    }
}

This was my original code:

 #include <iostream>
  #include <cctype>
 #include <fstream>
#include <string>
 #include <map>
  #include <istream>
   #include <vector>
 #include <list>
 #include <algorithm>
#include <iterator>


using namespace std;
 // ctype<char> facet whose classification table marks only the ASCII letters
 // A-Z and a-z as alphabetic and every other character as whitespace, so
 // formatted extraction (operator>>) on a stream imbued with it skips
 // everything that is not a letter.
 struct letter_only: std::ctype<char> 
 {
    letter_only(): std::ctype<char>(get_table()) {}

    // Returns a pointer to the classification table (table_size entries).
    // The table is built once and lives until program exit.
    static std::ctype_base::mask const* get_table()
    {
       static const std::vector<std::ctype_base::mask> rc(make_table());
       return &rc[0];
    }

 private:
    // Builds the table: everything is whitespace except the two letter ranges.
    static std::vector<std::ctype_base::mask> make_table()
    {
       std::vector<std::ctype_base::mask>
             table(std::ctype<char>::table_size, std::ctype_base::space);

       // Fill the two ranges separately: a single 'A'..'z' fill would also
       // mark the punctuation between 'Z' and 'a' ([ \ ] ^ _ `) as letters.
       std::fill(table.begin() + 'A', table.begin() + 'Z' + 1, std::ctype_base::alpha);
       std::fill(table.begin() + 'a', table.begin() + 'z' + 1, std::ctype_base::alpha);
       return table;
    }
 };

// Function object for std::for_each: tallies each non-whitespace character
// (folded to lower case) in its own letterCount map. std::for_each returns the
// functor, and the conversion operator below hands the finished map to the
// caller.
struct Counter
{
    std::map<char, int> letterCount;

    // item: the next character of the input.
    void operator()(char  item) 
    { 
       // <cctype> functions require a value representable as unsigned char;
       // a plain char may be negative, so convert first.
       const unsigned char ch = static_cast<unsigned char>(item);

       // The original compared the character against the ctype_base::space
       // bitmask constant, which is not a whitespace test.
       if ( !std::isspace(ch) )
         ++letterCount[static_cast<char>(std::tolower(ch))]; //remove tolower if you want case-sensitive solution!
    }

    // Yields a copy of the histogram collected so far.
    operator std::map<char, int>() { return letterCount ; }
};

int main()
{
     ifstream input;
     input.imbue(std::locale(std::locale(), new letter_only())); //enable reading only leters only!
     input.open("T");
     istream_iterator<char> start(input);
     istream_iterator<char> end;
     std::map<char, int> letterCount = std::for_each(start, end, Counter());
     for (std::map<char, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
     {
          cout << it->first <<" : "<< it->second << endl;
     }
 }

Example of the output I am trying to get:

к : 10 (2,5) (1,5,8) (2,7) (1,3,5)

That is: the letter that was found (к), then the number of times it occurred (10), then the positions within each word where it was found, as mentioned above.


Solution

  • Here's what I got, and it seems to work quite well on my machine (see note 1).

    //#define PREFER_BOOST
    #include <iostream>
    #include <fstream>
    #include <string>
    #include <map>
    #include <istream>
    #include <algorithm>
    #include <iterator>
    #include <clocale>   // std::setlocale
    #include <cwctype>   // std::iswspace, std::towlower
    #include <locale>    // std::locale
    #ifdef PREFER_BOOST
    #include <boost/locale.hpp>
    #endif
    
    using namespace std;
    
    // Histogram shared by every Counter: maps a (lower-cased) wide character
    // to the number of times it has been seen so far.
    std::map<wchar_t, int> letterCount;

    // Function object for std::for_each: tallies one wide character per call
    // into the global letterCount, skipping whitespace.
    struct Counter
    {
        // item: the next character read from the wide input stream.
        void operator()(wchar_t  item) 
        { 
            // Use the wide-character classification functions. The narrow
            // std::isspace(int)/std::tolower(int) are only defined for values
            // representable as unsigned char (or EOF), so passing arbitrary
            // Unicode code points to them is undefined behaviour.
            const std::wint_t ch = static_cast<std::wint_t>(item);
            if ( !std::iswspace(ch) )
                ++letterCount[static_cast<wchar_t>(std::towlower(ch))]; //remove towlower if you want case-sensitive solution!
        }
    };
    
    int main()
    {
        std::setlocale(LC_ALL, "en_US.UTF-8");
        wifstream input("input.txt");
    
    #ifdef PREFER_BOOST 
        boost::locale::generator gen;
        std::locale loc = gen("en_US.UTF-8"); 
    #else
        std::locale loc("en_US.UTF-8");
    #endif
        input.imbue(loc);
        wcout.imbue(loc);
    
        istreambuf_iterator<wchar_t> start(input), end;
        std::for_each(start, end, Counter());
    
        for (std::map<wchar_t, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
        {
            wcout << it->first <<" : "<< it->second << endl;
        }
    }
    

    If you prefer the boost locale library, you need to link to boost_system, boost_locale and boost_thread; I didn't see any noticeable difference in behaviour

    Output:

    ' : 3 , : 1 - : 32 / : 10 a : 67 b : 16 c : 7
    d : 12 e : 61 f : 1 g : 16 h : 17 i : 46 j : 8
    k : 41 l : 19 m : 19 n : 47 o : 20 p : 5 q : 3
    r : 18 s : 21 t : 12 u : 21 v : 3 w : 3 x : 2
    y : 21 z : 7 á : 1 â : 2 å : 1 è : 1 é : 1
    í : 2 õ : 1 ú : 2 ā : 4 ē : 1 ě : 1 ī : 1
    ı : 1 ř : 1 ţ : 1 ž : 1 ə : 1 ί : 1 ε : 1
    ο : 1 π : 2 ρ : 1 υ : 1 а : 3 д : 2 е : 10
    и : 2 й : 5 к : 10 н : 9 о : 4 р : 1 т : 1
    ч : 1 ы : 2 я : 5 і : 6 ա : 1 ի : 2 մ : 1
    ն : 1 ք : 1 ה : 1 ט : 1 כ : 2 ל : 1 מ : 1
    ע : 3 ا : 4 ب : 7 خ : 3 د : 2 ر : 3 ز : 3
    ض : 4 ع : 4 ك : 1 م : 1 ن : 2 ه : 1 َ : 1
     : 1 چ : 1 ک : 1 ی : 4 ێ : 1 ە : 1 ं : 1
    ी : 2 ु : 1 े : 1 ক : 1 ি : 1 ু : 1 ਕ : 1
    ક : 2 ટ : 1 ે : 1 க : 1 ் : 2 క : 1 ి : 1
     : 1 ഏ : 1 ു : 1 ර : 1 ස : 1 ง : 1 า : 1
    ა : 1 დ : 1 ე : 2 ი : 1 მ : 2 ნ : 1 რ : 1
    ច : 1 ន : 2 ម : 1 យ : 1 ួ : 2 ំ : 1 ố : 1
    ộ : 1 い : 1 か : 1 く : 1 の : 1 一 : 2 些 : 2
    금 : 1
    

    1. I might not get all characters displayed, but it might be due to my terminal font.