c++c-strings

How do I remove repeated words from a string and only show it once with their wordcount


Basically, I have to show each word with their count but repeated words show up again in my program.

How do I remove them by using loops or should I use 2d arrays to store both the word and count?

#include <iostream>
#include <stdio.h>
#include <iomanip>
#include <cstring>
#include <conio.h>
#include <time.h>
using namespace std;

char* getstring();
void xyz(char*);
void tokenizing(char*);

int main()
{
    char* pa = getstring();
    xyz(pa);
    tokenizing(pa);

    _getch();
}

char* getstring()
{
    static char pa[100];
    cout << "Enter a paragraph: " << endl;
    cin.getline(pa, 1000, '#');

    return pa;
}
void xyz(char* pa)
{
    cout << pa << endl;
}
void tokenizing(char* pa)
{
    char sepa[] = " ,.\n\t";
    char* token;
    char* nexttoken;
    int size = strlen(pa);
    token = strtok_s(pa, sepa, &nexttoken);
    while (token != NULL) {
        int wordcount = 0;
        if (token != NULL) {
            int sizex = strlen(token);
            //char** fin;
            int j;
            for (int i = 0; i <= size; i++) {
                for (j = 0; j < sizex; j++) {
                    if (pa[i + j] != token[j]) {
                        break;
                    }
                }
                if (j == sizex) {
                    wordcount++;
                }
            }
            //for (int w = 0; w < size; w++)
            //fin[w] =  token;
            //cout << fin[w];

            cout << token;
            cout << " " << wordcount << "\n";
        }
        token = strtok_s(NULL, sepa, &nexttoken);
    }
}

This is the output I get:

enter image description here

I want to show, for example, the word "i" once with its count of 5, and then not show it again.


Solution

  • This post gives an example to save each word from your 'strtok' function into a vector of string. Then, use string.compare to have each word compared with word[0]. Those indexes match with word[0] are marked in an int array 'used'. The count of match equals to the number marks in the array used ('nused'). Those words of marked are then removed from the vector, and the remaining carries on to the next comparing process. The program ends when no word remained.

    You may write a word comparing function to replace 'str.compare(str2)', if you prefer not to use std::vector and std::string.

    #include <iostream>
    #include <string>
    #include <vector>
    #include<iomanip>
    #include<cstring>
     using namespace std;
          
     char* getstring();
     void xyz(char*);
     void tokenizing(char*);
     
     int main()
     {
        char* pa = getstring();
        xyz(pa);
        tokenizing(pa);
     }
    
     
    char* getstring()
    {
       static char pa[100] = "this is a test and is a test and is test.";
       return pa;
    }
    void xyz(char* pa)
    {
      cout << pa << endl;
    }
    void tokenizing(char* pa)
    {
       char sepa[] = " ,.\n\t";
       char* token;
       char* nexttoken;
       std::vector<std::string> word;
       int used[64];
       std::string tok;
       int nword = 0, nsize, nused;
       int size = strlen(pa);
       token = strtok_s(pa, sepa, &nexttoken);
       while (token)
       {
          word.push_back(token);
          ++nword;
          token = strtok_s(NULL, sepa, &nexttoken);
       }
       for (int i = 0; i<nword; i++) std::cout << word[i] << std::endl;
       std::cout << "total " << nword << " words.\n" << std::endl;
       nsize = nword;
       while (nsize > 0)
       {
           nused = 0;
           tok = word[0] ;
           used[nused++] = 0;
           for (int i=1; i<nsize; i++)
           {
               if ( tok.compare(word[i]) == 0 )
               {
                  used[nused++] = i; }
           }
           std::cout  << tok << " : " << nused << std::endl;
           for (int i=nused-1; i>=0; --i)
           {
              for (int j=used[i]; j<(nsize+i-nused); j++) word[j] = word[j+1];
           }
           nsize -= nused;
       }
    }
    

    Notice that the removal of used words has to do in backward order. If you do it in sequential order, the marked indexes in the 'used' array will need to be changed. A running test:

    $ ./a.out
    this is a test and is a test and is test.
    this
    is
    a
    test
    and
    is
    a
    test
    and
    is
    test
    total 11 words.
    
    this : 1
    is : 3
    a : 2
    test : 3
    and : 2