Tags: r, text-mining, qdap

Replace words in text with words generated using all_words


Being pretty new to qdap, I am not sure whether this functionality is present, but it would be great to have something as mentioned below.

My initial dataset.

ID         Keywords
1          112 mills, open heart surgery, great, great job
2          Ausie, open, heart out
3          opened, heartily, 56mg)_job, orders12
4          order, macD

On using all_words() I end up with the following data.

   WORD     FREQ
1  great       2
2  heart       2
3  open        2
4  ausie       1
5  heartily    1
6  job         1
7  macd        1
8  mgjob       1
9  mills       1
10 opened       1
11 order        1
12 orders       1
13 out          1
14 surgery      1

Is there a way in which the main dataset can be replaced by the exact words that are appearing through all_words()?

edit1: So the list that comes from using all_words() should replace the original words in the dataframe, i.e. "112 mills" should become "mills", and "56mg)_job" should become "mgjob".


Solution

  • It is a bit more manual and I do not know how your data are formatted, but with some tinkering it should do the job:

    Edit: and it is not using qdap, but I have assumed this is not a crucial part of the question.

    2nd edit: I forgot about the substitution, corrected code below.

    library(data.table)
    library(tm)  # Functions with tm:: below
    library(magrittr)
    
    # Toy reconstruction of the question's data: one space-separated
    # keyword string per ID.
    dt <- data.table(
      ID = 1L:4L,
      Keywords = c(
        paste('112 mills', 'open heart', 'surgery', 'great', 'great job'),
        paste('Ausie', 'open', 'heart out'),
        paste('opened', 'heartily', '56mg)_job', 'orders12'),
        paste('order', 'macD')))
    
    # One row per token, keeping its originating ID.
    # (An ID-free alternative:
    #  dt_2 <- data.table(Tokens = tm::scan_tokenizer(dt[, Keywords])))
    dt_2 <- dt[, .(Tokens = unlist(strsplit(Keywords, split = ' '))), by = ID]
    
    # Clean each token: strip punctuation, then strip digits.
    dt_2[, Words := tm::removeNumbers(tm::removePunctuation(tm::scan_tokenizer(Tokens)))]
    # Stems of the cleaned words, in case stemming is wanted as well.
    dt_2[, Stems := tm::stemDocument(Words)]
    
    # Inspect the token-level table.  Note the blank Words/Stems cells in
    # row 1: the purely numeric token '112' is erased entirely by
    # tm::removeNumbers(), leaving an empty string.
    dt_2
    #     ID    Tokens    Words    Stems
    #  1:  1       112                  
    #  2:  1     mills    mills     mill
    #  3:  1      open     open     open
    #  4:  1     heart    heart    heart
    #  5:  1   surgery  surgery  surgeri
    #  6:  1     great    great    great
    #  7:  1     great    great    great
    #  8:  1       job      job      job
    #  9:  2     Ausie    Ausie     Ausi
    # 10:  2      open     open     open
    # 11:  2     heart    heart    heart
    # 12:  2       out      out      out
    # 13:  3    opened   opened     open
    # 14:  3  heartily heartily heartili
    # 15:  3 56mg)_job    mgjob    mgjob
    # 16:  3  orders12   orders    order
    # 17:  4     order    order    order
    # 18:  4      macD     macD     macD
    
    # Frequencies
    # Word counts (the analogue of all_words()); the blank-word row counts
    # tokens that were reduced to an empty string by the cleaning above.
    dt_2[, .N, by = Words]
    #        Words N
    #  1:          1
    #  2:    mills 1
    #  3:     open 2
    #  4:    heart 2
    #  5:  surgery 1
    #  6:    great 2
    #  7:      job 1
    #  8:    Ausie 1
    #  9:      out 1
    # 10:   opened 1
    # 11: heartily 1
    # 12:    mgjob 1
    # 13:   orders 1
    # 14:    order 1
    # 15:     macD 1
    
    

    2nd edit here:

    # Rebuild one cleaned keyword string per ID from the Words column.
    # NOTE(review): the empty string left behind by '112' makes paste()
    # emit a leading space in row 1; the 3rd edit trims this with
    # stringr::str_trim().
    res <- dt_2[, .(Keywords = paste(Words, collapse = ' ')), by = ID]
    res
    #    ID                                  Keywords
    # 1:  1  mills open heart surgery great great job
    # 2:  2                      Ausie open heart out
    # 3:  3              opened heartily mgjob orders
    # 4:  4                                order macD
    
    

    3rd edit, in case your keywords come as lists and you would like to keep them that way.

    library(data.table)
    library(tm)  # Functions with tm:: below
    library(magrittr)
    
    # Same data as before, but Keywords is now a list column: each ID
    # holds a character vector of keyword phrases.
    dt <- data.table(
      ID = 1L:4L,
      Keywords = list(
        c('112 mills', 'open heart', 'surgery', 'great', 'great job'),
        c('Ausie', 'open', 'heart out'),
        c('opened', 'heartily', '56mg)_job', 'orders12'),
        c('order', 'macD')))
    
    # One row per keyword phrase; ID_temp (= row number via .I) labels
    # each phrase so it can be reassembled individually later.
    dt_2 <- dt[, .(Keywords = unlist(Keywords)), by = ID]
    dt_2[, ID_temp := .I]
    
    # One row per single-word token, tracked by both phrase (ID_temp) and ID.
    dt_3 <- dt_2[, .(ID, Tokens = unlist(strsplit(unlist(Keywords), split = ' '))), by = ID_temp]
    
    # Clean each token: strip punctuation and digits; this edit also
    # lower-cases, which matches the lower-cased words shown in the question.
    dt_3[, Words := tm::scan_tokenizer(Tokens) %>%
           tm::removePunctuation() %>%
           tm::removeNumbers() %>%
           stringr::str_to_lower()
         ]
    dt_3[, Stems := tm::stemDocument(Words)]
    dt_3
    
    # Reassemble each phrase from its cleaned words; str_trim() removes the
    # leading/trailing space left by tokens that were erased (e.g. '112').
    res <- dt_3[, .(
      ID = first(ID),
      Keywords = paste(Words, collapse = ' ') %>% stringr::str_trim()),
      by = ID_temp]
    # Collapse back to one row per ID, restoring the list-column layout.
    res <- res[, .(Keywords = list(Keywords)), by = ID]
    
    # Confirm format (a list of keywords in every element)
    # %T>% is magrittr's tee pipe: it runs the print() side effect and
    # passes the original value through unchanged.
    dt[1, Keywords] %T>% {print(class(.))} %T>% {print(length(.[[1]]))}
    res[1, Keywords] %T>% {print(class(.))} %T>% {print(length(.[[1]]))}