dictionary search quanteda phrase

Reading output of tokens_compound into a dictionary


I want to use tokens_compound to examine the frequency of phrases in the documents of a corpus. I used the corpus data_corpus_inaugural for illustrative purposes and selected some n-grams to search for. I want to be able to save the output to a CSV file. The code below produces an output file, but no phrases are identified. Advice on how to correctly count the frequency of phrases via a dictionary would be appreciated.

library("quanteda")
## Package version: 2.1.2

# Load the example corpus used for illustration.
data(data_corpus_inaugural)

# Tokenise the corpus (punctuation and symbols removed, padding kept),
# then lower-case every token.
toks <- data_corpus_inaugural %>% 
  tokens(remove_punct = TRUE,
         remove_symbol = TRUE, 
         padding = TRUE) %>% 
  tokens_tolower()

# Build a document-feature matrix from the plain tokens.
# NOTE(review): this is built from `toks`, not from the compounded `comp_toks`
# created further down. The variable also reuses the name of the tokens()
# function called above, which is easy to misread.
tokens <- dfm(toks)

# Multi-word phrases to be joined into single tokens.
# NOTE(review): "fellow citiznes" is spelled differently from the
# "fellow_citizens" entry in `dictx` below.
multiword <- c("the house of representatives", "the senate", "foreign legislative","fellow citiznes", "men of reflection", "total independence", "unlimited sumission","no middle course",
                 "apprehension of danger", "formidable power")
# Compound each phrase occurrence into one token (presumably joined with "_",
# which is the form the dictionary values below expect -- confirm).
comp_toks <- tokens_compound(toks, pattern = phrase(multiword))


# Dictionary mapping five category keys to the underscore-joined phrases.
dictx <- dictionary(list(govt = c("the_house_of_representatives", "the_senate", "foreign_legislative"),
                         people = c("fellow_citizens", "men_of_reflection"),
                         action =c("total_independence", "unlimited_sumission"),
                         course ="no_middle_course",
                         energy = c("apprehension_of_danger", "formidable_power")))



# Look the dictionary up in the DFM, convert the result to a data frame and
# write it to a CSV file.
# NOTE(review): the lookup runs on `tokens` (built from `toks`), so
# `comp_toks` is never used.
test  <-  dfm_lookup(tokens, dictionary = dictx)
test2 <- convert(test , to = "data.frame")
write.csv (test2, "D:/Test.csv")

Solution

  • You need to search for the multi-word expressions in the tokens, before forming a DFM.

    library("quanteda")
    #> Package version: 4.0.2
    #> Unicode version: 15.1
    #> ICU version: 74.1
    #> Parallel computing: 16 of 16 threads used.
    #> See https://quanteda.io for tutorials and examples.
    
    
    # Load the example corpus.
    data(data_corpus_inaugural)
    
    # Tokenise the corpus, removing punctuation and symbols and keeping padding.
    toks <- tokens(
      data_corpus_inaugural,
      remove_punct = TRUE,
      remove_symbol = TRUE,
      padding = TRUE
    )
    
    # Dictionary whose values are plain multi-word phrases (space-separated),
    # grouped under five category keys.
    dict <- dictionary(list(
      govt = c(
        "the house of representatives",
        "the senate",
        "foreign legislative"
      ),
      people = c("fellow citizens", "men of reflection"),
      action = c("total independence", "unlimited sumission"),
      course = "no middle course",
      energy = c("apprehension of danger", "formidable power")
    ))
    

    Count dictionary keys

    
    # Apply the dictionary to the tokens first, then tabulate the resulting
    # keys per document and convert the counts to a data frame.
    toks_keys <- tokens_lookup(toks, dictionary = dict)
    dfmt <- dfm(toks_keys)
    dat <- convert(dfmt, to = "data.frame")
    head(dat)
    #>            doc_id govt people action course energy
    #> 1 1789-Washington    3      2      0      0      0
    #> 2 1793-Washington    0      1      0      0      0
    #> 3      1797-Adams    0      1      1      1      1
    #> 4  1801-Jefferson    0      5      0      0      0
    #> 5  1805-Jefferson    0      8      0      0      0
    #> 6    1809-Madison    0      0      0      0      0
    

    Count phrases

    # Compound the dictionary phrases in the tokens, build the DFM, and keep
    # only the features selected by the dictionary.
    toks_comp <- tokens_compound(toks, pattern = dict)
    dfmt2 <- dfm_select(dfm(toks_comp), pattern = dict)
    
    # Convert the per-phrase counts to a data frame.
    dat2 <- convert(dfmt2, to = "data.frame")
    head(dat2)
    #>            doc_id the_senate the_house_of_representatives fellow_citizens
    #> 1 1789-Washington          1                            2               2
    #> 2 1793-Washington          0                            0               1
    #> 3      1797-Adams          0                            0               0
    #> 4  1801-Jefferson          0                            0               5
    #> 5  1805-Jefferson          0                            0               8
    #> 6    1809-Madison          0                            0               0
    #>   no_middle_course total_independence men_of_reflection formidable_power
    #> 1                0                  0                 0                0
    #> 2                0                  0                 0                0
    #> 3                1                  1                 1                1
    #> 4                0                  0                 0                0
    #> 5                0                  0                 0                0
    #> 6                0                  0                 0                0
    

    Created on 2024-07-09 with reprex v2.1.1