I want to use tokens_compound to examine the frequency of phrases in the documents of a corpus. For illustrative purposes I used the corpus data_corpus_inaugural and selected some n-grams to search for. I also want to be able to save the output to a CSV file. The code below produces an output file, but none of the phrases are identified. Any advice on how to correctly count the frequency of phrases via a dictionary is appreciated.
library("quanteda")
## Package version: 2.1.2
# Load the data_corpus_inaugural example corpus.
data(data_corpus_inaugural)
# Tokenise the corpus, dropping punctuation and symbols, then lower-case
# every token.
toks <- data_corpus_inaugural %>%
tokens(remove_punct = TRUE,
remove_symbol = TRUE,
padding = TRUE) %>%
tokens_tolower()
# NOTE(review): this document-feature matrix is built from `toks`, i.e.
# before any phrase compounding has happened, and it is stored under the
# name `tokens`, which is easy to confuse with the tokens() function above.
tokens <- dfm(toks)
# Multi-word phrases to be joined into single tokens.
# NOTE(review): "fellow citiznes" is spelled differently from the
# "fellow_citizens" dictionary entry below; "unlimited sumission" is
# presumably meant to be "unlimited submission" - confirm.
multiword <- c("the house of representatives", "the senate", "foreign legislative","fellow citiznes", "men of reflection", "total independence", "unlimited sumission","no middle course",
"apprehension of danger", "formidable power")
# Compound the phrases in the tokens object.
# NOTE(review): comp_toks is never used again in this script.
comp_toks <- tokens_compound(toks, pattern = phrase(multiword))
# Dictionary whose values are the underscore-joined forms of the phrases.
dictx <- dictionary(list(govt = c("the_house_of_representatives", "the_senate", "foreign_legislative"),
people = c("fellow_citizens", "men_of_reflection"),
action =c("total_independence", "unlimited_sumission"),
course ="no_middle_course",
energy = c("apprehension_of_danger", "formidable_power")))
# NOTE(review): the lookup runs on `tokens` (the dfm made from the
# uncompounded `toks`), not on anything derived from comp_toks. This appears
# to be why no phrase is counted.
test <- dfm_lookup(tokens, dictionary = dictx)
# Convert the result to a data.frame and write it to a fixed CSV path.
test2 <- convert(test , to = "data.frame")
write.csv (test2, "D:/Test.csv")
You need to search for the multi-word expressions in the tokens object before forming a DFM, because a DFM no longer keeps the word order needed to match phrases.
library("quanteda")
#> Package version: 4.0.2
#> Unicode version: 15.1
#> ICU version: 74.1
#> Parallel computing: 16 of 16 threads used.
#> See https://quanteda.io for tutorials and examples.
# Load the data_corpus_inaugural example corpus.
data(data_corpus_inaugural)

# Tokenise the corpus without punctuation or symbols. The argument is spelled
# out in full as `remove_symbols` rather than relying on partial matching.
toks <- tokens(
  data_corpus_inaugural,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  padding = TRUE
)

# Dictionary of multi-word expressions, written with spaces between words.
# NOTE(review): "unlimited sumission" looks like a misspelling of
# "unlimited submission"; it is left unchanged so that the printed output
# of this example stays reproducible - confirm with the author.
dict <- dictionary(list(
  govt = c("the house of representatives", "the senate", "foreign legislative"),
  people = c("fellow citizens", "men of reflection"),
  action = c("total independence", "unlimited sumission"),
  course = "no middle course",
  energy = c("apprehension of danger", "formidable power")
))

# Look the phrases up in the tokens object first, and only then build the
# document-feature matrix, so each dictionary key becomes one feature.
dfmt <- dfm(tokens_lookup(toks, dictionary = dict))

# Convert to a plain data.frame (one row per document) for export/inspection.
dat <- convert(dfmt, to = "data.frame")
head(dat)
#> doc_id govt people action course energy
#> 1 1789-Washington 3 2 0 0 0
#> 2 1793-Washington 0 1 0 0 0
#> 3 1797-Adams 0 1 1 1 1
#> 4 1801-Jefferson 0 5 0 0 0
#> 5 1805-Jefferson 0 8 0 0 0
#> 6 1809-Madison 0 0 0 0 0
# Alternative: join each dictionary phrase into a single compound token,
# build the document-feature matrix, and keep only the features that match
# the dictionary, giving one column per phrase instead of one per key.
dfmt2 <- dfm_select(
  dfm(tokens_compound(toks, pattern = dict)),
  pattern = dict
)
dat2 <- convert(dfmt2, to = "data.frame")
head(dat2)
#> doc_id the_senate the_house_of_representatives fellow_citizens
#> 1 1789-Washington 1 2 2
#> 2 1793-Washington 0 0 1
#> 3 1797-Adams 0 0 0
#> 4 1801-Jefferson 0 0 5
#> 5 1805-Jefferson 0 0 8
#> 6 1809-Madison 0 0 0
#> no_middle_course total_independence men_of_reflection formidable_power
#> 1 0 0 0 0
#> 2 0 0 0 0
#> 3 1 1 1 1
#> 4 0 0 0 0
#> 5 0 0 0 0
#> 6 0 0 0 0
Created on 2024-07-09 with reprex v2.1.1