Below is a dummy corpus of 4 documents.
The dictionary was developed to identify the frequency of words or phrases in the corpus, as well as the number of documents in which each word or phrase occurs.
The word 'Australians' occurs in two dictionary keys (peep, indig). Key content is intended to be mutually exclusive.
Similarly, 'Australia' (oz and Australia Post), 'foreign' (foreign and multinat) and 'farm'/'farmers' (dairy and farmers) each occur in two dictionary keys, but are intended to be counted only once, according to the dictionary.
The expected overall frequency count (extracted from the 'pattern' column of the kwic table) is reported as x2 below. Note that the word 'industry' appears in doc2 but is not allocated to the industry key there, because 'dairy industry' is defined in the dairy key.
Dairy is the most frequently occurring key, occurring in three documents. This can be calculated from the unique rows in the kwic table's 'docname' column for each key.
I have three questions:
library (quanteda)
library(quanteda.textstats)
# Dummy corpus: a named character vector of four short documents (doc1-doc4).
# The line breaks inside the quotes are part of each document's text, so the
# literals must not be re-wrapped.
txt <- c(doc1 = "A significant percent of all farms in Australia, are dairy.
Although there are a lot of dairy farms in this country,
it is not the biggest farm industry. The life of a farmer is not easy, a dairy
farmer has to be an early riser. ",
doc2 = "Australian people like milk so a healthy dairy industry is important in
our country",
doc3 = "Dairy and sheep farms developed at the expense of Indigenous
Australians. Further many companies are now foreign-owned",
doc4 = "Some farmers are lucky to receive a service from Australia Post. Mail is
sent to many foreign countries and received more quickly than
delivered in some locations in Australia.")
# Tokenise the corpus first: `x` has to exist before it can be compounded.
x <- tokens(txt)

# Join each multi-word phrase into a single token. All phrases are passed in
# one call; none of them can overlap with another (no phrase's second word is
# another phrase's first word), so this gives the same result as compounding
# them one at a time. The concatenator is a space so that the compounded
# tokens are spelled exactly like the dictionary values.
x <- tokens_compound(
  x,
  pattern = phrase(c(
    "dairy farmers",
    "dairy farms",
    "dairy farm",
    "dairy farming",
    "dairy industry",
    "indigenous australians",
    "australia post",
    "dairy farmer"
  )),
  concatenator = " "
)
x
# Dictionary mapping each key to the words and phrases counted under it.
# Every pattern is written on a single source line: a line break inside the
# quotes becomes part of the pattern itself (e.g. "foreign-<newline>owned"),
# which would no longer match the text "foreign-owned".
dict <- dictionary(list(
  multinat = c(
    "offshore petroleum companies", "foreign-owned", "foreign owned",
    "foreign companies", "multinational", "multinational oil companies",
    "multinationals", "transnational"
  ),
  dairy = c(
    "dairy farmers", "dairy farms", "dairy farm", "dairy farming",
    "dairy industry", "dairy farmer", "dairy", "milk"
  ),
  auspost = "australia post",
  oz = c("australia", "this country", "our country"),
  farmers = c("farmers", "farmer", "farm", "farms"),
  foreign = c("foreign", "foreigner", "foreigners"),
  business = c(
    "small business", "business", "businesses", "company", "companies"
  ),
  indig = c(
    "aboriginal", "aboriginals", "indigenous australians", "torres strait"
  ),
  peep = c(
    "australians", "people of australia", "australian people",
    "people of this nation", "people of this country"
  ),
  industry = c("industry", "industries")
))
# Keyword-in-context table of every dictionary match, with a window of 4.
kwicdict <- kwic(x, pattern = dict, window = 4)

# Round-trip the table through a CSV file. One path variable keeps the write
# and the read pointing at the same file (the two literals were previously
# spelled differently: "D:/" vs "D://").
# write.csv() is left with its default row names so the column layout of DF
# is unchanged.
kwic_csv <- "D:/Output/TEST.csv"
write.csv(kwicdict, kwic_csv)
DF <- read.csv(kwic_csv, header = TRUE)
## Frequency count of the values in the kwic table's 'pattern' column.
## The column is selected by name rather than by position, so the count does
## not depend on how many columns precede it in the CSV.
x2 <- DF[["pattern"]]

table(x2)
#> x2
#>  auspost business    dairy  farmers  foreign    indig industry multinat       oz     peep
#>        1        1        6        5        1        1        1        1        5        2
I don't think that `kwic()` is what you want here. `tokens_lookup()` lets you specify that the nested scope should be mutually exclusive across keys, not just within keys. Observe the difference below. (And note the use of wildcarding for the dairy key.)
library(quanteda)
#> Package version: 4.1.0
#> Unicode version: 14.0
#> ICU version: 71.1
#> Parallel computing: 10 of 10 threads used.
#> See https://quanteda.io for tutorials and examples.
library(quanteda.textstats)
# Example corpus: four documents held in a named character vector
# (names doc1 to doc4). Each string spans several source lines, and those
# line breaks are part of the string values.
txt <- c(doc1 = "A significant percent of all farms in Australia, are dairy.
Although there are a lot of dairy farms in this country,
it is not the biggest farm industry. The life of a farmer is not easy, a dairy
farmer has to be an early riser. ",
doc2 = "Australian people like milk so a healthy dairy industry is important in
our country",
doc3 = "Dairy and sheep farms developed at the expense of Indigenous
Australians. Further many companies are now foreign-owned",
doc4 = "Some farmers are lucky to receive a service from Australia Post. Mail is
sent to many foreign countries and received more quickly than
delivered in some locations in Australia.")
# Dictionary of ten keys; each key lists the words/phrases counted under it.
# The dairy key uses a wildcard ("dairy farm*") in place of listing every
# "dairy farm..." variant separately.
dict <- dictionary(
  list(
    multinat = c(
      "offshore petroleum companies",
      "foreign-owned",
      "foreign owned",
      "foreign companies",
      "multinational",
      "multinational oil companies",
      "multinationals",
      "transnational"
    ),
    dairy = c("dairy farm*", "dairy industry", "dairy", "milk"),
    auspost = "australia post",
    oz = c("australia", "this country", "our country"),
    farmers = c("farmers", "farmer", "farm", "farms"),
    foreign = c("foreign", "foreigner", "foreigners"),
    business = c(
      "small business",
      "business",
      "businesses",
      "company",
      "companies"
    ),
    indig = c(
      "aboriginal",
      "aboriginals",
      "indigenous australians",
      "torres strait"
    ),
    peep = c(
      "australians",
      "people of australia",
      "australian people",
      "people of this nation",
      "people of this country"
    ),
    industry = c("industry", "industries")
  )
)
# Tokenise the corpus once; `x` is reused by the lookups that follow.
x <- tokens(txt)

# Lookup with overlap: the same text can be counted under more than one key
# (e.g. farmers = 5 in doc1, which includes the "dairy farm..." matches).
dfm(tokens_lookup(x, dict))
#> Document-feature matrix of: 4 documents, 10 features (55.00% sparse) and 0 docvars.
#> features
#> docs multinat dairy auspost oz farmers foreign business indig peep industry
#> doc1 0 3 0 2 5 0 0 0 0 1
#> doc2 0 2 0 1 0 0 0 0 1 1
#> doc3 1 1 0 0 1 0 1 1 1 0
#> doc4 0 0 1 2 1 1 0 0 0 0
# Lookup without overlap: nested_scope = "dictionary" makes nested matches
# mutually exclusive across keys, so text already matched by a longer phrase
# is not counted again under another key (farmers drops from 5 to 3 in doc1).
dfm(tokens_lookup(x, dict, nested_scope = "dictionary"))
#> Document-feature matrix of: 4 documents, 10 features (60.00% sparse) and 0 docvars.
#> features
#> docs multinat dairy auspost oz farmers foreign business indig peep industry
#> doc1 0 3 0 2 3 0 0 0 0 1
#> doc2 0 2 0 1 0 0 0 0 1 0
#> doc3 1 1 0 0 1 0 1 1 0 0
#> doc4 0 0 1 1 1 1 0 0 0 0
Created on 2024-10-06 with reprex v2.1.1