I have computed the bag of words for 'yelp.csv', 'yelpp.csv', and 'yelpn.csv' and built a word-frequency matrix for each dataset. Now I want to compare the bag of words of yelp with yelpn: check how many words from yelp appear in yelpn, get their frequencies, and store the result in a variable as a matrix; then do the same for yelpp. yelp contains both the positive and negative reviews, yelpp only the positive, and yelpn only the negative. Can anyone complete the code? I don't know whether the code below is relevant, but I hope so; a sketch of the kind of comparison I'm after follows the code.
library(tm)
library(qdap)
setwd("/Users/ash/RProjects/exc")
df <- read.csv("yelp.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
               strip.white = TRUE)
df
dfd<-as.character(df[,2])
dfd
df2<-as.character(df[,1])
df2
words <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
s<-remove_stopwords(dfd, words, lines = TRUE)
s
print("****Stopwords are removed successfully****")
n<-removeNumbers(s)
n
t<-removePunctuation(n, preserve_intra_word_dashes = FALSE)
t
#pos
dfp <- read.csv("yelpp.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfp
dfdp<-as.character(dfp[,2])
dfdp
df2p<-as.character(dfp[,1])
df2p
# reuse the same stopword list as above
sp<-remove_stopwords(dfdp, words, lines = TRUE)
sp
print("****Stopwords are removed successfully****")
np<-removeNumbers(sp)
np
tp<-removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp
#neg
dfn <- read.csv("yelpn.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfn
dfdn<-as.character(dfn[,2])
dfdn
df2n<-as.character(dfn[,1])
df2n
# reuse the same stopword list as above
sn<-remove_stopwords(dfdn, words, lines = TRUE)
sn
print("****Stopwords are removed successfully****")
nn<-removeNumbers(sn)
nn
tn<-removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn
#bag
b<-bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat = as.matrix(b)
b.mat
bp<-bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat = as.matrix(bp)
bp.mat
bn<-bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat = as.matrix(bn)
bn.mat
#frequent terms
frequent_terms <- freq_terms(t, 2000)   # freq_terms expects text, not a matrix
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn
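To illustrate, this is roughly the comparison I'm after. A minimal sketch using only base R; b, bp, and bn are the bag-of-words vectors built above, and the freq.n / shared.n / cmp.n names are hypothetical:
freq.n <- table(bn)                              # frequency of each word in yelpn
shared.n <- intersect(unique(b), names(freq.n))  # yelp words that also appear in yelpn
cmp.n <- as.matrix(freq.n[shared.n])             # their yelpn frequencies, as a matrix
length(shared.n)                                 # how many yelp words appear in yelpn
# same idea with bp for the yelpp comparison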
I'm taking the example texts for the corpora from the Wikipedia article on text mining. The main points of this approach are the tm package with findFreqTerms, plus base R's agrep function.
agrep searches for approximate matches to pattern (the first argument) within each element of the string x (the second argument), using the generalized Levenshtein edit distance (the minimal, possibly weighted, number of insertions, deletions, and substitutions needed to transform one string into another).
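For instance, with an explicit edit-distance bound:
# indices of the elements within one edit of the pattern
agrep("pattern", c("patterns", "patern", "unrelated"), max.distance = 1)
#> [1] 1 2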
Approach steps:
texts -> corpora -> data cleaning -> findFreqTerms -> compare with the other term-document matrix
library(tm)
c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))
c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))
c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))
# Data cleaning and transformation, applied identically to each corpus
clean_corpus <- function(x) {
  x <- tm_map(x, content_transformer(tolower))
  x <- tm_map(x, removePunctuation)
  x <- tm_map(x, removeNumbers)
  x <- tm_map(x, removeWords, stopwords("english"))
  tm_map(x, stripWhitespace)
}
c1 <- clean_corpus(c1)
c2 <- clean_corpus(c2)
c3 <- clean_corpus(c3)
dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))
# with no lowfreq bound, findFreqTerms returns every term in each matrix
ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)
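If exact rather than fuzzy matching is enough, the two term sets can simply be intersected:
intersect(ft1, ft2)   # terms present in both corpora, exact match only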
# similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for (t in ft1) {
  find <- agrep(t, ft2)                 # indices of approximate matches in ft2
  if (length(find) != 0) {
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note: for large texts, this loop can be replaced by an apply-family call
# (see the sketch after the output below)
common.c1c2
common.c1c2 contains the words from corpus 1 that have an approximate match in corpus 2, together with the number of matches:
> common.c1c2
term freq
1 also 1
2 data 2
3 derived 1
4 deriving 1
5 mining 1
6 pattern 1
7 patterns 1
8 process 1
9 text 1
> ft1
[1] "also" "analytics" "data" "derived" "deriving" "devising" "equivalent"
[8] "highquality" "information" "learning" "means" "mining" "pattern" "patterns"
[15] "process" "referred" "roughly" "statistical" "text" "trends" "typically"
> ft2
[1] "addition" "along" "data" "database" "derived" "deriving"
[7] "evaluation" "features" "finally" "input" "insertion" "interpretation"
[13] "involves" "linguistic" "mining" "others" "output" "parsing"
[19] "patterns" "process" "removal" "structured" "structuring" "subsequent"
[25] "text" "usually" "within"
This solution is not the most efficient one, but I hope it helps.