I am using the following function (based on https://rpubs.com/sprishi/twitterIBM) to extract bigrams from text. However, I want to keep the hash symbol for analysis purposes. The text-cleaning function works fine, but `unnest_tokens()` removes special characters. Is there any way to run `unnest_tokens()` without removing special characters?
# Two sample tweets used as input for the bigram extraction.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)
# Normalise tweet text: lower-case it, collapse runs of whitespace, trim both
# ends, then drop English stopwords.
#
# x: character vector of tweets.
# Returns a character vector of the same length.
clean_Twitter_Corpus <- function(x) {
  lowered <- tolower(x)
  squeezed <- stripWhitespace(lowered)
  trimmed <- gsub("^\\s+|\\s+$", "", squeezed)
  # Stopwords are removed last, so any gaps they leave behind are not
  # cleaned up by the whitespace steps above.
  removeWords(trimmed, stopwords("english"))
}
# Clean the raw tweets with clean_Twitter_Corpus() and show the result.
tweets <- clean_Twitter_Corpus(x)
tweets

# unnest_tokens() needs a data frame, so wrap the cleaned text in a single
# column named `text`.
text <- data.frame(text = as.character(tweets))

# Tokenise into bigrams, then split each bigram into its two words.
tidy_descr_ngrams <- text %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ")
tidy_descr_ngrams

# Count how often each word pair occurs, most frequent first.
bigram_counts <- tidy_descr_ngrams %>%
  count(word1, word2, sort = TRUE)
bigram_counts
Here is a solution that involves creating a custom n-grams function:
library(tidyverse)
library(tidytext)
library(tm)
library(purrr)
# Two sample tweets used as input for the bigram extraction.
x <- c(
  "I went to afternoon tea with her majesty and #queen @Victoria in the palace.",
  "Does tea have extra caffeine?"
)
# Normalise tweet text: lower-case it, collapse runs of whitespace, trim both
# ends, then drop English stopwords.
#
# x: character vector of tweets.
# Returns a character vector of the same length.
clean_Twitter_Corpus <- function(x) {
  lowered <- tolower(x)
  squeezed <- stripWhitespace(lowered)
  trimmed <- gsub("^\\s+|\\s+$", "", squeezed)
  # Stopwords are removed last, so any gaps they leave behind are not
  # cleaned up by the whitespace steps above.
  removeWords(trimmed, stopwords("english"))
}
# Build a one-column tibble of word n-grams from a sentence.
#
# The sentence is split on single spaces only, so punctuation and symbols
# such as "#" and "@" stay attached to the words they belong to.
#
# Arguments:
#   sentence    Character string to tokenise (intended to be one sentence
#               per call).
#   column_name Name of the output column, given as a string.
#   n           Number of words per n-gram; must be a single number >= 1.
#
# Returns a tibble with one row per n-gram, in order of appearance. The
# tibble has zero rows when the sentence contains fewer than `n` words.
ngrams_build <- function(sentence, column_name, n = 2) {
  stopifnot(is.numeric(n), length(n) == 1L, n >= 1)

  words <- str_split(sentence, pattern = " ", simplify = TRUE)
  # Consecutive spaces produce empty strings and a missing sentence produces
  # NA; neither counts as a word.
  words <- words[!is.na(words) & words != ""]

  # Number of n-gram start positions. Clamping at zero and using seq_len()
  # gives an empty sequence for short sentences; 1:(length(words) - n + 1)
  # would count down instead (1, 0, ...) and emit bogus n-grams with NA.
  n_starts <- max(length(words) - n + 1, 0)
  ngrams <- map_chr(seq_len(n_starts), function(start) {
    paste(words[start:(start + n - 1)], collapse = " ")
  })

  tibble(!!column_name := ngrams)
}
# Clean the raw tweets with clean_Twitter_Corpus() and show the result.
tweets <- clean_Twitter_Corpus(x)
tweets
#> [1] " went afternoon tea majesty #queen @victoria palace."
#> [2] " tea extra caffeine?"

# Keep the cleaned text in a one-column data frame named `text`.
text <- data.frame(text = as.character(tweets))

# Apply ngrams_build() to each tweet, row-bind the per-tweet tibbles, then
# split every bigram into its two words.
tidy_descr_ngrams <- text$text %>%
  map_dfr(ngrams_build, column_name = "bigram", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ")

# The result has the same layout as the unnest_tokens() version, but the
# special characters are still present.
tidy_descr_ngrams
#> # A tibble: 8 x 2
#> word1 word2
#> <chr> <chr>
#> 1 went afternoon
#> 2 afternoon tea
#> 3 tea majesty
#> 4 majesty #queen
#> 5 #queen @victoria
#> 6 @victoria palace.
#> 7 tea extra
#> 8 extra caffeine?

# Count how often each word pair occurs, most frequent first.
bigram_counts <- tidy_descr_ngrams %>%
  count(word1, word2, sort = TRUE)
bigram_counts
#> # A tibble: 8 x 3
#> word1 word2 n
#> <chr> <chr> <int>
#> 1 #queen @victoria 1
#> 2 @victoria palace. 1
#> 3 afternoon tea 1
#> 4 extra caffeine? 1
#> 5 majesty #queen 1
#> 6 tea extra 1
#> 7 tea majesty 1
#> 8 went afternoon 1
Created on 2022-01-09 by the reprex package (v2.0.1)