Tags: text, tidyverse, text-mining, tm, unnest

How can I extract bigrams from text without removing the hash symbol?


I am using the following function (based on https://rpubs.com/sprishi/twitterIBM) to extract bigrams from text. However, I want to keep the hash symbol for analysis purposes. The text-cleaning function works fine, but unnest_tokens() strips special characters such as "#" and "@". Is there any way to run unnest_tokens() without removing special characters?
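
For illustration, the default n-gram tokenizer strips the symbols before the bigrams are formed; here is a minimal sketch of the behaviour (expected result shown as a comment):

library(tidytext)
library(tibble)

unnest_tokens(tibble(text = "majesty and #queen @victoria"),
              bigram, text, token = "ngrams", n = 2)
# expected bigrams: "majesty and", "and queen", "queen victoria"
# the "#" and "@" are gone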

x <- (c("I went to afternoon tea with her majesty and #queen @Victoria in the palace.", "Does tea have extra caffeine?"))

clean_Twitter_Corpus <- function(x) {
  x = tolower(x)                           # convert to lower case
  x = stripWhitespace(x)                   # collapse repeated white space
  x = gsub("^\\s+|\\s+$", "", x)           # remove leading and trailing white space
  x = removeWords(x, stopwords("english")) # remove stopwords
  return(x)
}

# clean the twitter texts. call the clean_Twitter_Corpus function
tweets <- clean_Twitter_Corpus(x)
tweets
text <- as.character(tweets)
text <- as.data.frame(text)

tidy_descr_ngrams <- text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
tidy_descr_ngrams

bigram_counts <- tidy_descr_ngrams %>%
  count(word1, word2, sort = TRUE)

bigram_counts   

Solution

  • Here is a solution that involves creating a custom n-grams function

    Setup

    library(tidyverse)
    library(tidytext)
    library(tm)
    library(purrr)
    
    x <- (c("I went to afternoon tea with her majesty and #queen @Victoria in the palace.", "Does tea have extra caffeine?"))
    
    clean_Twitter_Corpus <- function(x) {
      x = tolower(x)                           # convert to lower case
      x = stripWhitespace(x)                   # collapse repeated white space
      x = gsub("^\\s+|\\s+$", "", x)           # remove leading and trailing white space
      x = removeWords(x, stopwords("english")) # remove stopwords
      return(x)
    }
    

    The custom function that creates n-grams without removing special characters (a quick check follows the definition)

    # A custom-built function that takes a sentence and returns a
    # tibble of n-grams; it splits on spaces only, so characters
    # such as "#" and "@" survive
    ngrams_build <- function(sentence, column_name, n = 2) {
      words <- sentence %>% str_split(pattern = " ", simplify = TRUE)
      words <- words[words != ""]   # drop empties left by repeated spaces
      # guard: a sentence shorter than n words has no n-grams
      if (length(words) < n) return(tibble(!!column_name := character(0)))
      ngrams <- map_chr(1:(length(words) - n + 1),
                        .f = function(x, words, n) {
                          paste(words[x:(x + n - 1)], collapse = " ")
                        }, words = words, n = n)
      tibble(!!column_name := ngrams)
    }
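
    As a quick check, calling the helper on one sentence should return every adjacent word pair with the "#" and "@" intact (expected output shown as a comment):

    ngrams_build("#queen @victoria in the palace", "bigram", n = 2)
    # expected: "#queen @victoria", "@victoria in", "in the", "the palace"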
    

    Your code again

    # clean the twitter texts. call the clean_Twitter_Corpus function
    tweets <- clean_Twitter_Corpus(x)
    tweets
    #> [1] " went  afternoon tea   majesty  #queen @victoria   palace."
    #> [2] " tea  extra caffeine?"
    text <- as.character(tweets)
    text <- as.data.frame(text)
    
    tidy_descr_ngrams <- 
      # here I use purrr::map_dfr() with the custom function
      map_dfr(text$text, ngrams_build, column_name = "bigram", n = 2) %>%
      separate(bigram, c("word1", "word2"), sep = " ")
    
    # The output is similar to unnest_tokens() but keeps the special
    # characters
    tidy_descr_ngrams
    #> # A tibble: 8 x 2
    #>   word1     word2    
    #>   <chr>     <chr>    
    #> 1 went      afternoon
    #> 2 afternoon tea      
    #> 3 tea       majesty  
    #> 4 majesty   #queen   
    #> 5 #queen    @victoria
    #> 6 @victoria palace.  
    #> 7 tea       extra    
    #> 8 extra     caffeine?
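
    For comparison, running the same cleaned text through unnest_tokens() would have produced "majesty queen" and "queen victoria" here, with the "#" and "@" stripped.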
    
    

    Final results

    bigram_counts <- tidy_descr_ngrams %>%
      count(word1, word2, sort = TRUE)
    
    bigram_counts
    #> # A tibble: 8 x 3
    #>   word1     word2         n
    #>   <chr>     <chr>     <int>
    #> 1 #queen    @victoria     1
    #> 2 @victoria palace.       1
    #> 3 afternoon tea           1
    #> 4 extra     caffeine?     1
    #> 5 majesty   #queen        1
    #> 6 tea       extra         1
    #> 7 tea       majesty       1
    #> 8 went      afternoon     1
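
    The helper is not limited to bigrams: passing n = 3 through the same map_dfr() pipeline should yield trigrams with the symbols preserved. A sketch (tidy_descr_trigrams is just an illustrative name):

    tidy_descr_trigrams <- 
      map_dfr(text$text, ngrams_build, column_name = "trigram", n = 3) %>%
      separate(trigram, c("word1", "word2", "word3"), sep = " ")
    tidy_descr_trigrams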
    

    Created on 2022-01-09 by the reprex package (v2.0.1)