sqlgoogle-bigquerytext-analysis

How can I compute TF/IDF with SQL (BigQuery)


I'm doing text analysis over reddit comments, and I want to calculate the TF-IDF within BigQuery.


Solution

  • This one might be easier to understand - takes a dataset that already has the # of words per TV station and day:

    # in this query the combination of date+station represents a "document"
    
    WITH data AS (
      SELECT *
      FROM `gdelt-bq.gdeltv2.iatv_1grams`
      WHERE DATE BETWEEN 20190601 AND 20190629
      AND station NOT IN ('KSTS', 'KDTV')
    )
    , word_day_station AS (
      # how many times a word is mentioned in each "document"
      SELECT word, SUM(count) counts, date, station
      FROM data
      GROUP BY 1, 3, 4
    )
    , day_station AS (
      # total # of words in each "document" 
      SELECT SUM(count) counts, date, station
      FROM data
      GROUP BY 2,3
    )
    , tf AS (
      # TF for a word in a "document"
      SELECT word, date, station, a.counts/b.counts tf
      FROM word_day_station a
      JOIN day_station b
      USING(date, station)
    )
    , word_in_docs AS (
      # how many "documents" have a word
      SELECT word, COUNT(DISTINCT FORMAT('%i %s', date, station)) indocs
      FROM word_day_station
      GROUP BY 1
    )
    , total_docs AS (
      # total # of docs
      SELECT COUNT(DISTINCT FORMAT('%i %s', date, station)) total_docs
      FROM data
    )
    , idf AS (
      # IDF for a word
      SELECT word, LOG(total_docs.total_docs/indocs) idf
      FROM word_in_docs
      CROSS JOIN total_docs
    )
    
    SELECT date,
      ARRAY_AGG(STRUCT(station, ARRAY_TO_STRING(words, ', ')) ORDER BY station) top_words
    FROM (
      SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 5) words
      FROM (
        SELECT word, date, station, tf.tf * idf.idf tfidf
        FROM tf
        JOIN idf
        USING(word)
      )
      GROUP BY date, station
    )
    GROUP BY date
    ORDER BY date DESC
    

    enter image description here