I am streaming Twitter data through Kafka. I managed to stream the data and consume the Twitter JSON. But now, how do I create a PySpark dataframe containing both the Twitter data and the search keyword?
I managed to create a dataframe with the fields I want from the tweet object, but I don't know how to get the search keyword into it. For context, the sketch below shows roughly how I build the dataframe on the consumer side; after that is how I write the Kafka producer.
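Roughly, my consumer side looks like this (a simplified sketch: I use Spark Structured Streaming's Kafka source, and the tweet fields selected are just examples):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructType

spark = SparkSession.builder.appName("Kafka Consumer Application").getOrCreate()

# Fields I extract from the tweet JSON (illustrative; I select more in practice)
schema = StructType() \
    .add("created_at", StringType()) \
    .add("text", StringType())

# Requires the spark-sql-kafka package on the classpath
raw = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "xx.xxx.xxx.xxx:9092") \
    .option("subscribe", "twitterstreamingdata") \
    .load()

# Kafka delivers the payload as bytes; cast to string and parse the JSON
tweets = raw.select(from_json(col("value").cast("string"), schema).alias("tweet")) \
    .select("tweet.*")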
import sys
import time

from kafka import KafkaProducer
from pyspark.sql import SparkSession
from tweepy import StreamListener, Stream, OAuthHandler


class StdOutListener(StreamListener):
    def __init__(self, producer):
        self.producer_obj = producer

    # on_data is called whenever raw data arrives from the stream
    def on_data(self, data):
        try:
            self.producer_obj.send("twitterstreamingdata", data.encode('utf-8'))
            print(data)
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
            return True

    # When an error occurs
    def on_error(self, status):
        print(status)
        return True

    # When the rate limit is reached
    def on_limit(self, track):
        # Print a rate limiting notice and continue mining tweets
        print("Rate limited, continuing")
        return True

    # When timed out
    def on_timeout(self):
        # Print a timeout message, wait 120 seconds, then continue listening
        print('Timeout...', file=sys.stderr)
        time.sleep(120)
        return True

    # Called when Twitter sends a disconnect notice
    def on_disconnect(self, notice):
        return


if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName("Kafka Producer Application") \
        .getOrCreate()

    # This is the initialization of the Kafka producer
    producer = KafkaProducer(bootstrap_servers='xx.xxx.xxx.xxx:9092')

    # This handles Twitter auth and the connection to the Twitter streaming API
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, StdOutListener(producer))

    print("Kafka Producer Application: ")
    WORDS = input("Enter any words: ")
    print("Is this what you just said?", WORDS)
    word = [u for u in WORDS.split(',')]

    # This filters the Twitter stream to capture data matching the keywords
    stream.filter(track=word)
One way to solve your problem is to change the StdOutListener constructor to receive a "keyword" parameter and add that keyword to the JSON in on_data before sending it to Kafka:
import json
import sys
import time

from kafka import KafkaProducer
from pyspark.sql import SparkSession
from tweepy import StreamListener, Stream, OAuthHandler


class StdOutListener(StreamListener):
    def __init__(self, producer: KafkaProducer = None, keyword=None):
        super().__init__()
        self.producer = producer
        self.keyword = keyword

    # on_data is called whenever raw data arrives from the stream
    def on_data(self, data):
        try:
            # Parse the tweet JSON, attach the search keyword(s), re-serialize
            data = json.loads(data)
            data['keyword'] = self.keyword
            data = json.dumps(data)
            self.producer.send("twitterstreamingdata", data.encode('utf-8'))
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
            return True

    # When an error occurs
    def on_error(self, status):
        print(status)
        return True

    # When the rate limit is reached
    def on_limit(self, track):
        # Print a rate limiting notice and continue mining tweets
        print("Rate limited, continuing")
        return True

    # When timed out
    def on_timeout(self):
        # Print a timeout message, wait 120 seconds, then continue listening
        print('Timeout...', file=sys.stderr)
        time.sleep(120)
        return True

    # Called when Twitter sends a disconnect notice
    def on_disconnect(self, notice):
        return


if __name__ == '__main__':
    CONSUMER_KEY = 'YOUR CONSUMER KEY'
    CONSUMER_SECRET = 'YOUR CONSUMER SECRET'
    ACCESS_TOKEN = 'YOUR ACCESS TOKEN'
    ACCESS_SECRET = 'YOUR ACCESS SECRET'

    print("Kafka Producer Application: ")
    words = input("Enter any words: ")
    print("Is this what you just said?", words)
    word = [u for u in words.split(',')]

    spark = SparkSession \
        .builder \
        .appName("Kafka Producer Application") \
        .getOrCreate()

    # This is the initialization of the Kafka producer
    kafka_producer = KafkaProducer(bootstrap_servers='xx.xxx.xxx.xxx:9092')

    # This handles Twitter auth and the connection to the Twitter streaming API
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    # Pass the list of tracked words so it ends up in every message
    stream = Stream(auth, StdOutListener(producer=kafka_producer, keyword=word))
    stream.filter(track=word)
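On the consumer side, the keyword then comes back out of the enriched JSON together with the tweet fields. A minimal sketch, assuming Spark Structured Streaming's Kafka source (the tweet fields selected are just examples, and "keyword" is an array type here because the listener above receives the whole list of tracked words):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import ArrayType, StringType, StructType

spark = SparkSession.builder.appName("Kafka Consumer Application").getOrCreate()

# "keyword" is the field the producer added to each tweet's JSON
schema = StructType() \
    .add("created_at", StringType()) \
    .add("text", StringType()) \
    .add("keyword", ArrayType(StringType()))

# Requires the spark-sql-kafka package on the classpath
raw = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "xx.xxx.xxx.xxx:9092") \
    .option("subscribe", "twitterstreamingdata") \
    .load()

# Kafka delivers the payload as bytes; cast to string and parse the JSON
tweets = raw.select(from_json(col("value").cast("string"), schema).alias("tweet")) \
    .select("tweet.created_at", "tweet.text", "tweet.keyword")

query = tweets.writeStream.outputMode("append").format("console").start()
query.awaitTermination()

If you would rather have a single keyword string per row, start one stream per keyword and pass that keyword alone to the listener instead of the whole list.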
Hope it helps you!