I've got a sample JSON file with about 500 tweets which I'd like to get into a data frame.
The first three tweets from the JSON file are as follows (urls have been changed deliberately to fit within stackoverflow rules on links):
{"id":"tag:search.twitter.com,2005:413500801899044864","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:860787127","link":"httpee://www.twitter.com/JoeGoodman11","displayName":"Joe Goodman","postedTime":"2012-10-04T03:18:54.000Z","image":"httpes://pbs.twimg.com/profile_images/3781305408/372be07ac2b312d35e1426b264891c4f_normal.jpeg","summary":null,"links":[{"href":null,"rel":"me"}],"friendsCount":21,"followersCount":18,"listedCount":0,"statusesCount":177,"twitterTimeZone":null,"verified":false,"utcOffset":null,"preferredUsername":"JoeGoodman11","languages":["en"],"favoritesCount":286},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for Android","link":"httpee://twitter.com/download/android"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/JoeGoodman11/statuses/413500801899044864","body":"Hard at work studying for finals httpee://t.co/0EumsvUCuI","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500801899044864","summary":"Hard at work studying for finals httpee://t.co/0EumsvUCuI","link":"httpee://twitter.com/JoeGoodman11/statuses/413500801899044864","postedTime":"2013-12-19T02:47:28.000Z"},"favoritesCount":0,"location":{"objectType":"place","displayName":"Lowell, MA","name":"Lowell","country_code":"United 
States","twitter_country_code":"US","link":"httpes://api.twitter.com/1.1/geo/id/d6539f049c4d05e8.json","geo":{"type":"Polygon","coordinates":[[[-71.382491,42.607189],[-71.382491,42.66676],[-71.271231,42.66676],[-71.271231,42.607189]]]}},"geo":{"type":"Point","coordinates":[42.6428357,-71.33654]},"twitter_entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[],"media":[{"id":413500801395736576,"id_str":"413500801395736576","indices":[33,55],"media_url":"httpee://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg","media_url_https":"httpes://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg","url":"httpee://t.co/0EumsvUCuI","display_url":"pic.twitter.com/0EumsvUCuI","expanded_url":"httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1","type":"photo","sizes":{"medium":{"w":600,"h":339,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":340,"h":192,"resize":"fit"},"large":{"w":1023,"h":579,"resize":"fit"}}}]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"urls":[{"url":"httpee://t.co/0EumsvUCuI","expanded_url":"httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1","expanded_status":200}],"language":{"value":"en"}}}
{"id":"tag:search.twitter.com,2005:413500803593547776","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:168228121","link":"httpee://www.twitter.com/rvzigvdhiv","displayName":"Razi الرازي Gadhia","postedTime":"2010-07-18T19:28:45.000Z","image":"httpes://pbs.twimg.com/profile_images/412269827399495680/44JZWZPz_normal.jpeg","summary":"Why so serious? \n#2005spellingbeechamp \n#wood","links":[{"href":null,"rel":"me"}],"friendsCount":196,"followersCount":300,"listedCount":0,"statusesCount":4236,"twitterTimeZone":"Eastern Time (US & Canada)","verified":false,"utcOffset":"-18000","preferredUsername":"rvzigvdhiv","languages":["en"],"location":{"objectType":"place","displayName":"ATL"},"favoritesCount":4316},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for iPhone","link":"http://twitter.com/download/iphone"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/rvzigvdhiv/statuses/413500803593547776","body":"@thellymon haha aight homie I'll let you know","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500803593547776","summary":"@thellymon haha aight homie I'll let you know","link":"httpee://twitter.com/rvzigvdhiv/statuses/413500803593547776","postedTime":"2013-12-19T02:47:28.000Z"},"inReplyTo":{"link":"httpee://twitter.com/thellymon/statuses/413500370695229441"},"favoritesCount":0,"twitter_entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[{"screen_name":"thellymon","name":"","id":920010534,"id_str":"920010534","indices":[0,10]}]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"language":{"value":"en"},"profileLocations":[{"objectType":"place","geo":{"type":"point","coordinates":[-84.38798,33.749]},"address":{"country":"United States","countryCode":"US","locality":"Atlanta","region":"Georgia","subRegion":"Fulton County"},"displayName":"Atlanta, Georgia, United 
States"}]}}
{"id":"tag:search.twitter.com,2005:413500803597758464","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:394373858","link":"httpee://www.twitter.com/Carly_Horse12","displayName":"Carly Sawyer","postedTime":"2011-10-19T23:56:56.000Z","image":"httpes://pbs.twimg.com/profile_images/378800000497869250/84266ccaf047be0cfbd8aeb73fe88544_normal.jpeg","summary":"Lindy Hopper. Theatre geek. Biology nerd. Christ follower. Creation lover. Dream chaser.","links":[{"href":null,"rel":"me"}],"friendsCount":398,"followersCount":197,"listedCount":1,"statusesCount":3220,"twitterTimeZone":"Quito","verified":false,"utcOffset":"-18000","preferredUsername":"Carly_Horse12","languages":["en"],"location":{"objectType":"place","displayName":"Charlottesville, VA"},"favoritesCount":662},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for iPhone","link":"httpee://twitter.com/download/iphone"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/Carly_Horse12/statuses/413500803597758464","body":"And this concludes the yearly screening of \"It's A Wonder Life\" in it's usual fashion with Mom and me in shambles #tears","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500803597758464","summary":"And this concludes the yearly screening of \"It's A Wonder Life\" in it's usual fashion with Mom and me in shambles #tears","link":"httpee://twitter.com/Carly_Horse12/statuses/413500803597758464","postedTime":"2013-12-19T02:47:28.000Z"},"favoritesCount":0,"twitter_entities":{"hashtags":[{"text":"tears","indices":[114,120]}],"symbols":[],"urls":[],"user_mentions":[]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"language":{"value":"en"},"profileLocations":[{"objectType":"place","geo":{"type":"point","coordinates":[-78.47668,38.02931]},"address":{"country":"United 
States","countryCode":"US","locality":"Charlottesville","region":"Virginia","subRegion":"City of Charlottesville"},"displayName":"Charlottesville, Virginia, United States"}]}}
I'm using the following R script:
library(rjson)
library(RCurl)
library(plyr)
# Path to the tweet export. The file is NDJSON: one complete JSON object
# per line, NOT a single JSON document.
raw_data<-('*filepath*/JSON test.json')
# BUG (demonstrated below): collapse="" glues all lines together with no
# separator, so the concatenated text is not valid JSON past the first
# object. rjson::fromJSON() stops after the first complete object, which
# is why only the first tweet appears in the output that follows.
data<-fromJSON(paste(readLines(raw_data),collapse=""))
data
# With only one tweet parsed, data$body is a single string, not a vector
# of 500 bodies.
tweets<-data$body
tweets
which produces the following result — I only get the data for the first tweet:
data<-fromJSON(paste(readLines(raw_data),collapse=""))
data
$id
[1] "tag:search.twitter.com,2005:413500801899044864"
$objectType
[1] "activity"
$actor
$actor$objectType
[1] "person"
$actor$id
[1] "id:twitter.com:860787127"
$actor$link
[1] "httpee://www.twitter.com/JoeGoodman11"
$actor$displayName
[1] "Joe Goodman"
$actor$postedTime
[1] "2012-10-04T03:18:54.000Z"
$actor$image
[1] "httpes://pbs.twimg.com/profile_images/3781305408/372be07ac2b312d35e1426b264891c4f_normal.jpeg"
$actor$summary
NULL
$actor$links
$actor$links[[1]]
$actor$links[[1]]$href
NULL
$actor$links[[1]]$rel
[1] "me"
$actor$friendsCount
[1] 21
$actor$followersCount
[1] 18
$actor$listedCount
[1] 0
$actor$statusesCount
[1] 177
$actor$twitterTimeZone
NULL
$actor$verified
[1] FALSE
$actor$utcOffset
NULL
$actor$preferredUsername
[1] "JoeGoodman11"
$actor$languages
[1] "en"
$actor$favoritesCount
[1] 286
$verb
[1] "post"
$postedTime
[1] "2013-12-19T02:47:28.000Z"
$generator
$generator$displayName
[1] "Twitter for Android"
$generator$link
[1] "httpee://twitter.com/download/android"
$provider
$provider$objectType
[1] "service"
$provider$displayName
[1] "Twitter"
$provider$link
[1] "httpee://www.twitter.com"
$link
[1] "httpee://twitter.com/JoeGoodman11/statuses/413500801899044864"
$body
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"
$object
$object$objectType
[1] "note"
$object$id
[1] "object:search.twitter.com,2005:413500801899044864"
$object$summary
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"
$object$link
[1] "httpee://twitter.com/JoeGoodman11/statuses/413500801899044864"
$object$postedTime
[1] "2013-12-19T02:47:28.000Z"
$favoritesCount
[1] 0
$location
$location$objectType
[1] "place"
$location$displayName
[1] "Lowell, MA"
$location$name
[1] "Lowell"
$location$country_code
[1] "United States"
$location$twitter_country_code
[1] "US"
$location$link
[1] "httpes://api.twitter.com/1.1/geo/id/d6539f049c4d05e8.json"
$location$geo
$location$geo$type
[1] "Polygon"
$location$geo$coordinates
$location$geo$coordinates[[1]]
$location$geo$coordinates[[1]][[1]]
[1] -71.38249 42.60719
$location$geo$coordinates[[1]][[2]]
[1] -71.38249 42.66676
$location$geo$coordinates[[1]][[3]]
[1] -71.27123 42.66676
$location$geo$coordinates[[1]][[4]]
[1] -71.27123 42.60719
$geo
$geo$type
[1] "Point"
$geo$coordinates
[1] 42.64284 -71.33654
$twitter_entities
$twitter_entities$hashtags
list()
$twitter_entities$symbols
list()
$twitter_entities$urls
list()
$twitter_entities$user_mentions
list()
$twitter_entities$media
$twitter_entities$media[[1]]
$twitter_entities$media[[1]]$id
[1] 4.135008e+17
$twitter_entities$media[[1]]$id_str
[1] "413500801395736576"
$twitter_entities$media[[1]]$indices
[1] 33 55
$twitter_entities$media[[1]]$media_url
[1] "httpee://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg"
$twitter_entities$media[[1]]$media_url_https
[1] "httpes://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg"
$twitter_entities$media[[1]]$url
[1] "httpee://t.co/0EumsvUCuI"
$twitter_entities$media[[1]]$display_url
[1] "pic.twitter.com/0EumsvUCuI"
$twitter_entities$media[[1]]$expanded_url
[1] "httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1"
$twitter_entities$media[[1]]$type
[1] "photo"
$twitter_entities$media[[1]]$sizes
$twitter_entities$media[[1]]$sizes$medium
$twitter_entities$media[[1]]$sizes$medium$w
[1] 600
$twitter_entities$media[[1]]$sizes$medium$h
[1] 339
$twitter_entities$media[[1]]$sizes$medium$resize
[1] "fit"
$twitter_entities$media[[1]]$sizes$thumb
$twitter_entities$media[[1]]$sizes$thumb$w
[1] 150
$twitter_entities$media[[1]]$sizes$thumb$h
[1] 150
$twitter_entities$media[[1]]$sizes$thumb$resize
[1] "crop"
$twitter_entities$media[[1]]$sizes$small
$twitter_entities$media[[1]]$sizes$small$w
[1] 340
$twitter_entities$media[[1]]$sizes$small$h
[1] 192
$twitter_entities$media[[1]]$sizes$small$resize
[1] "fit"
$twitter_entities$media[[1]]$sizes$large
$twitter_entities$media[[1]]$sizes$large$w
[1] 1023
$twitter_entities$media[[1]]$sizes$large$h
[1] 579
$twitter_entities$media[[1]]$sizes$large$resize
[1] "fit"
$twitter_filter_level
[1] "medium"
$twitter_lang
[1] "en"
$retweetCount
[1] 0
$gnip
$gnip$urls
$gnip$urls[[1]]
$gnip$urls[[1]]$url
[1] "httpee://t.co/0EumsvUCuI"
$gnip$urls[[1]]$expanded_url
[1] "httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1"
$gnip$urls[[1]]$expanded_status
[1] 200
$gnip$language
$gnip$language$value
[1] "en"
and
tweets<-data$body
tweets
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"
The aim is for tweets to show the body field for all 500 tweets. Any help very gratefully received!
Your paste
call is just concatenating the individual lines without inserting the correct JSON separators. If you have something like
# Join the per-line objects with commas and bracket the result so the
# whole file parses as one JSON array of tweet objects.
data <- fromJSON(paste0("[", paste(readLines(raw_data), collapse = ","), "]"))
then the individual lines will get separated by commas, and the whole thing will get wrapped in JSON's square-bracket notation for an array of objects. You can then extract a top-level property from each element of the data array as
# Extract the "body" field from each parsed tweet. vapply() (rather than
# sapply()) guarantees a character vector even for zero tweets, where
# sapply() would silently return an empty list instead.
bodies <- vapply(data, `[[`, character(1), "body")