json, r, twitter, rjson, rjsonio

fromJSON only reads first line in R


I've got a sample JSON file with about 500 tweets which I'd like to get into a dataframe.

The first three tweets from the JSON file are as follows (URLs have been deliberately altered to comply with Stack Overflow's rules on links):

{"id":"tag:search.twitter.com,2005:413500801899044864","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:860787127","link":"httpee://www.twitter.com/JoeGoodman11","displayName":"Joe Goodman","postedTime":"2012-10-04T03:18:54.000Z","image":"httpes://pbs.twimg.com/profile_images/3781305408/372be07ac2b312d35e1426b264891c4f_normal.jpeg","summary":null,"links":[{"href":null,"rel":"me"}],"friendsCount":21,"followersCount":18,"listedCount":0,"statusesCount":177,"twitterTimeZone":null,"verified":false,"utcOffset":null,"preferredUsername":"JoeGoodman11","languages":["en"],"favoritesCount":286},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for Android","link":"httpee://twitter.com/download/android"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/JoeGoodman11/statuses/413500801899044864","body":"Hard at work studying for finals httpee://t.co/0EumsvUCuI","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500801899044864","summary":"Hard at work studying for finals httpee://t.co/0EumsvUCuI","link":"httpee://twitter.com/JoeGoodman11/statuses/413500801899044864","postedTime":"2013-12-19T02:47:28.000Z"},"favoritesCount":0,"location":{"objectType":"place","displayName":"Lowell, MA","name":"Lowell","country_code":"United 
States","twitter_country_code":"US","link":"httpes://api.twitter.com/1.1/geo/id/d6539f049c4d05e8.json","geo":{"type":"Polygon","coordinates":[[[-71.382491,42.607189],[-71.382491,42.66676],[-71.271231,42.66676],[-71.271231,42.607189]]]}},"geo":{"type":"Point","coordinates":[42.6428357,-71.33654]},"twitter_entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[],"media":[{"id":413500801395736576,"id_str":"413500801395736576","indices":[33,55],"media_url":"httpee://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg","media_url_https":"httpes://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg","url":"httpee://t.co/0EumsvUCuI","display_url":"pic.twitter.com/0EumsvUCuI","expanded_url":"httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1","type":"photo","sizes":{"medium":{"w":600,"h":339,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":340,"h":192,"resize":"fit"},"large":{"w":1023,"h":579,"resize":"fit"}}}]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"urls":[{"url":"httpee://t.co/0EumsvUCuI","expanded_url":"httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1","expanded_status":200}],"language":{"value":"en"}}}
{"id":"tag:search.twitter.com,2005:413500803593547776","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:168228121","link":"httpee://www.twitter.com/rvzigvdhiv","displayName":"Razi الرازي Gadhia","postedTime":"2010-07-18T19:28:45.000Z","image":"httpes://pbs.twimg.com/profile_images/412269827399495680/44JZWZPz_normal.jpeg","summary":"Why so serious? \n#2005spellingbeechamp \n#wood","links":[{"href":null,"rel":"me"}],"friendsCount":196,"followersCount":300,"listedCount":0,"statusesCount":4236,"twitterTimeZone":"Eastern Time (US & Canada)","verified":false,"utcOffset":"-18000","preferredUsername":"rvzigvdhiv","languages":["en"],"location":{"objectType":"place","displayName":"ATL"},"favoritesCount":4316},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for iPhone","link":"http://twitter.com/download/iphone"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/rvzigvdhiv/statuses/413500803593547776","body":"@thellymon haha aight homie I'll let you know","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500803593547776","summary":"@thellymon haha aight homie I'll let you know","link":"httpee://twitter.com/rvzigvdhiv/statuses/413500803593547776","postedTime":"2013-12-19T02:47:28.000Z"},"inReplyTo":{"link":"httpee://twitter.com/thellymon/statuses/413500370695229441"},"favoritesCount":0,"twitter_entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[{"screen_name":"thellymon","name":"","id":920010534,"id_str":"920010534","indices":[0,10]}]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"language":{"value":"en"},"profileLocations":[{"objectType":"place","geo":{"type":"point","coordinates":[-84.38798,33.749]},"address":{"country":"United States","countryCode":"US","locality":"Atlanta","region":"Georgia","subRegion":"Fulton County"},"displayName":"Atlanta, Georgia, United 
States"}]}}
{"id":"tag:search.twitter.com,2005:413500803597758464","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:394373858","link":"httpee://www.twitter.com/Carly_Horse12","displayName":"Carly Sawyer","postedTime":"2011-10-19T23:56:56.000Z","image":"httpes://pbs.twimg.com/profile_images/378800000497869250/84266ccaf047be0cfbd8aeb73fe88544_normal.jpeg","summary":"Lindy Hopper. Theatre geek. Biology nerd. Christ follower. Creation lover. Dream chaser.","links":[{"href":null,"rel":"me"}],"friendsCount":398,"followersCount":197,"listedCount":1,"statusesCount":3220,"twitterTimeZone":"Quito","verified":false,"utcOffset":"-18000","preferredUsername":"Carly_Horse12","languages":["en"],"location":{"objectType":"place","displayName":"Charlottesville, VA"},"favoritesCount":662},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for iPhone","link":"httpee://twitter.com/download/iphone"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/Carly_Horse12/statuses/413500803597758464","body":"And this concludes the yearly screening of \"It's A Wonder Life\" in it's usual fashion with Mom and me in shambles #tears","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500803597758464","summary":"And this concludes the yearly screening of \"It's A Wonder Life\" in it's usual fashion with Mom and me in shambles #tears","link":"httpee://twitter.com/Carly_Horse12/statuses/413500803597758464","postedTime":"2013-12-19T02:47:28.000Z"},"favoritesCount":0,"twitter_entities":{"hashtags":[{"text":"tears","indices":[114,120]}],"symbols":[],"urls":[],"user_mentions":[]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"language":{"value":"en"},"profileLocations":[{"objectType":"place","geo":{"type":"point","coordinates":[-78.47668,38.02931]},"address":{"country":"United 
States","countryCode":"US","locality":"Charlottesville","region":"Virginia","subRegion":"City of Charlottesville"},"displayName":"Charlottesville, Virginia, United States"}]}}

I'm using the following R script:

# Attach the packages; only rjson's fromJSON is called in the lines shown here
# (RCurl and plyr are attached but not used in this snippet).
library(rjson)
library(RCurl)
library(plyr)
# Path to the input file (the directory part is a placeholder).
raw_data<-('*filepath*/JSON test.json')
# Read every line of the file, glue the lines together with no separator, and
# parse the resulting single string.
# NOTE(review): the file holds one JSON object per tweet, so the glued string is
# several objects back to back rather than one JSON document - presumably why
# only the first tweet comes back; confirm against the rjson documentation.
data<-fromJSON(paste(readLines(raw_data),collapse=""))
data
# Pull the top-level "body" field out of the parsed result and print it.
tweets<-data$body
tweets

which produces the following result (I only get the data for the first tweet):

data<-fromJSON(paste(readLines(raw_data),collapse=""))
data

$id
[1] "tag:search.twitter.com,2005:413500801899044864"

$objectType
[1] "activity"

$actor
$actor$objectType
[1] "person"

$actor$id
[1] "id:twitter.com:860787127"

$actor$link
[1] "httpee://www.twitter.com/JoeGoodman11"

$actor$displayName
[1] "Joe Goodman"

$actor$postedTime
[1] "2012-10-04T03:18:54.000Z"

$actor$image
[1] "httpes://pbs.twimg.com/profile_images/3781305408/372be07ac2b312d35e1426b264891c4f_normal.jpeg"

$actor$summary
NULL

$actor$links
$actor$links[[1]]
$actor$links[[1]]$href
NULL

$actor$links[[1]]$rel
[1] "me"



$actor$friendsCount
[1] 21

$actor$followersCount
[1] 18

$actor$listedCount
[1] 0

$actor$statusesCount
[1] 177

$actor$twitterTimeZone
NULL

$actor$verified
[1] FALSE

$actor$utcOffset
NULL

$actor$preferredUsername
[1] "JoeGoodman11"

$actor$languages
[1] "en"

$actor$favoritesCount
[1] 286


$verb
[1] "post"

$postedTime
[1] "2013-12-19T02:47:28.000Z"

$generator
$generator$displayName
[1] "Twitter for Android"

$generator$link
[1] "httpee://twitter.com/download/android"


$provider
$provider$objectType
[1] "service"

$provider$displayName
[1] "Twitter"

$provider$link
[1] "httpee://www.twitter.com"


$link
[1] "httpee://twitter.com/JoeGoodman11/statuses/413500801899044864"

$body
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"

$object
$object$objectType
[1] "note"

$object$id
[1] "object:search.twitter.com,2005:413500801899044864"

$object$summary
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"

$object$link
[1] "httpee://twitter.com/JoeGoodman11/statuses/413500801899044864"

$object$postedTime
[1] "2013-12-19T02:47:28.000Z"


$favoritesCount
[1] 0

$location
$location$objectType
[1] "place"

$location$displayName
[1] "Lowell, MA"

$location$name
[1] "Lowell"

$location$country_code
[1] "United States"

$location$twitter_country_code
[1] "US"

$location$link
[1] "httpes://api.twitter.com/1.1/geo/id/d6539f049c4d05e8.json"

$location$geo
$location$geo$type
[1] "Polygon"

$location$geo$coordinates
$location$geo$coordinates[[1]]
$location$geo$coordinates[[1]][[1]]
[1] -71.38249  42.60719

$location$geo$coordinates[[1]][[2]]
[1] -71.38249  42.66676

$location$geo$coordinates[[1]][[3]]
[1] -71.27123  42.66676

$location$geo$coordinates[[1]][[4]]
[1] -71.27123  42.60719





$geo
$geo$type
[1] "Point"

$geo$coordinates
[1]  42.64284 -71.33654


$twitter_entities
$twitter_entities$hashtags
list()

$twitter_entities$symbols
list()

$twitter_entities$urls
list()

$twitter_entities$user_mentions
list()

$twitter_entities$media
$twitter_entities$media[[1]]
$twitter_entities$media[[1]]$id
[1] 4.135008e+17

$twitter_entities$media[[1]]$id_str
[1] "413500801395736576"

$twitter_entities$media[[1]]$indices
[1] 33 55

$twitter_entities$media[[1]]$media_url
[1] "httpee://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg"

$twitter_entities$media[[1]]$media_url_https
[1] "httpes://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg"

$twitter_entities$media[[1]]$url
[1] "httpee://t.co/0EumsvUCuI"

$twitter_entities$media[[1]]$display_url
[1] "pic.twitter.com/0EumsvUCuI"

$twitter_entities$media[[1]]$expanded_url
[1] "httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1"

$twitter_entities$media[[1]]$type
[1] "photo"

$twitter_entities$media[[1]]$sizes
$twitter_entities$media[[1]]$sizes$medium
$twitter_entities$media[[1]]$sizes$medium$w
[1] 600

$twitter_entities$media[[1]]$sizes$medium$h
[1] 339

$twitter_entities$media[[1]]$sizes$medium$resize
[1] "fit"


$twitter_entities$media[[1]]$sizes$thumb
$twitter_entities$media[[1]]$sizes$thumb$w
[1] 150

$twitter_entities$media[[1]]$sizes$thumb$h
[1] 150

$twitter_entities$media[[1]]$sizes$thumb$resize
[1] "crop"


$twitter_entities$media[[1]]$sizes$small
$twitter_entities$media[[1]]$sizes$small$w
[1] 340

$twitter_entities$media[[1]]$sizes$small$h
[1] 192

$twitter_entities$media[[1]]$sizes$small$resize
[1] "fit"


$twitter_entities$media[[1]]$sizes$large
$twitter_entities$media[[1]]$sizes$large$w
[1] 1023

$twitter_entities$media[[1]]$sizes$large$h
[1] 579

$twitter_entities$media[[1]]$sizes$large$resize
[1] "fit"






$twitter_filter_level
[1] "medium"

$twitter_lang
[1] "en"

$retweetCount
[1] 0

$gnip
$gnip$urls
$gnip$urls[[1]]
$gnip$urls[[1]]$url
[1] "httpee://t.co/0EumsvUCuI"

$gnip$urls[[1]]$expanded_url
[1] "httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1"

$gnip$urls[[1]]$expanded_status
[1] 200



$gnip$language
$gnip$language$value
[1] "en"

and

tweets<-data$body
tweets

[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"

The aim is for tweets to show the body field for all 500 tweets. Any help very gratefully received!


Solution

  • Your `paste` call just concatenates the individual lines without inserting the separators that JSON requires between objects. If you instead use something like

    # Join the lines with commas and wrap them in [ ] so the text parses as one
    # JSON array containing one object per line.
    data <- fromJSON(
      paste0("[", paste(readLines(raw_data), collapse = ","), "]")
    )
    

    then individual lines will get separated by a comma, and the whole thing will get wrapped in JSON's square-bracket notation for an array of objects. You can then extract a top-level property from each element of the `data` list as

    # Take the "body" entry of each parsed tweet.
    bodies <- sapply(data, function(tweet) tweet[["body"]])