I am trying to get a report for 3 months, for this, I need to make multiple requests and append the results to a list because API only returns 100,000
rows per request. There is a variable returned from the API named nextPageToken
which I need to pass into the next query to get the next 100,000
rows of the report. I am having a difficult time trying to do this.
Here is my code:
def initialize_analyticsreporting():
'''Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
'''
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
list = []
def get_report(analytics, pageTokenVariable):
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'pageSize': 100000,
'dateRanges': [{'startDate': '90daysAgo', 'endDate': 'yesterday'}],
'metrics': [{'expression': 'ga:adClicks'}, {'expression': 'ga:impressions'}, {'expression': 'ga:adCost'}, {'expression': 'ga:CTR'}, {'expression': 'ga:CPC'}, {'expression': 'ga:costPerTransaction'}, {'expression': 'ga:transactions'}, {'expression': 'ga:transactionsPerSession'}, {'expression': 'ga:pageviews'}, {'expression': 'ga:timeOnPage'}],
"pageToken": pageTokenVariable,
'dimensions': [{'name': 'ga:adMatchedQuery'}, {'name': 'ga:campaign'}, {'name': 'ga:adGroup'}, {'name': 'ga:adwordsCustomerID'}, {'name': 'ga:date'}],
'orderBys': [{'fieldName': 'ga:impressions', 'sortOrder': 'DESCENDING'}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adwordsCustomerID',
'operator': 'EXACT',
'expressions': 'abc',
'not': 'True'
}]
}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adMatchedQuery',
'operator': 'EXACT',
'expressions': '(not set)',
'not': 'True'
}]
}]
}]
}
).execute()
analytics = initialize_analyticsreporting()
response = get_report(analytics, "0")
for report in response.get('reports', []):
pagetoken = report.get('nextPageToken', None)
print(pagetoken)
#------printing the pagetoken here returns `100,000` which is expected
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get(
'metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
# Append that data to a list as a dictionary
# pagination function
while pagetoken: # This says while there is info in the nextPageToken get the data, process it and add to the list
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0]['nextPageToken']
print(pagetoken)
#------printing the pagetoken here returns `200,000` as is expected but the data being pulled is the same as for the first batch and so on. While in the loop the pagetoken is being incremented but it does not retrieve new data
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
df = pd.DataFrame(list)
print(df) # Append that data to a list as a dictionary
df.to_csv('full_dataset.csv', encoding="utf-8", index=False)
Where is my mistake trying to pass the pagetoken?
So you're updating the pagetoken in pagetoken = response['reports'][0]['nextPageToken']
but shouldn't you also update rows
in the while loop with new data?
Something like this.
while pagetoken:
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0].get('nextPageToken')
for report in reponse.get('reports', []):
rows = report.get('data', {}).get('rows', [])
for row in rows: