python, json, api, opencorporates

Check response using urllib2


I am trying to access pages by incrementing the page counter using the OpenCorporates API. The problem is that sometimes a page contains no useful data. For example, for jurisdiction_code = ae_az the URL returns a page showing just this:

{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}

which is effectively empty. How can I check for such data and skip over it, moving on to the next jurisdiction?

This is my code:

import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
   id = line.strip('\n')
   # all query parameters, api_token included, joined with '&'
   url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}&api_token=ab123cd45'
   i = 0
   directory = id
   os.makedirs(directory)
   while True:
      i += 1
      print url.format(id, i)
      try:
        response = urllib2.urlopen(url.format(id, i))
      except urllib2.HTTPError:
        break
      content = response.read()
      fo = str(i) + '.json'
      # write this page's JSON to <jurisdiction>/<page>.json
      OUTFILE = os.path.join(directory, fo)
      with open(OUTFILE, 'w') as out:
        out.write(content)

Solution

  • Interpret the response you get back (you already know it's JSON) and check whether the data you want is there.

    ...
    content = response.read()
    data = json.loads(content)
    if not data.get('results', {}).get('companies'):
        break
    ...
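
    Dropped into your urllib2 loop, the check reads like this. This is a minimal sketch keeping your placeholder token; the second break assumes pages past the end come back with an empty companies list, like the ae_az example above:

    import urllib2
    import json, os

    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}&api_token=ab123cd45'

    with open('codes') as codes_file:
        for line in codes_file:
            id = line.strip('\n')
            try:
                os.makedirs(id)
            except os.error:
                pass  # directory already exists
            i = 0
            while True:
                i += 1
                try:
                    response = urllib2.urlopen(url.format(id, i))
                except urllib2.HTTPError:
                    break
                data = json.loads(response.read())
                # empty page like the ae_az example: stop, next jurisdiction
                if not data.get('results', {}).get('companies'):
                    break
                with open(os.path.join(id, str(i) + '.json'), 'w') as out:
                    out.write(json.dumps(data))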
    

    Here's your code rewritten with Requests, using the check above. It is nowhere near as robust or clean as it should be, but it demonstrates the path you might want to take. The rate-limit wait is a guess and doesn't seem to work. Remember to put your actual API key in.

    import json
    import os
    from time import sleep
    import requests
    
    url = 'http://api.opencorporates.com/v0.2/companies/search'
    token = 'ab123cd45'
    rate = 20  # seconds to wait after rate limited
    
    with open('codes') as f:
        codes = [l.strip('\n') for l in f]
    
    
    def get_page(code, page, **kwargs):
        # fetch one page of results, retrying while the API reports an error
        params = {
            # 'api_token': token,
            'jurisdiction_code': code,
            'page': page,
        }
        params.update(kwargs)
    
        while True:
            r = requests.get(url, params=params)
    
            try:
                data = r.json()
            except ValueError:
                # body wasn't JSON at all; give up on this page
                return None
    
            if 'error' in data:
                # most likely rate limited: report, wait, retry
                print data['error']['message']
                sleep(rate)
                continue
    
            return data['results']
    
    
    def dump_page(code, page, data):
        with open(os.path.join(code, str(page) + '.json'), 'w') as f:
            json.dump(data, f)
    
    
    for code in codes:
        try:
            os.makedirs(code)
        except os.error:
            pass  # directory already exists
    
        data = get_page(code, 1)
        if data is None or not data.get('companies'):
            continue  # nothing useful here (like ae_az above); next jurisdiction
    
        dump_page(code, 1, data['companies'])
    
        # pages 2..total_pages; page 1 was fetched and dumped above
        for page in xrange(2, int(data.get('total_pages', 1)) + 1):
            data = get_page(code, page)
            if data is None:
                break
    
            dump_page(code, page, data['companies'])
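
    Since the fixed 20-second sleep is only a guess, one alternative worth trying is to back off exponentially instead. This is an untested sketch, not something the OpenCorporates docs promise; it reuses url, rate, and sleep from the script above, and max_tries is a made-up cutoff:

    def get_page_backoff(code, page, max_tries=6, **kwargs):
        # drop-in alternative to get_page: double the wait after each error
        params = {
            # 'api_token': token,
            'jurisdiction_code': code,
            'page': page,
        }
        params.update(kwargs)
    
        wait = rate
        for _ in xrange(max_tries):
            r = requests.get(url, params=params)
    
            try:
                data = r.json()
            except ValueError:
                return None
    
            if 'error' in data:
                print data['error']['message']
                sleep(wait)
                wait *= 2  # back off harder each time we're refused
                continue
    
            return data['results']
    
        return None  # still erroring after max_tries attempts; give up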