I wrote code to scrape a website that lists daycares. The code is able to get the relevant information on the first of 100+ pages, but I have no clue how to adapt it to scrape the site past page 1. Each page lists only 25 daycares, and there are 150+ pages to scrape. I read something about looking for the POST or GET requests that are sent when clicking on the page numbers, but I wasn't able to fix my code using that information (I believe it was a user named "manhattan" who shared a solution). Here's a copy of the current code. I would appreciate some help:
import requests, bs4, re
# Fetch the first page of the NJ licensed child-care search results.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
cc = requests.get(
    'https://data.nj.gov/childcare_results?center=&phone=&capacity_low=&capacity_high=&county=&city=&addr1=&zip=',
    timeout=30,
)
cc.raise_for_status()
cc1 = bs4.BeautifulSoup(cc.text, "html.parser")
# Goal: collect only the <a href="..."> links that point to each daycare's
# inspection report.
# NOTE: 'div a' matches every anchor nested inside any <div>, so it returns
# many more links than wanted; the selector still needs to be narrowed.
# This only covers the first page of results; the remaining 150+ pages of
# listings are not fetched here.
cc2 = cc1.select('div a')
print(len(cc2))
print(cc2)
This is one of the easiest ways to automate the whole process, short of downloading the dataset. To find the link, take a good look at the XHR section under the Network tab in your browser's dev tools. By changing the last number (25) of this URL, https://data.nj.gov/views/INLINE/rows.json?accessType=WEBSITE&method=getByIds&asHashes=true&start=0&length=25, you can get URLs up to the highest number you set. Give this a shot:
import requests
from bs4 import BeautifulSoup
# JSON payload POSTed to the rows.json endpoint. It describes the
# "Licensed Child Care Centers" view (id "cru5-4rmm"): its column definitions,
# view metadata, and the filter/order query. All filter values are empty
# strings (capacity bounded by 0..9999999), and results are ordered by the
# "center" column ascending.
# Line breaks appear only between elements, never inside a string literal.
data = {
    "columns":[
        {"id":346250849,"name":"COUNTY","dataTypeName":"text","description":"County that Childcare Center is located in.","fieldName":"county","position":1,"tableColumnId":21926709,"width":86,"format":{},"metadata":{}},
        {"id":346250850,"name":"CENTER","dataTypeName":"text","description":"Childcare Center name.","fieldName":"center","position":2,"tableColumnId":21926710,"width":342,"format":{},"metadata":{}},
        {"id":346250851,"name":"PHONE","dataTypeName":"phone","description":"Telephone number of Childcare Center.","fieldName":"phone","position":3,"tableColumnId":21926711,"width":142,"format":{"align":"left"},"metadata":{}},
        {"id":346250852,"name":"AGES","dataTypeName":"text","description":"Ages of children that Childcare Center provides services to.","fieldName":"ages","position":4,"tableColumnId":21926712,"width":80,"format":{"align":"center"},"metadata":{}},
        {"id":346250853,"name":"CAPACITY","dataTypeName":"number","description":"Capacity of Childcare Center.","fieldName":"capacity","position":5,"tableColumnId":21926713,"width":80,"format":{"precisionStyle":"standard","noCommas":"false","align":"center"},"metadata":{}},
        {"id":346250854,"name":"CITY","dataTypeName":"text","description":"City that Childcare Center is located in.","fieldName":"city","position":6,"tableColumnId":21926714,"width":183,"format":{},"metadata":{}},
        {"id":346250855,"name":"ADDR1","dataTypeName":"text","description":"Street Address of Childcare Center.","fieldName":"addr1","position":7,"tableColumnId":21926715,"width":232,"format":{},"metadata":{}},
        {"id":346250856,"name":"ADDR2","dataTypeName":"text","description":"Street Address of Childcare Center.","fieldName":"addr2","position":8,"tableColumnId":21926716,"width":241,"format":{},"metadata":{}},
        {"id":346250857,"name":"ZIP","dataTypeName":"text","description":"Zip code of Childcare Center location.","fieldName":"zip","position":9,"tableColumnId":21926717,"width":73,"format":{"precisionStyle":"standard","noCommas":"false","align":"right"},"metadata":{}},
        {"id":346250858,"name":"INSPECTIONS","dataTypeName":"dataset_link","description":"Link to webpage with inspection reports for the center","fieldName":"inspections","position":10,"tableColumnId":31789576,"width":100,"format":{},"metadata":{}}
    ],
    "id":"cru5-4rmm",
    "name":"Licensed Child Care Centers",
    "attribution":"New Jersey Department of Children and Families",
    "category":"Public Safety",
    "description":"Searchable listing of all Licensed Child Care Centers and Inspection Reports.",
    "displayType":"table",
    "hideFromCatalog":False,
    "hideFromDataJson":False,
    "iconUrl":"fileId:_Judwh1-EhTD-ocxNjEi_f1JVO4iylkkrFqVjAbCZ6A",
    "publicationAppendEnabled":False,
    "metadata":{
        "rdfSubject":"0",
        "attachments":[{"filename":"Licensed_Child_Care_Centers_Explorer_Metadata_Description.docx","assetId":"VDUg-zFJ0y2Fsti8oTr4gkdMiqvg0a2JTk7Co7uxYRg","blobId":"","name":"Licensed_Child_Care_Centers_Explorer_Metadata_Description.docx"}],
        "custom_fields":{
            "Asset Details":{"Asset Posting Frequency":"Monthly","Geographic Coverage":"State","Asset Collection Frequency":"Monthly","Asset End Date":"Ongoing","Asset Provider Organization":"Children & Families","Asset Start Date":"1/15/15"},
            "Common Core":{"Contact Email":"ContactDataNJ@tech.nj.gov","Contact Name":"Data NJ Team","Publisher":"NJ OIT Open Data Center"}
        },
        "availableDisplayTypes":["table","fatrow","page"],
        "rowLabel":"Row",
        "renderTypeConfig":{"visible":{"table":True}},
        "jsonQuery":{
            "where":{"operator":"AND","children":[
                {"operator":"CONTAINS","columnFieldName":"center","value":""},
                {"operator":"CONTAINS","columnFieldName":"phone","value":""},
                {"operator":"GREATER_THAN_OR_EQUALS","columnFieldName":"capacity","value":"0"},
                {"operator":"LESS_THAN_OR_EQUALS","columnFieldName":"capacity","value":"9999999"},
                {"operator":"CONTAINS","columnFieldName":"county","value":""},
                {"operator":"CONTAINS","columnFieldName":"city","value":""},
                {"operator":"CONTAINS","columnFieldName":"addr1","value":""},
                {"operator":"CONTAINS","columnFieldName":"zip","value":""},
                {"operator":"CONTAINS","columnFieldName":"zip","value":""}
            ]},
            "order":[{"columnFieldName":"center","ascending":True}]
        }
    },
    "query":{
        "filterCondition":{"value":"AND","children":[
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"center"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"phone"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"GREATER_THAN_OR_EQUALS","children":[{"type":"column","columnFieldName":"capacity"},{"value":"0","type":"literal"}],"type":"operator"},
            {"value":"LESS_THAN_OR_EQUALS","children":[{"type":"column","columnFieldName":"capacity"},{"value":"9999999","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"county"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"city"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"addr1"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"zip"},{"value":"","type":"literal"}],"type":"operator"},
            {"value":"CONTAINS","children":[{"type":"column","columnFieldName":"zip"},{"value":"","type":"literal"}],"type":"operator"}
        ],"type":"operator"},
        "orderBys":[{"expression":{"type":"column","columnId":346250850},"ascending":True}]
    },
    "tags":["ool","child care","child care centers","dcf"],
    "flags":["default","restorable"],
    "originalViewId":"cru5-4rmm",
    "displayFormat":{}
}
# POST the view description to the rows endpoint and print the inspection
# report link found in each returned row.
# The request URL asks for rows starting at offset 0 with length=25; change
# those query parameters to request a different slice of rows.
res = requests.post(
    "https://data.nj.gov/views/INLINE/rows.json?accessType=WEBSITE&method=getByIds&asHashes=true&start=0&length=25",
    json=data,
    timeout=30,  # avoid hanging forever on a stalled connection
)
res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
items = res.json()
for item in items:
    # '346250858' is the id of the INSPECTIONS column in the payload; the cell
    # is parsed as an HTML fragment that should contain an <a href="..."> tag.
    cell = item.get('346250858')
    if not cell:
        # Row has no inspections cell; nothing to extract.
        continue
    links = BeautifulSoup(cell, "lxml")
    anchor = links.find("a")
    if anchor is None or not anchor.has_attr("href"):
        # Cell did not contain a usable link; skip rather than crash.
        continue
    link = anchor["href"]
    print(link)