Hi, I am working on a web scraper. At first I tried to scrape using PHP cURL,
but I ran into the problem that I could not scrape sites that load their content through AJAX,
so I switched to PhantomJS and CasperJS.
Now I have successfully installed the WebKit-based tools and can scrape data from any website, but I am unable to save the data in a database for later use. What I want to do is save whatever data I have scraped to a MySQL database.
Is there any way I can achieve this? I have tried sending an AJAX
request to send the data to the database, but it failed.
I came up with another solution: when I scrape the data from the specified website, I push the data to an array called data[]
and then write that data to a .json
file, where each batch of data is saved as an array of objects produced by JSON.stringify(data).
Now, I don't know how I can read that file's data and save it in the database. Is it possible, right after the scraping has finished, to grab the data from that .json
file and save it to the database?
For now, just take this code as an example:
// Minimal CasperJS example: open a page, collect its title into `data`, and
// persist the collected values to file.json as a JSON array.
var casper = require('casper').create();
var file = require('fs');
var data = [];

casper.start('http://casperjs.org/', function() {
    data.push(this.getTitle());
    // Write with "w" (overwrite) rather than "a" (append): appending a fresh
    // JSON array on every run leaves "[...][...]" in the file, which cannot
    // be parsed back as JSON when the data is later loaded into the database.
    file.write("file.json", JSON.stringify(data), "w");
});

casper.run();
A simple solution I found is to make an AJAX request to the server inside the evaluate function:
// Scrapes the job-detail sections from the current page and POSTs them, as a
// JSON string, to the local PHP endpoint. The scraped array is also assigned
// to `details` (not declared in this snippet — presumably defined elsewhere).
casper.then(function() {
    details = this.evaluate(function() {
        // This function is passed to evaluate(), so it can only rely on what
        // the page provides: the DOM, and `$`.
        // NOTE(review): `$` assumes the scraped page already loads jQuery — confirm.
        var elDet = document.getElementsByClassName("job-description-column")[0];
        var linkedData = [];
        if (!elDet) {
            // The expected container is missing: nothing to scrape or send.
            return linkedData;
        }

        var sections = elDet.children[2].children[0].children;
        var sectionCount = sections.length;

        // Title, location and date are the same for every entry, so look them
        // up once instead of on every loop iteration.
        var header = elDet.children[0].children[0].children;
        var title = header[0].textContent;
        var loc = header[1].textContent;
        var date = header[2].textContent;

        for (var i = 0; i < sectionCount; i++) {
            var node = sections[i];
            if (node.nodeName === "H3" && node.id !== "if-this-sounds-like-you,-apply") {
                // A heading is paired with the element that follows it; a
                // trailing heading with no sibling gets an empty description
                // instead of throwing on `undefined.textContent`.
                var next = sections[i + 1];
                linkedData.push({
                    head: node.textContent,
                    description: next ? next.textContent : "",
                    title: title,
                    loc: loc,
                    date: date
                });
                // Skip the element that was just consumed as the description.
                i++;
            } else {
                linkedData.push({
                    head: "No Head",
                    description: node.textContent,
                    title: title,
                    loc: loc,
                    date: date
                });
            }
        }

        var s = JSON.stringify(linkedData);
        // Log the JSON string rather than the array so the output is readable.
        console.log(s);

        // NOTE(review): this is a cross-origin request from the scraped page
        // to localhost; PhantomJS presumably needs --web-security=no for it
        // to go through — confirm.
        $.ajax({
            method: "POST",
            // `type` is the pre-1.9 jQuery alias of `method`; set both so the
            // request is a POST whatever jQuery version the page ships.
            type: "POST",
            url: "http://localhost/fiverr/Crawl%20The%20Jobs/modal_scripts.php",
            // Pass an object so jQuery URL-encodes the values. Concatenating
            // the raw JSON into the body breaks as soon as the scraped text
            // contains "&", "=", "+" or "%".
            data: {
                add_jobdets: true,
                job_details: s
            },
            // Synchronous so the request completes before evaluate() returns.
            async: false
        });

        return linkedData;
    });
});