I'm using the code from netsniff.js to generate a HAR file,
and I want to extend it to generate HAR files from multiple links given in an array (named links
in my code below).
There is another question here, "Using Multiple page.open in Single Script", that might help me, but I have no idea how to apply the given solution to my code.
Below is my code (it logs FAIL to load the address
in the output file if the links
array contains more than one item):
"use strict";
/**
 * Formats a Date as an ISO 8601 timestamp in UTC: YYYY-MM-DDTHH:mm:ss.sssZ.
 *
 * The UTC getters are used on purpose: the trailing 'Z' declares the value
 * to be UTC, so formatting the local-time fields would produce a timestamp
 * that is off by the local timezone offset.
 *
 * @param {Date} date - The date to format.
 * @returns {string} The formatted timestamp.
 */
function formatISODate(date) {
    // Left-pad to two digits (month, day, hours, minutes, seconds).
    function pad(n) { return n < 10 ? '0' + n : n; }
    // Left-pad to three digits (milliseconds).
    function ms(n) { return n < 10 ? '00' + n : n < 100 ? '0' + n : n; }
    return date.getUTCFullYear() + '-' +
        pad(date.getUTCMonth() + 1) + '-' +
        pad(date.getUTCDate()) + 'T' +
        pad(date.getUTCHours()) + ':' +
        pad(date.getUTCMinutes()) + ':' +
        pad(date.getUTCSeconds()) + '.' +
        ms(date.getUTCMilliseconds()) + 'Z';
}

// Install the fallback only when the engine has no native toISOString.
if (!Date.prototype.toISOString) {
    Date.prototype.toISOString = function () {
        return formatISODate(this);
    };
}
// Entries of the HAR most recently built by createHAR. Declared at script
// level because other code in this file assigns to it.
var entries = [];

/**
 * Builds a HAR 1.2 log object for a single page.
 *
 * Reads the script-level `phantom` (for the creator version) and `page`
 * (for `page.endTime` / `page.startTime`) objects in addition to its
 * parameters.
 *
 * @param {string} address - Page URL; used as the page id and as each entry's pageref.
 * @param {string} title - Page title.
 * @param {Date} startTime - When the page load started.
 * @param {Array} resources - Records of the form { request, startReply, endReply }.
 *     Records missing any of the three parts are skipped.
 * @returns {Object} An object with a single `log` property holding the HAR data.
 */
function createHAR(address, title, startTime, resources)
{
    // Start from an empty list on every call. Without this, entries from
    // previously processed pages leak into every later HAR.
    entries = [];

    resources.forEach(function (resource) {
        var request = resource.request,
            startReply = resource.startReply,
            endReply = resource.endReply;

        // Only fully captured request/response pairs can be described.
        if (!request || !startReply || !endReply) {
            return;
        }

        // Skip inline data:image URIs; they are not part of the HAR output.
        if (request.url.match(/(^data:image\/.*)/i)) {
            return;
        }

        entries.push({
            startedDateTime: request.time.toISOString(),
            time: endReply.time - request.time,
            request: {
                method: request.method,
                url: request.url,
                httpVersion: "HTTP/1.1",
                cookies: [],
                headers: request.headers,
                queryString: [],
                headersSize: -1,
                bodySize: -1
            },
            response: {
                status: endReply.status,
                statusText: endReply.statusText,
                httpVersion: "HTTP/1.1",
                cookies: [],
                headers: endReply.headers,
                redirectURL: "",
                headersSize: -1,
                bodySize: startReply.bodySize,
                content: {
                    size: startReply.bodySize,
                    mimeType: endReply.contentType
                }
            },
            cache: {},
            timings: {
                blocked: 0,
                dns: -1,
                connect: -1,
                send: 0,
                wait: startReply.time - request.time,
                receive: endReply.time - startReply.time,
                ssl: -1
            },
            pageref: address
        });
    });

    return {
        log: {
            version: '1.2',
            creator: {
                name: "PhantomJS",
                version: phantom.version.major + '.' + phantom.version.minor +
                    '.' + phantom.version.patch
            },
            pages: [{
                startedDateTime: startTime.toISOString(),
                id: address,
                title: title,
                pageTimings: {
                    onLoad: page.endTime - page.startTime
                }
            }],
            entries: entries
        }
    };
}
var page = require('webpage').create()
var fs = require('fs');
// Number of pages started so far; gives each HAR file a distinct name.
var count = 0;

/**
 * Loads the given links one at a time and writes one HAR file per page
 * ('file0.har', 'file1.har', ...).
 *
 * Links are taken from the END of the array with pop(), so the array is
 * consumed (mutated) and the last link is processed first. The next page is
 * opened only after the current one has been written. When the list is
 * exhausted the script exits; if a page fails to load the script exits with
 * code 1.
 *
 * Uses the script-level `page`, `fs`, `phantom`, `entries` and `createHAR`.
 *
 * @param {Array} links - URLs still to be processed.
 */
function processSites(links)
{
    // How long to wait after the load callback before building the HAR.
    var settleDelayMs = 10000;

    // Nothing (left) to do: finish instead of trying to open `undefined`.
    if (!links || links.length === 0) {
        phantom.exit();
        return;
    }

    page.address = links.pop();
    var path = 'file' + count + '.har';
    page.resources = [];
    console.log("page resources:", page.resources)
    count = count + 1;

    page.onLoadStarted = function () {
        page.startTime = new Date();
    };

    page.onResourceRequested = function (req) {
        page.resources[req.id] = {
            request: req,
            startReply: null,
            endReply: null
        };
    };

    page.onResourceReceived = function (res) {
        var resource = page.resources[res.id];
        // A response can arrive for an id that has no record in the current
        // list (presumably one left over from the previously loaded page,
        // since the list is reset on every call). Ignore it instead of
        // throwing on an undefined record.
        if (!resource) {
            return;
        }
        if (res.stage === 'start') {
            resource.startReply = res;
        }
        if (res.stage === 'end') {
            resource.endReply = res;
        }
    };

    page.open(page.address, function (status) {
        // Report a failed load right away; there is nothing to wait for.
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }

        setTimeout(function () {
            var har;
            page.endTime = new Date();
            page.title = page.evaluate(function () {
                return document.title;
            });
            // Drop entries collected for earlier pages so this file only
            // describes the current page.
            entries = [];
            har = createHAR(page.address, page.title, page.startTime, page.resources);
            fs.write(path, JSON.stringify(har), 'w');
            if (links.length > 0) {
                processSites(links);
            } else {
                phantom.exit();
            }
        }, settleDelayMs);
    });
}
// Pages to capture. processSites pops from the end of this array, so the
// last link is loaded first and the array is emptied as the run proceeds.
var links = ["http://stackoverflow.com", "http://marvel.com"];
processSites(links);
Update:
The above code generates two HAR files, file1.har and file2.har, but the second HAR file also contains the HAR
output generated from both links, when it should only have the HAR
output for the first link...
Fixed this by setting var har = " "
You can't open pages in PhantomJS one after another in a simple loop because the page.open
method is asynchronous. It doesn't wait for the first site to be processed; it opens the second right away.
I've rewritten your script to use recursion: the next site is opened only after the current one has been processed. (Note: if any of the sites in the queue fails to load, the whole process will halt, but you can easily rewrite the script to avoid that.)
// Fallback for engines without a native Date.prototype.toISOString.
if (!Date.prototype.toISOString) {
Date.prototype.toISOString = function () {
// ... (body elided in this snippet; presumably the same as in the full script above)
}
}
// Script-level list of HAR entries.
var entries = [];
// Builds the HAR object for one page from its address, title, start time and
// recorded resources.
function createHAR(address, title, startTime, resources)
{
// ... (body elided in this snippet; presumably the same as in the full script above)
}
var page = require('webpage').create()
/**
 * Loads the given links one at a time and prints the HAR of each page to
 * the console as indented JSON.
 *
 * Links are taken from the END of the array with pop(), so the array is
 * consumed (mutated) and the last link is processed first. The next page is
 * opened only after the current one has been printed. When the list is
 * exhausted the script exits; if a page fails to load the script exits with
 * code 1.
 *
 * Uses the script-level `page`, `phantom`, `entries` and `createHAR`.
 *
 * @param {Array} links - URLs still to be processed.
 */
function processSites(links)
{
    // How long to wait after the load callback before building the HAR.
    var settleDelayMs = 10000;

    // Nothing (left) to do: finish instead of trying to open `undefined`.
    if (!links || links.length === 0) {
        phantom.exit();
        return;
    }

    page.address = links.pop();
    console.log("PAGE ADDRESS: ", page.address);
    page.resources = [];

    page.onLoadStarted = function () {
        page.startTime = new Date();
    };

    page.onResourceRequested = function (req) {
        page.resources[req.id] = {
            request: req,
            startReply: null,
            endReply: null
        };
    };

    page.onResourceReceived = function (res) {
        var resource = page.resources[res.id];
        // A response can arrive for an id that has no record in the current
        // list (presumably one left over from the previously loaded page,
        // since the list is reset on every call). Ignore it instead of
        // throwing on an undefined record.
        if (!resource) {
            return;
        }
        if (res.stage === 'start') {
            resource.startReply = res;
        }
        if (res.stage === 'end') {
            resource.endReply = res;
        }
    };

    page.open(page.address, function (status) {
        // Report a failed load right away; there is nothing to wait for.
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }

        setTimeout(function () {
            var har;
            page.endTime = new Date();
            page.title = page.evaluate(function () {
                return document.title;
            });
            // Drop entries collected for earlier pages so each printed HAR
            // only describes the current page.
            entries = [];
            har = createHAR(page.address, page.title, page.startTime, page.resources);
            console.log(JSON.stringify(har, undefined, 4));
            if (links.length > 0) {
                processSites(links);
            } else {
                phantom.exit();
            }
        }, settleDelayMs);
    });
}
// Pages to capture. processSites pops from the end of this array, so the
// last link is loaded first and the array is emptied as the run proceeds.
var links = ["http://edition.cnn.com", "http://stackoverflow.com"];
processSites(links);