Tags: javascript, phantomjs, har

Using phantom.js to generate multiple HAR files


I'm using the code from netsniff.js to generate a HAR file, and I want to extend it to generate one HAR file for each link in an array (named links in my code below).

There is another question here, "Using Multiple page.open in Single Script", that might help me, but I have no idea how to apply the given solution to my code.

Below is my code (it logs "FAIL to load the address" in the output file if the links array contains more than one item):

"use strict";
/**
 * Fallback implementation of Date.prototype.toISOString.
 *
 * Must be invoked with `this` bound to a Date. Produces a string of the form
 * "YYYY-MM-DDTHH:mm:ss.sssZ". The trailing "Z" denotes UTC, so every field is
 * read with the getUTC* accessors; reading local-time fields and still
 * appending "Z" would yield a wrong instant in any non-UTC time zone.
 *
 * @returns {string} ISO 8601 timestamp in UTC.
 */
function dateToISOStringUTC() {
    // Left-pad a value to two digits.
    function pad(n) { return n < 10 ? '0' + n : String(n); }
    // Left-pad the millisecond value to three digits.
    function ms(n) { return n < 10 ? '00' + n : n < 100 ? '0' + n : String(n); }
    return this.getUTCFullYear() + '-' +
        pad(this.getUTCMonth() + 1) + '-' +
        pad(this.getUTCDate()) + 'T' +
        pad(this.getUTCHours()) + ':' +
        pad(this.getUTCMinutes()) + ':' +
        pad(this.getUTCSeconds()) + '.' +
        ms(this.getUTCMilliseconds()) + 'Z';
}

// Install the fallback only when the runtime does not provide the method.
if (!Date.prototype.toISOString) {
    Date.prototype.toISOString = dateToISOStringUTC;
}
// Legacy module-level list. It is kept only because other code in this script
// still assigns to it; createHAR no longer reads or writes it.
var entries = [];

/**
 * Builds a HAR 1.2 log object for a single page load.
 *
 * Each call collects its entries in a fresh local array, so the HAR of one
 * page never contains entries recorded for a previously processed page.
 *
 * @param {string} address   Page URL; used as the page id and as every entry's pageref.
 * @param {string} title     Page title.
 * @param {Date}   startTime Time the page load started.
 * @param {Array}  resources Records of the form {request, startReply, endReply};
 *                           may be sparse. Records missing any of the three
 *                           parts are skipped.
 * @param {Date}   [endTime] Time the page load finished. When omitted, the
 *                           global `page.endTime` is used.
 * @returns {Object} An object with a single `log` property holding the HAR data.
 */
function createHAR(address, title, startTime, resources, endTime)
{
    var harEntries = [];
    var loadEnd = endTime === undefined ? page.endTime : endTime;

    resources.forEach(function (resource) {
        var request = resource.request,
            startReply = resource.startReply,
            endReply = resource.endReply;

        // Skip resources whose request/response cycle was never completed.
        if (!request || !startReply || !endReply) {
            return;
        }

        // Exclude Data URI from HAR file because
        // they aren't included in specification
        if (request.url.match(/(^data:image\/.*)/i)) {
            return;
        }

        harEntries.push({
            startedDateTime: request.time.toISOString(),
            time: endReply.time - request.time,
            request: {
                method: request.method,
                url: request.url,
                httpVersion: "HTTP/1.1",
                cookies: [],
                headers: request.headers,
                queryString: [],
                headersSize: -1,
                bodySize: -1
            },
            response: {
                status: endReply.status,
                statusText: endReply.statusText,
                httpVersion: "HTTP/1.1",
                cookies: [],
                headers: endReply.headers,
                redirectURL: "",
                headersSize: -1,
                bodySize: startReply.bodySize,
                content: {
                    size: startReply.bodySize,
                    mimeType: endReply.contentType
                }
            },
            cache: {},
            timings: {
                blocked: 0,
                dns: -1,
                connect: -1,
                send: 0,
                wait: startReply.time - request.time,
                receive: endReply.time - startReply.time,
                ssl: -1
            },
            pageref: address
        });
    });

    return {
        log: {
            version: '1.2',
            creator: {
                name: "PhantomJS",
                version: phantom.version.major + '.' + phantom.version.minor +
                    '.' + phantom.version.patch
            },
            pages: [{
                startedDateTime: startTime.toISOString(),
                id: address,
                title: title,
                pageTimings: {
                    // Measured from the same startTime that is reported above.
                    onLoad: loadEnd - startTime
                }
            }],
            entries: harEntries
        }
    };
}
// Single PhantomJS page object; processSites reuses it for every URL.
var page = require('webpage').create();
// File-system module, used to write each generated HAR to disk.
var fs = require('fs');
// Number of sites started so far; processSites uses it to number output files.
var count = 0;
/**
 * Loads the queued URLs one at a time and writes one HAR file per page.
 *
 * The last element of `links` is removed and opened in the shared `page`.
 * Once the load callback fires and a fixed settle delay has passed, the HAR
 * is written to 'file<count>.har' and the function calls itself with the
 * remaining links. PhantomJS exits when the queue is empty, or with exit
 * code 1 as soon as one page fails to load.
 *
 * Note: `links` is consumed in place, from the end towards the start.
 *
 * @param {string[]} links Queue of URLs still to process (mutated).
 */
function processSites(links)
{
    // Nothing (left) to load: finish instead of trying to open `undefined`.
    if (!links || links.length === 0) {
        phantom.exit();
        return;
    }

    // Time granted to the page after the load callback so that late
    // resources are still recorded.
    var settleDelayMs = 10000;

    page.address = links.pop();
    var path = 'file' + count + '.har';
    page.resources = [];
    console.log("page resources:", page.resources);
    count = count + 1;
    page.onLoadStarted = function () {
        page.startTime = new Date();
    };
    page.onResourceRequested = function (req) {
        page.resources[req.id] = {
            request: req,
            startReply: null,
            endReply: null
        };
    };

    page.onResourceReceived = function (res) {
        var tracked = page.resources[res.id];
        // A response can arrive for a request that was issued before
        // page.resources was reset for the current URL; there is no record
        // for it, so ignore it instead of throwing.
        if (!tracked) {
            return;
        }
        if (res.stage === 'start') {
            tracked.startReply = res;
        }
        if (res.stage === 'end') {
            tracked.endReply = res;
        }
    };

    page.open(page.address, function (status) {
        setTimeout(function () {
            var har;
            if (status !== 'success') {
                console.log('FAIL to load the address');
                phantom.exit(1);
            } else {
                page.endTime = new Date();
                page.title = page.evaluate(function () {
                    return document.title;
                });
                // Start from an empty shared list so this HAR does not
                // inherit entries collected for a previous page.
                entries = [];
                har = createHAR(page.address, page.title, page.startTime, page.resources);
                fs.write(path, JSON.stringify(har), 'w');

                if (links.length > 0) {
                    processSites(links);
                } else {
                    phantom.exit();
                }
            }
        }, settleDelayMs);
    });

}

// URLs to capture. processSites removes items from the end of this array,
// so the last URL listed here is the first one to be loaded.
var links = ["http://stackoverflow.com", "http://marvel.com"];

// Start processing the queue; PhantomJS exits from inside processSites.
processSites(links);

Update:
The above code generates two HAR files, file1.har and file2.har, but the second HAR file contains the HAR data generated from both links, whereas it should contain only the HAR data for the first link...

Fixed this by setting var har = " "


Solution

  • You can't open pages in PhantomJS in a simple loop because the page.open method is asynchronous: it doesn't wait for the first site to be processed and opens the second one right away.

    I've rewritten your script to use recursion: the next site is opened only after the current one has been processed. (Note: if any site in the queue fails to load, the whole process halts, but you can easily rewrite the script to avoid that.)

    // Fallback for runtimes without Date.prototype.toISOString.
    // The function body is elided here; see the question's script.
    if (!Date.prototype.toISOString) {
        Date.prototype.toISOString = function () {
            // ...
        }
    }
    
    // Module-level list of HAR entries. In the question's script createHAR
    // pushes into it, so its contents persist between calls.
    var entries = [];
    
    // Builds the HAR object for one page. The body is elided here; see the
    // question's script.
    function createHAR(address, title, startTime, resources)
    {
        // ...
    }
    
    // Single PhantomJS page object; processSites reuses it for every URL.
    var page = require('webpage').create();
    
    /**
     * Loads the queued URLs one at a time and prints one HAR per page.
     *
     * The last element of `links` is removed and opened in the shared `page`.
     * Once the load callback fires and a fixed settle delay has passed, the
     * HAR is printed as JSON and the function calls itself with the remaining
     * links. PhantomJS exits when the queue is empty, or with exit code 1 as
     * soon as one page fails to load.
     *
     * Note: `links` is consumed in place, from the end towards the start.
     *
     * @param {string[]} links Queue of URLs still to process (mutated).
     */
    function processSites(links)
    {
        // Nothing (left) to load: finish instead of trying to open `undefined`.
        if (!links || links.length === 0) {
            phantom.exit();
            return;
        }

        // Time granted to the page after the load callback so that late
        // resources are still recorded.
        var settleDelayMs = 10000;

        page.address = links.pop();

        console.log("PAGE ADDRESS: ", page.address);
        page.resources = [];

        page.onLoadStarted = function () {
            page.startTime = new Date();
        };
        page.onResourceRequested = function (req) {
            page.resources[req.id] = {
                request: req,
                startReply: null,
                endReply: null
            };
        };

        page.onResourceReceived = function (res) {
            var tracked = page.resources[res.id];
            // A response can arrive for a request that was issued before
            // page.resources was reset for the current URL; there is no
            // record for it, so ignore it instead of throwing.
            if (!tracked) {
                return;
            }
            if (res.stage === 'start') {
                tracked.startReply = res;
            }
            if (res.stage === 'end') {
                tracked.endReply = res;
            }
        };

        page.open(page.address, function (status) {
            setTimeout(function () {
                var har;
                if (status !== 'success') {
                    console.log('FAIL to load the address');
                    phantom.exit(1);
                } else {
                    page.endTime = new Date();
                    page.title = page.evaluate(function () {
                        return document.title;
                    });
                    // Empty the shared list first; otherwise every HAR after
                    // the first one also contains the earlier pages' entries.
                    entries = [];
                    har = createHAR(page.address, page.title, page.startTime, page.resources);
                    console.log(JSON.stringify(har, undefined, 4));

                    if (links.length > 0) {
                        processSites(links);
                    } else {
                        phantom.exit();
                    }
                }
            }, settleDelayMs);
        });

    }
    
    // URLs to capture. processSites removes items from the end of this array,
    // so the last URL listed here is the first one to be loaded.
    var links = ["http://edition.cnn.com", "http://stackoverflow.com"];
    
    // Start processing the queue; PhantomJS exits from inside processSites.
    processSites(links);