I am trying to make this https://ppolv.wordpress.com/2008/05/09/fun-with-mochiwebs-html-parser-and-xpath/ algorithm work. So everything is compiling and working pretty good (I am getting the size of html page) but:
What I expected:
Size of html
Size of images
Size of scripts
What I get:
Size of html
Size of images equal to zero no matter what
Size of scripts equal to zero no matter what
I was trying to find the error or something that I missed for hours but I have no idea what is wrong. Code:
-module(test).
-author("Hubert").
%% API
-export([printing/4]).
-export([page_info/1]).
-export([got_page_info/3]).
-export([content_length/1]).
-export([spawn_workers/3]).
-export([get_info/2]).
-export([get_url_context/1]).
-export([wait_for_responses/2]).
%declaring record that will hold number of images, css and scripts
-record(state, {page,timer,errors,img,css,script}).
page_info(URL) ->
inets:start(),
case httpc:request(URL) of
{ok,{_,Headers,Body}} ->
got_page_info(URL,content_length(Headers),Body);
{error,Reason} ->
{error,Reason}
end.
got_page_info(URLpassed, PageSize,Body) ->
%getting the parsed version of website
Tree = mochiweb_html:parse(Body),
%particular files being listed and removing duplicates
Imgs = rDup(mochiweb_xpath:execute("//img/@src",Tree)),
%css does not work, do not know why
%Css = rDup(mochiweb_xpath:execute("//link[@rel=’stylesheet’]/@href",Tree)),
Scripts = rDup(mochiweb_xpath:execute("//script/@src",Tree)),
%preapring URL
URL = get_url_context(URLpassed),
spawn_workers(URL,img,lists:map(fun binary_to_list/1,Imgs)),
spawn_workers(URL,script,lists:map(fun binary_to_list/1,Scripts)),
%Starts a timer which will send the message Msg to Dest after Time milliseconds.
TRef = erlang:send_after(10000,self(),timeout),
State = #state{page=PageSize,
timer=TRef,
errors=[],
img=0,
css=0,
script=0},
%number of elements -> so number of responses we should wait for
wait_for_responses(State,length(Imgs) + length(Scripts)),
{ok}.
content_length(Headers) ->
%proplists:get_value(Key,List,Default)
%returns the length of the content
list_to_integer(proplists:get_value("content-length",Headers,"0")).
%function that removes dulpicate
rDup(L) ->
sets:to_list(sets:from_list(L)).
%spawn workers for every URl, who send back info about components -> getinfo
spawn_workers(URLctx,Type,URLs) ->
lists:foreach(fun (Url) -> spawn( fun () ->
self() ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).
get_url_context(URL) ->
{ok,{http,_,Root,_Port,Path,_Query}} = http_uri:parse(URL),
Ctx = string:sub_string(Path,1, string:rstr(Path,"/")),
{"http://"++Root,Ctx}. %% gib my url with context
get_info(URlctx,Url) ->
FullURL = full_url(URlctx,Url),
case httpc:request(head,{FullURL,[]},[],[]) of
{ok, {_,Headers,_Body}} ->
{ok,content_length(Headers)};
{error,Reason} ->
{error,Reason}
end.
%FULL URL FUNCTIONS
%% abs url inside the same server ej: /img/image.png
full_url({Root,_Context},ComponentUrl=[$/|_]) ->
Root ++ ComponentUrl;
%% full url ej: http://other.com/img.png
full_url({_Root,_Context},ComponentUrl="http://"++_) ->
ComponentUrl;
% everything else is considerer a relative path.. obviously its wrong (../img)
full_url({Root,Context},ComponentUrl) ->
Root ++ Context ++ "/" ++ ComponentUrl.
%collect infos recieved from wait_for_resposnses and add them to proper field of State
collect_info(State = #state{css=Css},css,_URL,{ok,Info}) ->
State#state{css = Css + Info};
collect_info(State = #state{img=Img},img,_URL,{ok,Info}) ->
State#state{img = Img + Info};
collect_info(State = #state{script=Script},script,_URL,{ok,Info}) ->
State#state{script = Script + Info};
collect_info(State = #state{errors=Errors},_Type,URL,{error,Reason}) ->
State#state{errors=[{URL,Reason}|Errors]}.
%messages from workers
wait_for_responses(State,0) ->
finalize(State,0);
wait_for_responses(State,Counter) ->
receive
{component,Type,URL,Info} ->
wait_for_responses(collect_info(State,Type,URL,Info),Counter - 1);
timeout -> finalize(State,Counter)
end.
%prepares variables for printing
finalize(State,Left) ->
PageSize = State#state.page,
ImgSize = State#state.img,
CssSize = State#state.css, %maybe one day will work
ScriptSize = State#state.script,
Errors = State#state.errors,
TRef = State#state.timer,
erlang:cancel_timer(TRef),
printing(PageSize,ImgSize,CssSize,ScriptSize).
printing(PageSize,ImgSize,CssSize,ScriptSize)->
io:format("html size: ~.2fkb~n",[PageSize/1024]),
io:format("images size: ~.2fkb~n",[ImgSize/1024]),
io:format("script size: ~.2fkb~n",[ScriptSize/1024]),
% io:format("stylesheet size: ~.2fkb~n",[CssSize/1024]),
{ok}.
The problem is in the function:
spawn_workers(URLctx,Type,URLs) ->
lists:foreach(fun (Url) -> spawn( fun () ->
self() ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).
self() is evaluated in the spawned process so it sends the response to itself. Assign self to a variable prior to spawn the process:
spawn_workers(URLctx,Type,URLs) ->
Pid = self(),
lists:foreach(fun (Url) -> spawn( fun () ->
Pid ! {component, Type,Url,get_info(URLctx,Url)}
end)
end, URLs).