I have a problem getting hyperlinks from IHTMLDocument2 in Delphi. For instance, instead of returning the full link "http://ena.ge/explanatory-online", IHTMLDocument2 returns "about:/explanatory-online". The simple substitution of "about" with root URL is not working for all cases.
Here is the code I am using:
procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings; var MyInnerText,MyInnerHTML:widestring);
var
resp: TMemoryStream;
IdHTTP: TidHTTP;
v: Variant;
iDoc: IHTMLDocument2;
links: OleVariant;
MyHyperlink, aHref: string;
i: integer;
begin
resp := TMemoryStream.Create;
IdHTTP := TidHTTP.Create(nil);
iDoc := coHTMLDocument.Create as IHTMLDocument2;
try
IdHTTP.Get(MyURL, resp);
resp.Position := 0;
MyHTML.LoadFromStream(resp,TEncoding.UTF8);
finally
resp.Free;
IdHTTP.Free;
end;
v := VarArrayCreate([0, 0], VarVariant);
v[0] := MyHTML.text;
iDoc.write(PSafeArray(System.TVarData(v).VArray));
iDoc.designMode := 'off';
while iDoc.readyState <> 'complete' do
Application.ProcessMessages;
showmessage(idoc.url);
MyInnerText:=idoc.body.innerText;
MyInnerHTML:=idoc.body.innerHTML;
links := iDoc.all.tags('A');
if links.Length > 0 then
begin
for i := 0 to -1 + links.Length do
begin
aHref := links.Item(i).href;
MyHyperlinks.Add(aHref);
end;
end;
end;
Look at the source of the page and you will see what the links look like, for example: href="/explanatory-online" If you download the IdHttp page, IHTMLDocument2 does not have the original page address. You can use TWebBrowser or manually replace string or use IHTMLDocument4.
Example 1 (TWebBrowser):
procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
var MyInnerText,MyInnerHTML:widestring);
var
Flags: System.OleVariant;
iDoc: IHTMLDocument2;
links: OleVariant;
MyHyperlink, aHref: string;
i: integer;
begin
Flags := Flags or navNoReadFromCache or navNoWriteToCache;
Form1.WebBrowser1.Silent := True;
Form1.WebBrowser1.Navigate(MyURL, Flags);
while Form1.WebBrowser1.ReadyState <> READYSTATE_COMPLETE do
Application.ProcessMessages;
iDoc := Form1.WebBrowser1.Document as IHTMLDocument2;
//showmessage(idoc.url);
MyInnerText:=idoc.body.innerText;
MyInnerHTML:=idoc.body.innerHTML;
links := iDoc.all.tags('A');
if links.Length > 0 then
begin
for i := 0 to -1 + links.Length do
begin
aHref := links.Item(i).href;
MyHyperlinks.Add(aHref);
end;
end;
end;
Example 2 (replace string):
function GetDomain(URL: String): String;
var
Pos1, Pos2: Integer;
begin
Result := '';
URL := Trim(URL);
Pos1 := LastDelimiter('/', URL);
Pos2 := Pos('/', URL, Pos1 + 1);
if (Pos2 = 0) then
Result := URL + '/'
else if (Pos1 > 0) then
Result := Copy(Url, 1, Pos1);
end;
procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
var MyInnerText, MyInnerHTML: WideString);
var
resp: TMemoryStream;
IdHTTP: TidHTTP;
v: Variant;
iDoc: IHTMLDocument2;
links: OleVariant;
MyHyperlink, aHref, Domain: string;
I, J: Integer;
begin
resp := TMemoryStream.Create;
IdHTTP := TidHTTP.Create(nil);
iDoc := coHTMLDocument.Create as IHTMLDocument2;
try
IdHTTP.Get(MyURL, resp);
resp.Position := 0;
MyHTML.LoadFromStream(resp,TEncoding.UTF8);
finally
resp.Free;
IdHTTP.Free;
end;
v := VarArrayCreate([0, 0], VarVariant);
v[0] := MyHTML.text;
iDoc.write(PSafeArray(System.TVarData(v).VArray));
iDoc.designMode := 'off';
while iDoc.readyState <> 'complete' do
Application.ProcessMessages;
//showmessage(idoc.url);
MyInnerText:=idoc.body.innerText;
MyInnerHTML:=idoc.body.innerHTML;
links := iDoc.all.tags('A');
Domain := GetDomain(MyURL);
if links.Length > 0 then
begin
for i := 0 to -1 + links.Length do
begin
aHref := links.Item(i).href;
if (Copy(aHref, 1, 6) = 'about:') and (Length(Domain) > 0) then
begin
J := Pos('/', aHref);
if (J > 0) then
begin
Delete(aHref, 1, J);
aHref := Domain + aHref;
end;
end;
MyHyperlinks.Add(aHref);
end;
end;
end;
Example 3 (IHTMLDocument4):
function process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
var MyInnerText,MyInnerHTML:widestring): Integer;
const
RS_COMPLETE = 'complete';
WaitMs1 = 3000;
WaitMs2 = 8000;
var
IDoc : IHTMLDocument2;
IDoc4: IHTMLDocument4;
Links: OleVariant;
AHref: String;
I : Integer;
Ms : Int64;
begin
Result := 1;
try
iDoc := coHTMLDocument.Create as IHTMLDocument2;
if (iDoc = nil) then
Exit(2);
Result := 3;
iDoc.Set_designMode('off');
Ms := GetTickCount64;
while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs1) do
begin
Sleep(10);
Application.ProcessMessages;
end;
if not (iDoc.ReadyState = RS_COMPLETE) then
Exit(4);
Result := 5;
iDoc4 := iDoc as IHTMLDocument4;
iDoc := iDoc4.CreateDocumentFromUrl(MyUrl, 'null');
Ms := GetTickCount64;
while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs2) do
begin
Sleep(20);
Application.ProcessMessages;
end;
if not (iDoc.ReadyState = RS_COMPLETE) then
Exit(6);
Result := 7;
MyInnerText := iDoc.Body.InnerText;
MyInnerHTML := iDoc.Body.InnerHTML;
Links := iDoc.All.Tags('A');
for I := 0 to Links.Length - 1 do
begin
aHref := links.Item(i).href;
MyHyperlinks.Add(aHref);
end;
Result := 0;
except
on E : Exception do
begin //ShowMessage('Exception: ' + E.ClassName + ',' + E.Message);
Result := 8;
end;
end;
end;