delphihyperlinkihtmldocument2

I have a problem getting hyperlinks from IHTMLDocument2 in Delphi


I have a problem getting hyperlinks from IHTMLDocument2 in Delphi. For instance, instead of returning the full link "http://ena.ge/explanatory-online", IHTMLDocument2 returns "about:/explanatory-online". The simple substitution of "about" with root URL is not working for all cases.

Here is the code I am using:

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings; var MyInnerText,MyInnerHTML:widestring);
var
  resp: TMemoryStream;
  IdHTTP: TidHTTP;
  v: Variant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref: string;
  i: integer;

begin
  resp := TMemoryStream.Create;
  IdHTTP := TidHTTP.Create(nil);
  iDoc := coHTMLDocument.Create as IHTMLDocument2;

  try
    IdHTTP.Get(MyURL, resp);

    resp.Position := 0;
    MyHTML.LoadFromStream(resp,TEncoding.UTF8);

  finally
    resp.Free;
    IdHTTP.Free;
  end;

  v := VarArrayCreate([0, 0], VarVariant);
  v[0] := MyHTML.text;
  iDoc.write(PSafeArray(System.TVarData(v).VArray));
  iDoc.designMode := 'off';

  while iDoc.readyState <> 'complete' do
    Application.ProcessMessages;

  showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;

      MyHyperlinks.Add(aHref);
    end;
  end;

end;



Solution

  • Look at the source of the page and you will see what the links look like, for example: href="/explanatory-online" If you download the IdHttp page, IHTMLDocument2 does not have the original page address. You can use TWebBrowser or manually replace string or use IHTMLDocument4.

    Example 1 (TWebBrowser):

    procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
        var MyInnerText,MyInnerHTML:widestring);
    var
      Flags: System.OleVariant;
      iDoc: IHTMLDocument2;
      links: OleVariant;
      MyHyperlink, aHref: string;
      i: integer;
    begin
      Flags := Flags or navNoReadFromCache or navNoWriteToCache;
      Form1.WebBrowser1.Silent := True;
      Form1.WebBrowser1.Navigate(MyURL, Flags);
      while Form1.WebBrowser1.ReadyState <> READYSTATE_COMPLETE do
        Application.ProcessMessages;
      iDoc := Form1.WebBrowser1.Document as IHTMLDocument2;
      //showmessage(idoc.url);
      MyInnerText:=idoc.body.innerText;
      MyInnerHTML:=idoc.body.innerHTML;
      links := iDoc.all.tags('A');
      if links.Length > 0 then
      begin
        for i := 0 to -1 + links.Length do
        begin
          aHref := links.Item(i).href;
          MyHyperlinks.Add(aHref);
        end;
      end;
    end;
    

    Example 2 (replace string):

    function GetDomain(URL: String): String;
    var
      Pos1, Pos2: Integer;
    begin
      Result := '';
      URL := Trim(URL);
      Pos1 := LastDelimiter('/', URL);
      Pos2 := Pos('/', URL, Pos1 + 1);
      if (Pos2 = 0) then
        Result := URL + '/'
      else if (Pos1 > 0) then
        Result := Copy(Url, 1, Pos1);
    end;
    
    procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
      var MyInnerText, MyInnerHTML: WideString);
    var
      resp: TMemoryStream;
      IdHTTP: TidHTTP;
      v: Variant;
      iDoc: IHTMLDocument2;
      links: OleVariant;
      MyHyperlink, aHref, Domain: string;
      I, J: Integer;
    begin
      resp := TMemoryStream.Create;
      IdHTTP := TidHTTP.Create(nil);
      iDoc := coHTMLDocument.Create as IHTMLDocument2;
      try
        IdHTTP.Get(MyURL, resp);
        resp.Position := 0;
        MyHTML.LoadFromStream(resp,TEncoding.UTF8);
      finally
        resp.Free;
        IdHTTP.Free;
      end;
      v := VarArrayCreate([0, 0], VarVariant);
      v[0] := MyHTML.text;
      iDoc.write(PSafeArray(System.TVarData(v).VArray));
      iDoc.designMode := 'off';
      while iDoc.readyState <> 'complete' do
        Application.ProcessMessages;
      //showmessage(idoc.url);
      MyInnerText:=idoc.body.innerText;
      MyInnerHTML:=idoc.body.innerHTML;
      links := iDoc.all.tags('A');
      Domain := GetDomain(MyURL);
      if links.Length > 0 then
      begin
        for i := 0 to -1 + links.Length do
        begin
          aHref := links.Item(i).href;
          if (Copy(aHref, 1, 6) = 'about:') and (Length(Domain) > 0) then
          begin
            J := Pos('/', aHref);
            if (J > 0) then
            begin
              Delete(aHref, 1, J);
              aHref := Domain + aHref;
            end;
          end;
          MyHyperlinks.Add(aHref);
        end;
      end;
    end;
    

    Example 3 (IHTMLDocument4):

    function process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
        var MyInnerText,MyInnerHTML:widestring): Integer;
    const
      RS_COMPLETE = 'complete';
      WaitMs1     = 3000;
      WaitMs2     = 8000;
    var
      IDoc : IHTMLDocument2;
      IDoc4: IHTMLDocument4;
      Links: OleVariant;
      AHref: String;
      I    : Integer;
      Ms   : Int64;
    begin
      Result := 1;
      try
        iDoc := coHTMLDocument.Create as IHTMLDocument2;
        if (iDoc = nil) then
          Exit(2);
        Result := 3;
        iDoc.Set_designMode('off');
        Ms := GetTickCount64;
        while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs1) do
        begin
          Sleep(10);
          Application.ProcessMessages;
        end;
        if not (iDoc.ReadyState = RS_COMPLETE) then
          Exit(4);
        Result := 5;
        iDoc4 := iDoc as IHTMLDocument4;
        iDoc := iDoc4.CreateDocumentFromUrl(MyUrl, 'null');
        Ms := GetTickCount64;
        while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs2) do
        begin
          Sleep(20);
          Application.ProcessMessages;
        end;
        if not (iDoc.ReadyState = RS_COMPLETE) then
          Exit(6);
        Result := 7;
        MyInnerText := iDoc.Body.InnerText;
        MyInnerHTML := iDoc.Body.InnerHTML;
        Links := iDoc.All.Tags('A');
        for I := 0 to Links.Length - 1 do
        begin
          aHref := links.Item(i).href;
          MyHyperlinks.Add(aHref);
        end;
        Result := 0;
      except
         on E : Exception do
         begin //ShowMessage('Exception: ' + E.ClassName + ',' + E.Message);
           Result := 8;
         end;
      end;
    end;