vbaexcelmsxmlmshtmlmsxml6

Setting a web table scraper by Msxml2.ServerXMLHTTP.6.0 on Excel VBA


I need to do a web data scraper.

  1. I need to login to the site: user, password, click login button
  2. Click a second button
  3. Wait for page to load, here is the Table in question. The table is a call log and adds new content dynamically, so it is always refreshing.
  4. I want to exclude a form from the table content and limit the rows pasted to Excel.

I make it work by InternetExplorer.Application code but I need to switch to MSXML2.XMLHTTP code because it is very slow.

Working InternetExplorer.Application Version:

 Sub extractTablesData()
 'we define the essential variables

 Dim IE As Object, obj As Object
 Dim r As Integer, c As Integer, t As Integer
 Dim elemCollection As Object


 'add the "Microsoft Internet Controls" reference in your VBA Project indirectly
 Set IE = CreateObject("InternetExplorer.Application")

 With IE
 .Silent = True
 .Visible = True
 .navigate ("https://www.clickphone.ro")

 ' we ensure that the web page downloads completely before we fill the form automatically
 While IE.readyState <> 4
 DoEvents
 Wend
Application.Wait Now + TimeValue("00:00:03")
Set HTMLDoc = IE.document
 HTMLDoc.all.user.Value = "user or email" 'Enter your email/user id here
 HTMLDoc.all.pass.Value = "xXXxXXXxxXXXxx" 'Enter your password here
 'Login Button Click               
 With IE.document

    Set elems = .getElementsByTagName("a")
    For Each e In elems

        If (e.getAttribute("class") = "orange_button") Then
            e.Click
            Exit For
        End If

    Next e

End With

 'Needed Table page Button Click https://www.clickphone.ro/account/istoric_apel_in.html
 While IE.readyState <> 4
 DoEvents
 Wend
Set iedoc = IE.document

Set elems = iedoc.getElementsByClassName("black")(12)
    elems.Click

 ' again ensuring that the web page loads completely before we start scraping data
 While IE.readyState <> 4
 DoEvents
 Wend
 Application.Wait Now + TimeValue("00:00:05")
 Set iedoc = IE.document

'Clearing any unnecessary or old data in Sheet1

 ThisWorkbook.Sheets("Sheet1").Range("A1:K1000").ClearContents

'Scrapping Data and past to Sheet1
 Set elemCollection = IE.document.getElementsByTagName("table")

    For t = 0 To (elemCollection.Length - 1)
        For r = 0 To (elemCollection(t).Rows.Length - 1)
            For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
                ThisWorkbook.Worksheets(1).Cells(r + 1, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
            Next c
        Next r
    Next t

 End With

 ' cleaning up memory
 Set IE = Nothing

 End Sub

This is my attempt of MSXMLHTTP:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub Web_Table_Option_One()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.clickphone.ro/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send "userName=USER&password=XXXXxxxxXxxxxXXX"
.Open "GET", "https://www.clickphone.ro/account/istoric_apel_in.html", False 
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
 For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

HTML source code:

For user,pass,login button:

<form action="/login.html" id="toploginform" name="toploginform" method="post">
                                                                      <script>
                                            function processLoginForm(){
                                                with (document.toploginform) {
                                                    if (user.value=="Email"){alert('Email/Parola incorecte!'); return false}
                                                    document.getElementById('toploginform').submit();
                                                }
                                            }
                                        </script>

                                                                            <fieldset>
                                            <input name="userlogin" type="hidden" id="userlogin" value="true" />
                                            <span class="text">
                                            <input name="user" type="text" onFocus="if(this.value=='Email'){this.value=''}" onBlur="if(this.value==''){this.value='Email'}" value="Email">
                                            </span> <span class="text">
                                            <input name="pass" type="password" onFocus="if(this.value=='Password'){this.value=''}" onBlur="if(this.value==''){this.value='Password'}" value="Password">
                                            </span> 
                                            <input name="authcode" type="hidden" id="authcode" value="false" />
                                            <span><a href="#" class="orange_button" onClick="return processLoginForm()">Login</a></span>
                                             <span class="links"><a href="/login~parola.html">Am uitat parola</a><br/>
                                                <input class="css-checkbox" id="checkbox2" type="checkbox" name="rememberpass" value="da" />
                                                <label for="checkbox2" name="checkbox2_lbl" class="css-label lite-orange-check">Retin datele?</label>
                                        </span>
                                        </fieldset>                         
                                                                </form>

Table page button:

<br />&nbsp;<img src="/images/sageata_orange.gif" width="7" height="8" />&nbsp;<a class="black" href="/account/istoric_apel_in.html">Apeluri primite</a>

Table source code:

<table class="TabelDate" cellspacing="0">
  <thead>
    <tr>
      <th width="130">Data</th>
      <th>Sursa</th>
      <th>Destinatie</th>
      <th>Durata</th>
      <th class="ultima">Status</th>
    </tr>
  </thead>
  <tr class="u">    <td class="prima">19-03-2017 17:31:16</td><td><font color="green"><form name="form24-1489937476.41719" method="post" action="">0720145931 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0720145931</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0720145931.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0720145931" /></form></font></td><td align="center"><font color="green">0371780444</font></td><td align="center"><font color="green">00:00:07</font></td>
            <td class="ultima" align="center"><font color="green">Apel preluat</font></td></tr>  <tr class="gri">    <td class="prima">19-03-2017 17:30:48</td><td><font color="green"><form name="form24-1489937448.41715" method="post" action="">0728409617 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0728409617</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0728409617.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0728409617" /></form></font></td><td align="center"><font color="green">0371780655</font></td><td align="center"><font color="green">00:00:07</font></td>

Solution

  • I manage to partially resolve my problem. Now i can login and retrieve the table i need with XmlHttp. I'l post the working code here so every one can use it (i don't take any credits for this code, i did it with help from different forums)

    Option Explicit
     'reference to Microsoft Internet Controls
     'reference to Microsoft HTML Object Library
    
    Sub CallLog()
    Dim xml    As Object
    Dim html   As Object
    Dim objTable As Object
    Dim result As String
    Dim lRow As Long
    Dim lngTable As Long
    Dim lngRow As Long
    Dim lngCol As Long
    Dim ActRw As Long
    
    Set xml = CreateObject("MSXML2.XMLHTTP.6.0")
    
    Set html = CreateObject("htmlFile")
    
    With xml
    .Open "POST", "https://www.XXXXXX.xx/login.html", False
    .setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
    .send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
    'MsgBox xml.responseText
    .Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
    .setRequestHeader "Content-type", "text/xml"
    .send
    End With
    
    html.body.innerHTML = xml.responseText
    
    Set objTable = html.getElementsByTagName("table")
        For lngTable = 0 To objTable.Length - 1
            For lngRow = 0 To objTable(lngTable).Rows.Length - 1
                For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                    ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
                Next lngCol
            Next lngRow
            ActRw = ActRw + objTable(lngTable).Rows.Length + 1
        Next lngTable
    End Sub
    

    Now i'm left with the two problems... How can i get the children "table" from the parent "table" (the table i'm after is in a bigger table, see below source code) and i want to get only the first row, but excluding a "form" from the Row (it's a href link) Source Code

    How i can get this continuously (this table is dynamic, it's updating every time some one call me, this first Row, is updating continuously)