I need to do a web data scraper.
I make it work by InternetExplorer.Application code but I need to switch to MSXML2.XMLHTTP code because it is very slow.
Working InternetExplorer.Application Version:
Sub extractTablesData()
'we define the essential variables
Dim IE As Object, obj As Object
Dim r As Integer, c As Integer, t As Integer
Dim elemCollection As Object
'add the "Microsoft Internet Controls" reference in your VBA Project indirectly
Set IE = CreateObject("InternetExplorer.Application")
With IE
.Silent = True
.Visible = True
.navigate ("https://www.clickphone.ro")
' we ensure that the web page downloads completely before we fill the form automatically
While IE.readyState <> 4
DoEvents
Wend
Application.Wait Now + TimeValue("00:00:03")
Set HTMLDoc = IE.document
HTMLDoc.all.user.Value = "user or email" 'Enter your email/user id here
HTMLDoc.all.pass.Value = "xXXxXXXxxXXXxx" 'Enter your password here
'Login Button Click
With IE.document
Set elems = .getElementsByTagName("a")
For Each e In elems
If (e.getAttribute("class") = "orange_button") Then
e.Click
Exit For
End If
Next e
End With
'Needed Table page Button Click https://www.clickphone.ro/account/istoric_apel_in.html
While IE.readyState <> 4
DoEvents
Wend
Set iedoc = IE.document
Set elems = iedoc.getElementsByClassName("black")(12)
elems.Click
' again ensuring that the web page loads completely before we start scraping data
While IE.readyState <> 4
DoEvents
Wend
Application.Wait Now + TimeValue("00:00:05")
Set iedoc = IE.document
'Clearing any unnecessary or old data in Sheet1
ThisWorkbook.Sheets("Sheet1").Range("A1:K1000").ClearContents
'Scrapping Data and past to Sheet1
Set elemCollection = IE.document.getElementsByTagName("table")
For t = 0 To (elemCollection.Length - 1)
For r = 0 To (elemCollection(t).Rows.Length - 1)
For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
ThisWorkbook.Worksheets(1).Cells(r + 1, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
Next c
Next r
Next t
End With
' cleaning up memory
Set IE = Nothing
End Sub
This is my attempt of MSXMLHTTP:
Option Explicit
'reference to Microsoft Internet Controls
'reference to Microsoft HTML Object Library
Sub Web_Table_Option_One()
Dim xml As Object
Dim html As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long
Set xml = CreateObject("MSXML2.XMLHTTP.6.0")
Set html = CreateObject("htmlFile")
With xml
.Open "POST", "https://www.clickphone.ro/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send "userName=USER&password=XXXXxxxxXxxxxXXX"
.Open "GET", "https://www.clickphone.ro/account/istoric_apel_in.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With
html.body.innerHTML = xml.responseText
Set objTable = html.getElementsByTagName("table")
For lngTable = 0 To objTable.Length - 1
For lngRow = 0 To objTable(lngTable).Rows.Length - 1
For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
Next lngCol
Next lngRow
ActRw = ActRw + objTable(lngTable).Rows.Length + 1
Next lngTable
End Sub
HTML source code:
For user,pass,login button:
<form action="/login.html" id="toploginform" name="toploginform" method="post">
<script>
function processLoginForm(){
with (document.toploginform) {
if (user.value=="Email"){alert('Email/Parola incorecte!'); return false}
document.getElementById('toploginform').submit();
}
}
</script>
<fieldset>
<input name="userlogin" type="hidden" id="userlogin" value="true" />
<span class="text">
<input name="user" type="text" onFocus="if(this.value=='Email'){this.value=''}" onBlur="if(this.value==''){this.value='Email'}" value="Email">
</span> <span class="text">
<input name="pass" type="password" onFocus="if(this.value=='Password'){this.value=''}" onBlur="if(this.value==''){this.value='Password'}" value="Password">
</span>
<input name="authcode" type="hidden" id="authcode" value="false" />
<span><a href="#" class="orange_button" onClick="return processLoginForm()">Login</a></span>
<span class="links"><a href="/login~parola.html">Am uitat parola</a><br/>
<input class="css-checkbox" id="checkbox2" type="checkbox" name="rememberpass" value="da" />
<label for="checkbox2" name="checkbox2_lbl" class="css-label lite-orange-check">Retin datele?</label>
</span>
</fieldset>
</form>
Table page button:
<br /> <img src="/images/sageata_orange.gif" width="7" height="8" /> <a class="black" href="/account/istoric_apel_in.html">Apeluri primite</a>
Table source code:
<table class="TabelDate" cellspacing="0">
<thead>
<tr>
<th width="130">Data</th>
<th>Sursa</th>
<th>Destinatie</th>
<th>Durata</th>
<th class="ultima">Status</th>
</tr>
</thead>
<tr class="u"> <td class="prima">19-03-2017 17:31:16</td><td><font color="green"><form name="form24-1489937476.41719" method="post" action="">0720145931 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0720145931</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0720145931.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a> <input name="numartel" type="hidden" id="numartel" value="0720145931" /></form></font></td><td align="center"><font color="green">0371780444</font></td><td align="center"><font color="green">00:00:07</font></td>
<td class="ultima" align="center"><font color="green">Apel preluat</font></td></tr> <tr class="gri"> <td class="prima">19-03-2017 17:30:48</td><td><font color="green"><form name="form24-1489937448.41715" method="post" action="">0728409617 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0728409617</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0728409617.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a> <input name="numartel" type="hidden" id="numartel" value="0728409617" /></form></font></td><td align="center"><font color="green">0371780655</font></td><td align="center"><font color="green">00:00:07</font></td>
I manage to partially resolve my problem. Now i can login and retrieve the table i need with XmlHttp. I'l post the working code here so every one can use it (i don't take any credits for this code, i did it with help from different forums)
Option Explicit
'reference to Microsoft Internet Controls
'reference to Microsoft HTML Object Library
Sub CallLog()
Dim xml As Object
Dim html As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long
Set xml = CreateObject("MSXML2.XMLHTTP.6.0")
Set html = CreateObject("htmlFile")
With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With
html.body.innerHTML = xml.responseText
Set objTable = html.getElementsByTagName("table")
For lngTable = 0 To objTable.Length - 1
For lngRow = 0 To objTable(lngTable).Rows.Length - 1
For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
Next lngCol
Next lngRow
ActRw = ActRw + objTable(lngTable).Rows.Length + 1
Next lngTable
End Sub
Now i'm left with the two problems... How can i get the children "table" from the parent "table" (the table i'm after is in a bigger table, see below source code) and i want to get only the first row, but excluding a "form" from the Row (it's a href link) Source Code
How i can get this continuously (this table is dynamic, it's updating every time some one call me, this first Row, is updating continuously)