htmlexcelvbamsxmlqueryselector

How to parse content from HTML-code of a public website using VBA & MSXML2.XMLHTTP?


On this public website, depending on the selected federal state, 30 properties are listed on the right-hand side. If this federal state offers more than 30 properties, then another 30 are listed on the next page.

But when I try to parse the links to the properties with this code, I only get the 30 Objects that are listed on page 1, even if the URL is specified as Page=3:

Sub GetDataFromPublicSite()

Dim html As New HTMLDocument
Dim GetHref As String
Dim CountItems As Object
Dim x As Long, y As Long
            
With CreateObject("MSXML2.XMLHTTP.6.0")
    .Open "GET", "https://zvg.com/brandenburg?page=3", False
    .send
     html.body.innerHTML = .responseText
End With
        
Set CountItems = html.querySelectorAll("a")
    
For y = 0 To CountItems.length - 1

    If InStr(html.querySelectorAll("a").Item(y), "brandenburg") Then
    
       GetHref = html.querySelectorAll("a").Item(y).href

    End If

Next

End Sub

Any ideas? I would like to know how I can access the real estate links that are on pages 2,3,4...etc. Thanks!


Solution

  • Power Query is a great tool for scraping any kind of data. This code will get you started.

    Sub ZVGQueryNavigate(Url As String)
        Const QueryName As String = "ZVGQuery"
        Dim Query As WorkbookQuery
    
        ' Check if the query exists
        On Error Resume Next
        Set Query = ThisWorkbook.Queries(QueryName)
        On Error GoTo 0
    
        If Query Is Nothing Then
            ' If query doesn't exist, add it
            AddZVGQuery Url
        Else
            ' Update the query formula
            Query.Formula = GetFVGQueryFormula(Url)
            Query.Refresh
            ThisWorkbook.RefreshAll
        End If
    End Sub
    
    Sub AddZVGQuery(Optional Url As String = "https://zvgscout.com/brandenburg?page=3")
        Const QueryName As String = "ZVGQuery"
        Const TableName As String = "ZVGTable"
        
        Dim Formula As String
        Formula = GetFVGQueryFormula(Url)
        
        Dim Destination As Range
    
        ' Ensure the query does not already exist
        On Error Resume Next
        ThisWorkbook.Queries(QueryName).Delete
        On Error GoTo 0
    
        ' Set destination to the first cell of the active sheet
        Set Destination = ActiveSheet.Range("A1")
        
        ' Add the query to the workbook
        ActiveWorkbook.Queries.Add Name:=QueryName, Formula:=Formula
    
        ' Add a worksheet to display the query output
        Dim NewSheet As Worksheet
        Set NewSheet = ActiveWorkbook.Worksheets.Add
    
        ' Add the query to the table on the new sheet
        With NewSheet.ListObjects.Add(SourceType:=0, Source:= _
            "OLEDB;Provider=Microsoft.Mashup.OleDb.1;Data Source=$Workbook$;Location=" & Chr(34) & QueryName & Chr(34) & ";Extended Properties=""""" _
            , Destination:=NewSheet.Range("A1")).QueryTable
            .CommandType = xlCmdSql
            .CommandText = Array("SELECT * FROM [" & QueryName & "]")
            .RowNumbers = False
            .FillAdjacentFormulas = False
            .PreserveFormatting = True
            .RefreshOnFileOpen = False
            .BackgroundQuery = True
            .RefreshStyle = xlInsertDeleteCells
            .SavePassword = False
            .SaveData = True
            .AdjustColumnWidth = True
            .RefreshPeriod = 0
            .PreserveColumnInfo = True
            .ListObject.DisplayName = TableName
            .Refresh BackgroundQuery:=False
        End With
    End Sub
    
    
    Function GetFVGQueryFormula(Url As String) As String
        ' Define the query formula as lines
        Dim Lines(0 To 27) As String
        Lines(0) = "let"
        Lines(1) = "    Source = Web.BrowserContents(" & Chr(34) & Url & Chr(34) & "),"
        Lines(2) = "    #""Extracted Table From Html"" = Html.Table(Source, "
        Lines(3) = "        {"
        Lines(4) = "            {""Case Number"", "".lg\:text-2xl""}, "
        Lines(5) = "            {""Price"", "".text-blue-600""}, "
        Lines(6) = "            {""Property Type"", "".leading-6""}, "
        Lines(7) = "            {""Address"", "".leading-6 + *""}, "
        Lines(8) = "            {""Date & Time"", "".text-1xl:nth-child(5)""}, "
        Lines(9) = "            {""Court"", "".text-1xl *""}, "
        Lines(10) = "            {""Status"", "".top-3""}"
        Lines(11) = "        }, "
        Lines(12) = "        [RowSelector="".group""]"
        Lines(13) = "    ),"
        Lines(14) = "    #""Changed Type"" = Table.TransformColumnTypes("
        Lines(15) = "        #""Extracted Table From Html"","
        Lines(16) = "        {"
        Lines(17) = "            {""Case Number"", type text}, "
        Lines(18) = "            {""Price"", type text}, "
        Lines(19) = "            {""Property Type"", type text}, "
        Lines(20) = "            {""Address"", type text}, "
        Lines(21) = "            {""Date & Time"", type text}, "
        Lines(22) = "            {""Court"", type text}, "
        Lines(23) = "            {""Status"", type text}"
        Lines(24) = "        }"
        Lines(25) = "    )"
        Lines(26) = "in"
        Lines(27) = "    #""Changed Type"""
        GetFVGQueryFormula = Join(Lines, vbNewLine)
    End Function
    

    Result Query

    Update Using RegEx

    Sub Test_GetZVGScoutLinks()
        Dim Data As Variant
        Data = GetZVGScoutLinks("https://zvgscout.com/brandenburg?page=3")
        
        With Worksheets.Add
            .Range("A1").Resize(UBound(Data)).Value = Data
        End With
        
    End Sub
    
    Function GetZVGScoutLinks(Optional URL As String = "https://zvgscout.com/brandenburg?page=3", Optional Prefix As String = "https://zvgscout.com/")
        Dim Text As String
        Dim Matches As Object
        Dim RegEx As Object
        Dim Match As Object
        Dim Href As String
        
        With CreateObject("MSXML2.XMLHTTP.6.0")
            .Open "GET", URL, False
            .send
            Text = .responseText
        End With
    
        Set RegEx = CreateObject("VBScript.RegExp")
        With RegEx
            .Pattern = "<a[^>]*\bclass\s*=\s*""[^""]*(group|hover)[^""]*""[^>]*href\s*=\s*""([^""]+)"""
            .Global = True
            .IgnoreCase = True
        End With
        
        Dim Map As New Collection
        ' Find matches
        If RegEx.Test(Text) Then
            Set Matches = RegEx.Execute(Text)
            For Each Match In Matches
                Href = Match.SubMatches(1)
                Map.Add Prefix & Href
            Next
        Else
            MsgBox "No matching links found on the page!", vbExclamation
        End If
        Set RegEx = Nothing
        Set Matches = Nothing
        
        Dim Result As Variant
        ReDim Result(1 To Map.Count, 1 To 1)
        Dim n As Long
        For n = 1 To Map.Count
            Result(n, 1) = Map(n)
        Next
        
        GetZVGScoutLinks = Result
    End Function
    

    Result

    Extract Links using Power Query

    Sub ZVGQueryNavigate(URL As String)
        Const QueryName As String = "ZVGQuery"
        
        Dim Query As WorkbookQuery
    
        ' Check if the query exists
        On Error Resume Next
        Set Query = ThisWorkbook.Queries(QueryName)
        On Error GoTo 0
    
        If Query Is Nothing Then
            ' If query doesn't exist, add it
            AddZVGQuery URL
        Else
            ' Update the query formula
            Query.Formula = GetFVGQueryFormula(URL)
            Query.Refresh
            ThisWorkbook.RefreshAll
        End If
    End Sub
    
    Sub AddZVGQuery(Optional URL As String = "https://zvgscout.com/brandenburg?page=3")
        Const QueryName As String = "ZVGQuery"
        Const TableName As String = "ZVGTable"
        
        Dim Formula As String
        Formula = GetFVGQueryFormula(URL)
        
        Dim Destination As Range
    
        ' Ensure the query does not already exist
        On Error Resume Next
        ThisWorkbook.Queries(QueryName).Delete
        On Error GoTo 0
    
        ' Set destination to the first cell of the active sheet
        Set Destination = ActiveSheet.Range("A1")
        
        ' Add the query to the workbook
        ActiveWorkbook.Queries.Add Name:=QueryName, Formula:=Formula
    
        ' Add a worksheet to display the query output
        Dim NewSheet As Worksheet
        Set NewSheet = ActiveWorkbook.Worksheets.Add
    
        ' Add the query to the table on the new sheet
        With NewSheet.ListObjects.Add(SourceType:=0, Source:= _
            "OLEDB;Provider=Microsoft.Mashup.OleDb.1;Data Source=$Workbook$;Location=" & Chr(34) & QueryName & Chr(34) & ";Extended Properties=""""" _
            , Destination:=NewSheet.Range("A1")).QueryTable
            .CommandType = xlCmdSql
            .CommandText = Array("SELECT * FROM [" & QueryName & "]")
            .RowNumbers = False
            .FillAdjacentFormulas = False
            .PreserveFormatting = True
            .RefreshOnFileOpen = False
            .BackgroundQuery = True
            .RefreshStyle = xlInsertDeleteCells
            .SavePassword = False
            .SaveData = True
            .AdjustColumnWidth = True
            .RefreshPeriod = 0
            .PreserveColumnInfo = True
            .ListObject.DisplayName = TableName
            .Refresh BackgroundQuery:=False
        End With
    End Sub
    
    
    Function GetFVGQueryFormula(URL As String) As String
        ' Define the query formula as lines
        Dim Lines(0 To 9) As String
        Lines(0) = "let"
        Lines(1) = "    Source = Web.BrowserContents(" & Chr(34) & URL & Chr(34) & "),"
        Lines(2) = "    #""Extracted Links From Html"" = Html.Table("
        Lines(3) = "        Source, "
        Lines(4) = "        {{""Links"", ""a.group"", each [Attributes][href]}}, "
        Lines(5) = "        [RowSelector="".group""]"
        Lines(6) = "    ),"
        Lines(7) = "    #""Removed Nulls"" = Table.SelectRows(#""Extracted Links From Html"", each [Links] <> null)"
        Lines(8) = "in"
        Lines(9) = "    #""Removed Nulls"""
        GetFVGQueryFormula = Join(Lines, vbNewLine)
    End Function