Scrape complete webpagina

Nog een voorbeeld van ‘webscraping’ om de online voorpagina van The Guardian van het web te halen. Het resultaat wordt geschreven naar het bestand:
C:\temp\textfile.html
Zorg dat je de map C:\temp maakt. Het bestand textfile.html wordt automatisch gemaakt als het nog niet aanwezig/gemaakt is.
Webscraping is het geautomatiseerd gegevens verzamelen van webpagina’s

Option Explicit

'Tools->References Microsoft HTML Object Library

'MSDN - URLDownloadToFile function - https://msdn.microsoft.com/en-us/library/ms775123(v=vs.85).aspx
Private Declare PtrSafe Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" _
        (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, _
        ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long

Sub scrape_complete_webpage()

    Dim fso As Object
    Set fso = CreateObject("Scripting.FileSystemObject")

    Dim sLocalFilename As String
    sLocalFilename = Environ$("TMP") & "\urlmon.html"
    
    Dim sURL As String
    
    sURL = "https://www.theguardian.com/europe"
    'sURL = "https://www.theguardian.com/artanddesign/gallery/2015/sep/25/inside-un-headquarters-in-pictures"
    Dim bOk As Boolean
    bOk = (URLDownloadToFile(0, sURL, sLocalFilename, 0, 0) = 0)
    If bOk Then
        If fso.FileExists(sLocalFilename) Then
        
            'Tools->References Microsoft HTML Object Library
            Dim oHtml4 As MSHTML.IHTMLDocument4
            Set oHtml4 = New MSHTML.HTMLDocument
            
            Dim oHtml As MSHTML.HTMLDocument
            Set oHtml = Nothing
            
            'IHTMLDocument4.createDocumentFromUrl
            'MSDN - IHTMLDocument4 createDocumentFromUrl method - https://msdn.microsoft.com/en-us/library/aa752523(v=vs.85).aspx
            Set oHtml = oHtml4.createDocumentFromUrl(sLocalFilename, "")
            
            'need to wait a little whilst the document parses
            'because it is multithreaded
            While oHtml.readyState <> "complete"
                DoEvents  'do not comment this out it is required to break into the code if in infinite loop
            Wend
            Debug.Assert oHtml.readyState = "complete"
            

            Dim sTest As String
            sTest = Left$(oHtml.body.outerHTML, 100)
            Debug.Assert Len(Trim(sTest)) > 50  'just testing we got a substantial block of text, feel free to delete
            
            ' Get elements by class name
            Dim elems, i
            Set elems = oHtml.getElementsByClassName("top-fronts-banner-ad-container dcr-12mgsnl")
            
            ' Loop through elements and remove them
            For i = elems.Length - 1 To 0 Step -1
                elems.Item(i).ParentNode.RemoveChild elems.Item(i)
            Next i
            
            LogInformation (oHtml.body.outerHTML)
        End If
    End If
End Sub

Sub LogInformation(LogMessage As String)
Dim fileNum As Integer, x1 As String, x2 As String
Const LogFileName As String = "C:\temp\textfile.html"
    
    Open "C:\temp\textfile.html" For Output As #1: Close #1
    'MsgBox "Clear complete"

    fileNum = FreeFile ' next file number
    Open LogFileName For Append As #fileNum ' creates the file if it doesn't exist
    
    'remove everything between <header data-component="header"> and </header> including these two tags
    x1 = Split(LogMessage, "<div id=""bannerandheader"" data-print-layout=""hide"">")(0) ' the text before <div id="bannerandheader" data-print-layout="hide">
    x2 = Split(LogMessage, "</header>")(1) ' the text after </header>
    LogMessage = x1 & x2

    LogMessage = Replace(LogMessage, Range("A5"), Range("A6"))
    Print #fileNum, LogMessage ' write information at the end of the text file
    Close #fileNum ' close the file
End Sub

Leave a Reply

Your email address will not be published. Required fields are marked *