Nog een voorbeeld van ‘webscraping’ om de online voorpagina van The Guardian van het web te halen. Het resultaat wordt geschreven naar het bestand:
C:\temp\textfile.html
Zorg dat je de map C:\temp maakt. Het bestand textfile.html wordt automatisch gemaakt als het nog niet aanwezig/gemaakt is.
Webscraping is het geautomatiseerd gegevens verzamelen van webpagina’s
Option Explicit
'Tools->References Microsoft HTML Object Library
'MSDN - URLDownloadToFile function - https://msdn.microsoft.com/en-us/library/ms775123(v=vs.85).aspx
Private Declare PtrSafe Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" _
(ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, _
ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
Sub scrape_complete_webpage()
Dim fso As Object
Set fso = CreateObject("Scripting.FileSystemObject")
Dim sLocalFilename As String
sLocalFilename = Environ$("TMP") & "\urlmon.html"
Dim sURL As String
sURL = "https://www.theguardian.com/europe"
'sURL = "https://www.theguardian.com/artanddesign/gallery/2015/sep/25/inside-un-headquarters-in-pictures"
Dim bOk As Boolean
bOk = (URLDownloadToFile(0, sURL, sLocalFilename, 0, 0) = 0)
If bOk Then
If fso.FileExists(sLocalFilename) Then
'Tools->References Microsoft HTML Object Library
Dim oHtml4 As MSHTML.IHTMLDocument4
Set oHtml4 = New MSHTML.HTMLDocument
Dim oHtml As MSHTML.HTMLDocument
Set oHtml = Nothing
'IHTMLDocument4.createDocumentFromUrl
'MSDN - IHTMLDocument4 createDocumentFromUrl method - https://msdn.microsoft.com/en-us/library/aa752523(v=vs.85).aspx
Set oHtml = oHtml4.createDocumentFromUrl(sLocalFilename, "")
'need to wait a little whilst the document parses
'because it is multithreaded
While oHtml.readyState <> "complete"
DoEvents 'do not comment this out it is required to break into the code if in infinite loop
Wend
Debug.Assert oHtml.readyState = "complete"
Dim sTest As String
sTest = Left$(oHtml.body.outerHTML, 100)
Debug.Assert Len(Trim(sTest)) > 50 'just testing we got a substantial block of text, feel free to delete
' Get elements by class name
Dim elems, i
Set elems = oHtml.getElementsByClassName("top-fronts-banner-ad-container dcr-12mgsnl")
' Loop through elements and remove them
For i = elems.Length - 1 To 0 Step -1
elems.Item(i).ParentNode.RemoveChild elems.Item(i)
Next i
LogInformation (oHtml.body.outerHTML)
End If
End If
End Sub
Sub LogInformation(LogMessage As String)
Dim fileNum As Integer, x1 As String, x2 As String
Const LogFileName As String = "C:\temp\textfile.html"
Open "C:\temp\textfile.html" For Output As #1: Close #1
'MsgBox "Clear complete"
fileNum = FreeFile ' next file number
Open LogFileName For Append As #fileNum ' creates the file if it doesn't exist
'remove everything between <header data-component="header"> and </header> including these two tags
x1 = Split(LogMessage, "<div id=""bannerandheader"" data-print-layout=""hide"">")(0) ' the text before <div id="bannerandheader" data-print-layout="hide">
x2 = Split(LogMessage, "</header>")(1) ' the text after </header>
LogMessage = x1 & x2
LogMessage = Replace(LogMessage, Range("A5"), Range("A6"))
Print #fileNum, LogMessage ' write information at the end of the text file
Close #fileNum ' close the file
End Sub