Web scraping
We have a preferred language and I know, for some, VBA is not one of them. Let's pause for a second and admit that VBA is still potent with web scraping. Although, it still based on a internet explorer library. But, I should say it still perfect for tricking robot.txt to some website in dealing with automated crawling.
The Goal
We need to do a script that will scrape email addresses. Yes, it's a goldmine for the email marketer. I will attempt to scrape publicly shown contact information. For this demo, I will scrape Students or potentially school staffs email address. The only drawback with all web scraping techniques is the maximum number of attempts allowed for some website. Still relate's to the robot.txt setup. But, you can always delay between calls. If you do bulk fetch. Here are the steps and let me know in the comment section below for your thoughts.
- Let's do https://directory.utexas.edu/index.php? It's University of Texas directory
- Let's try to fetch contact details for top used names "James". It will return all names with it's corresponding url to individual info page
- Scrape the email address and other relevant info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sub GETCONTACTINFO(href As String, ws As Worksheet) | |
'This will scrape the individual contact infos from URL | |
'THIS WILL EXTRACT THE ACTUAL CONTACT INFORMAT FROM THE INDIVIDUAL INFO PAGE | |
Dim irow As Integer, colCount As Integer | |
Dim IE As New SHDocVw.InternetExplorer | |
Dim HTMLDoc As MSHTML.HTMLDocument | |
Dim HTMLDocElements As MSHTML.IHTMLElementCollection | |
Dim HTMLDocElement As MSHTML.IHTMLElement | |
'THIS IS THE INITIAL URL CRAWL OF THE MAIN PRODUCT PAGE | |
IE.navigate "https://directory.utexas.edu/index.php?q=James&scope=student&i=46" | |
IE.Visible = True | |
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO | |
Do While IE.readyState <> READYSTATE_COMPLETE | |
Application.StatusBar = "Loading page main project page..." | |
Loop | |
Set HTMLDoc = IE.document | |
'THIS IS THE TARGET HTML TAG | |
Set HTMLDocElements = HTMLDoc.getElementsByTagName("tr") | |
If HTMLDocElements.Length <> 0 Then | |
colCount = 1 | |
irow = Cells(Rows.Count, 1).End(xlUp).Row + 1 | |
'ITERATE EACH HTML ELEMENTS COLLECTED | |
For Each HTMLDocElement In HTMLDocElements | |
Dim innerText As String | |
Dim innerTextSplit As Variant | |
innerText = HTMLDocElement.innerText | |
innerTextSplit = Split(innerText, ":") | |
'WRITE THE URL'S | |
ws.Cells(irow, colCount).Value = innerTextSplit(UBound(innerTextSplit)) | |
colCount = colCount + 1 | |
Next HTMLDocElement | |
End If | |
IE.Quit | |
End Sub |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sub GETALLCONTACTINFOS() | |
'This will scrape in bulk all contact URL's fetched | |
'THIS WILL DO BULK FETCH | |
Dim ws As Worksheet, ws1 As Worksheet | |
Dim urlRng As Range, rng As Range | |
Set ws = Sheets("utexas students dir") | |
Set urlRng = ws.Range("A2:A" & ws.Cells(Rows.Count, 1).End(xlUp).Row + 1) | |
Sheets.Add After:=ActiveSheet | |
Set ws1 = ActiveSheet | |
Application.ScreenUpdating = False | |
For Each rng In urlRng | |
GETCONTACTINFO rng.Value, ws1 | |
Next rng | |
Application.ScreenUpdating = True | |
End Sub |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sub GETINDIVIDUALCONTACTURLS() | |
'This will extract all contact URL results | |
'THIS WILL SCRAPE ALL CONTACT URLS FOR A SPECIFIC NAME | |
'AND LIST ALL THOSE URLS IN A SHEET | |
Dim href As String | |
Dim targetName As String | |
Dim ws As Worksheet | |
Dim irow As Integer | |
Dim IE As New SHDocVw.InternetExplorer | |
Dim HTMLDoc As MSHTML.HTMLDocument | |
Dim HTMLDocElements As MSHTML.IHTMLElementCollection | |
Dim HTMLDocElement As MSHTML.IHTMLElement | |
Set ws = Sheets("utexas students dir") '<--- you can create another tab to where we will write the fetched URLs | |
targetName = "James" '<--- you can change target name here | |
href = "https://directory.utexas.edu/index.php?q=" & targetName & "&scope=student&submit=Search" | |
'THIS IS THE INITIAL URL CRAWL OF THE MAIN PRODUCT PAGE | |
IE.navigate href | |
IE.Visible = False | |
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO | |
Do While IE.readyState <> READYSTATE_COMPLETE | |
Application.StatusBar = "Loading page main project page..." | |
Loop | |
Set HTMLDoc = IE.document | |
'THIS IS THE TARGET HTML TAG | |
Set HTMLDocElements = HTMLDoc.getElementsByTagName("a") | |
If HTMLDocElements.Length <> 0 Then | |
'ITERATE EACH HTML ELEMENTS COLLECTED | |
For Each HTMLDocElement In HTMLDocElements | |
'FILTER OUT THE NON TARGET a TAG | |
If HTMLDocElement.innerText <> "About the Directory " _ | |
And HTMLDocElement.innerText <> "Frequently Asked Questions" _ | |
And HTMLDocElement.innerText <> "University Offices" _ | |
And HTMLDocElement.innerText <> "UT System Administration Directory" _ | |
And HTMLDocElement.innerText <> "Advanced Search" _ | |
And HTMLDocElement.innerText <> "UT Austin Home" _ | |
And HTMLDocElement.innerText <> "Emergency Information" _ | |
And HTMLDocElement.innerText <> "Site Policies" _ | |
And HTMLDocElement.innerText <> "Web Accessibility Policy" _ | |
And HTMLDocElement.innerText <> "Web Privacy Policy" _ | |
And HTMLDocElement.innerText <> "Adobe Reader" _ | |
And HTMLDocElement.innerText <> "Skip to main content" _ | |
And HTMLDocElement.className <> "logo" Then | |
If HTMLDocElement.innerText <> "" Then | |
'DETERMINE LAST NON-EMPTY ROW IN A COLUMN | |
irow = ws.Cells(Rows.Count, 1).End(xlUp).Row + 1 | |
'WRITE THE URL'S | |
ws.Cells(irow, 1).Value = HTMLDocElement.getAttribute("href") | |
End If | |
End If | |
Next HTMLDocElement | |
End If | |
IE.Quit | |
End Sub |
No comments:
Post a Comment