The script that I'm going to share is good only for single product URL scraping. This is to demonstrate how VBA can still be effective for those low tier automation implementation. Basically, applicable to developers still in exploring phase, students and hobbyist just want to have some fun.
I've taken a simple approach and haven't taken time to document. This is a project just came to at instance decided to write this in Excel VBA. Depending on the number reviews in that product, the script will take 2~3 mins to run. Let me know in the comments below your thoughts.
To run the code, you need to select "GETALLREVIEWS" in your macro's list and you will be prompted with an input box where you can paste the URL of the target product listed in Amazon. The script will return to you a new spreadsheet with the list of the reviews successfully scraped.
The programming language used in this application is VBA(Visual Basic for Application). If you want to learn this coding technique, click Buy Now button below for a book to guide you with your learning.
Here's the script. Have fun
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Attribute VB_Name = "Amazon_Reviews_Scrape" | |
Function GETPAGEREVIEWSURL(href As String, pageNum As Variant) As String | |
Dim splitter As Variant | |
splitter = Split(href, "/") | |
splitter(4) = "product-reviews" | |
If pageNum = 1 Then | |
splitter(6) = "ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews" | |
ElseIf pageNum = 2 Then | |
splitter(6) = "ref=cm_cr_arp_d_paging_btm_next_" & pageNum & "?ie=UTF8&reviewerType=all_reviews&pageNumber=" & pageNum | |
Else | |
splitter(6) = "ref=cm_cr_getr_d_paging_btm_next_" & pageNum & "?ie=UTF8&reviewerType=all_reviews&pageNumber=" & pageNum | |
End If | |
GETPAGEREVIEWSURL = Join(splitter, "/") | |
End Function | |
Function GETPROD_AMAZONPARAM(href As String) As Object | |
Dim param As New Collection | |
Dim splitter As Variant | |
splitter = Split(href, "/") | |
param.Add splitter(5) | |
param.Add splitter(3) | |
Set GETPROD_AMAZONPARAM = param | |
End Function | |
Function GETREVIEWURL(ASIN As String, ID As String) As String | |
GETREVIEWURL = "https://www.amazon.com/gp/customer-reviews/" & ID & "/ref=cm_cr_arp_d_rvw_ttl?ie=UTF8&ASIN=" & ASIN | |
End Function | |
Function GETREVIEWSCORE(score_string As String) As Integer | |
Dim splitter As Variant | |
splitter = Split(score_string, " out of ") | |
GETREVIEWSCORE = CInt(splitter(0)) | |
End Function | |
Function GETREVIEWCOUNTRY_DATE(countryDateString As String) As Object | |
Dim revieTiming As New Collection | |
Dim splitter As Variant | |
Dim inTheSplitCheck As Variant | |
splitter = Split(countryDateString, " on ") | |
revieTiming.Add splitter(1) | |
inTheSplitCheck = Split(splitter(0), " in the ") | |
If UBound(inTheSplitCheck) = 0 Then inTheSplitCheck = Split(splitter(0), " in ") | |
revieTiming.Add inTheSplitCheck(1) | |
Set GETREVIEWCOUNTRY_DATE = revieTiming | |
End Function | |
Function GETUPVOTECOUNT(upvoteStr As String) As Integer | |
Dim splitter As Variant | |
splitter = Split(upvoteStr, " ") | |
If splitter(0) = "One" Then splitter(0) = 1 | |
GETUPVOTECOUNT = CInt(splitter(0)) | |
End Function | |
Function GETREVIEWSPAGESCOUNT(el As MSHTML.IHTMLElement) As Integer | |
Dim el_innerText As String | |
Dim splitter As Variant | |
Dim pageCount As Integer | |
el_innerText = el.innerText | |
splitter = Split(el_innerText, " of ") | |
splitter = Split(splitter(1), " ") | |
pageCount = CInt(splitter(0)) | |
If pageCount Mod 10 > 0 Then pageCount = ((pageCount / 10) - (pageCount Mod 10 / 10)) + 1 | |
GETREVIEWSPAGESCOUNT = pageCount | |
End Function | |
Function GETREVIEWIDS(href As String) As Object | |
Dim href1 As String | |
Dim IE As New SHDocVw.InternetExplorer, IE2 As New SHDocVw.InternetExplorer | |
Dim HTMLDoc As MSHTML.HTMLDocument, HTMLDoc2 As MSHTML.HTMLDocument, HTMLDoc3 As MSHTML.HTMLDocument | |
Dim HTMLDocElements As MSHTML.IHTMLElementCollection | |
Dim HTMLDocElement As MSHTML.IHTMLElement | |
Dim REVIEW_IDS As New Collection | |
Dim US_TOTAL_PAGE_REVIEWS As Integer | |
'THIS IS THE INITIAL URL CRAWL OF THE MAIN PRODUCT PAGE | |
IE.navigate href | |
IE.Visible = False | |
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO | |
Do While IE.readyState <> READYSTATE_COMPLETE | |
Application.StatusBar = "Loading page main project page..." | |
Loop | |
Set HTMLDoc = IE.document | |
'GET THE INTERNATIONAL REVIEWS ID | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE INTERNATIONAL REVIEWS ARE CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-section review aok-relative cr-desktop-review-page-0") | |
If HTMLDocElements.Length <> 0 Then | |
Debug.Print vbCr & "International Reviews" & vbCr & "-----------------------------" | |
Application.StatusBar = "Extracting International Review IDs..." | |
'ITERATE EACH HTML ELEMENTS COLLECTED | |
For Each HTMLDocElement In HTMLDocElements | |
'COLLECT THE EXTRACTED REVIEW ID | |
REVIEW_IDS.Add HTMLDocElement.getAttribute("id") | |
Debug.Print HTMLDocElement.getAttribute("id") | |
Next HTMLDocElement | |
End If | |
'CAPTURE THE HIDDEN FRAMES WHERE SOME OF THE INTERNATIONAL REVIEWS ARE CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-section review aok-relative cr-desktop-review-page-1 aok-hidden") | |
If HTMLDocElements.Length <> 0 Then | |
Debug.Print vbCr & "Hidden International Reviews" & vbCr & "-----------------------------" | |
Application.StatusBar = "Extracting Hidden International Review IDs..." | |
'ITERATE EACH HTML ELEMENTS COLLECTED | |
For Each HTMLDocElement In HTMLDocElements | |
'COLLECT THE EXTRACTED REVIEW ID | |
REVIEW_IDS.Add HTMLDocElement.getAttribute("id") | |
Debug.Print HTMLDocElement.getAttribute("id") | |
Next HTMLDocElement | |
End If | |
IE.Quit | |
'START OF US REVIEWS CRAWL | |
'CONVERT THE INITIAL URL ABOVE AND CONVERTS IT TO THE PRODUCT REVIEW PAGE 1 | |
'THIS IS NEEDED TO CAPTURE EXPECTED NUMBER REVIEW PAGES | |
href1 = GETPAGEREVIEWSURL(href, 1) | |
IE2.navigate href1 | |
IE2.Visible = False | |
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO | |
Do While IE2.readyState <> READYSTATE_COMPLETE | |
Application.StatusBar = "Loading page..." | |
Loop | |
Set HTMLDoc2 = IE2.document | |
'GET THE EXPECTED NUMBER OF US REVIEW PAGES | |
Set HTMLDocElements = HTMLDoc2.getElementsByClassName("a-size-base") | |
For Each HTMLDocElement In HTMLDocElements | |
Dim x_test As String | |
x = HTMLDocElement.getAttribute("data-hook") | |
If HTMLDocElement.getAttribute("data-hook") = "cr-filter-info-review-count" Then | |
US_TOTAL_PAGE_REVIEWS = GETREVIEWSPAGESCOUNT(HTMLDocElement) | |
Exit For | |
End If | |
Next HTMLDocElement | |
'CRAWL THE FIRST 10 US REVIEWS | |
'CAPTURE THE ELEMENT WHERE THE FIRST 10 US REVIEW IDS ARE CONTAINED | |
Set HTMLDocElements = HTMLDoc2.getElementsByClassName("a-section review aok-relative") | |
Debug.Print vbCr & "First 10 US Reviews" & vbCr & "-----------------------------" | |
Application.StatusBar = "Extracting First 10 US Review ID's..." | |
'ITERATE EACH HTML ELEMENTS COLLECTED | |
For Each HTMLDocElement In HTMLDocElements | |
'COLLECT THE EXTRACTED REVIEW ID | |
REVIEW_IDS.Add HTMLDocElement.getAttribute("id") | |
Debug.Print HTMLDocElement.getAttribute("id") | |
Next HTMLDocElement | |
'TERMINATES IE | |
IE2.Quit | |
'CRAWL THE NEXT REVIEW PAGES FOR ITS REVIEW IDS | |
'GET THE US REVIEW IDS | |
For i = 2 To US_TOTAL_PAGE_REVIEWS | |
'TRANSFORMS THE INITIAL URL TO MATCH EACH REVIEW PAGE URL | |
href1 = GETPAGEREVIEWSURL(href, i) | |
Debug.Print vbCr & "Crawling Page " & i & "..." & vbCr & "-----------------------------" | |
Dim IE3 As New SHDocVw.InternetExplorer | |
IE3.navigate href1 ' | |
IE3.Visible = False | |
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO | |
Do While IE3.readyState <> READYSTATE_COMPLETE | |
Application.StatusBar = "Loading Review pages..." | |
Loop | |
Set HTMLDoc3 = IE3.document | |
'COLLECTS THE ELEMENT WITH CLASS NAME THAT CONTAINTS THE REVIEW ID | |
Set HTMLDocElements = HTMLDoc3.getElementsByClassName("a-section review aok-relative") | |
Application.StatusBar = "Extracting Rest of the US Review ID's..." | |
'ITERATES EACH ELEMENTS | |
For Each HTMLDocElement In HTMLDocElements | |
REVIEW_IDS.Add HTMLDocElement.getAttribute("id") | |
Debug.Print HTMLDocElement.getAttribute("id") | |
Next HTMLDocElement | |
Next i | |
IE3.Quit | |
Set GETREVIEWIDS = REVIEW_IDS | |
End Function | |
Function GETTHEREVIEWS(href As String) As Object | |
Dim PROD_AMAZONPARAM As New Collection | |
Dim REVIEW_IDS As New Collection | |
Dim REVIEW_ID As Variant | |
Dim ASIN As String | |
Dim PROD_DESC As String | |
Dim DATABASE_PATH As String | |
Dim i As Integer | |
DATABASE_PATH = GET_USERDBPATH | |
Set REVIEW_IDS = GETREVIEWIDS(href) | |
Set PROD_AMAZONPARAM = GETPROD_AMAZONPARAM(href) | |
ASIN = PROD_AMAZONPARAM(1) | |
'Product Description as stated in the href | |
PROD_DESC = PROD_AMAZONPARAM(2) | |
Debug.Print vbCr & "Extracting the Reviews Info..." & vbCr & "-----------------------------" | |
i = 1 | |
For Each REVIEW_ID In REVIEW_IDS | |
Dim href1 As String | |
Dim IE As New SHDocVw.InternetExplorer | |
Dim HTMLDoc As MSHTML.HTMLDocument | |
Dim HTMLDocElements As MSHTML.IHTMLElementCollection | |
Dim HTMLDocElement As MSHTML.IHTMLElement | |
Dim REVIEW_URL As String | |
Dim REVIEWER_PROFILE_NAME As String | |
Dim REVIEW_SCORE As Integer | |
Dim REVIEW_TIMING As New Collection | |
Dim REVIEW_DATE As String | |
Dim REVIEW_COUNTRY As String | |
Dim PRODUCT_MODEL As String | |
Dim BADGE_INFO As String | |
Dim REVIEW_TITLE As String | |
Dim REVIEW_COMMENT As String | |
Dim UPVOTES As Integer | |
Dim REVIEW_NUM_OF_COMMENTS As Integer | |
Dim DID_MANUFACTURER_RESPONDED As String | |
Dim MANUFACTURERS_RESPONSE As String | |
Dim MANUFACTURERS_REPLIES As New Collection | |
UPVOTES = 0 | |
REVIEW_NUM_OF_COMMENTS = 0 | |
DID_MANUFACTURER_RESPONDED = "N" | |
MANUFACTURERS_RESPONSE = vbNullString | |
REVIEW_URL = GETREVIEWURL(ASIN, CStr(REVIEW_ID)) | |
'OPEN THE REVIEW URL | |
IE.navigate REVIEW_URL | |
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO | |
Do While IE.readyState <> READYSTATE_COMPLETE | |
Application.StatusBar = "Loading individual review page..." | |
Loop | |
Set HTMLDoc = IE.document | |
'GET THE REVIEW PAGE ESSENTIAL INFO | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE PROFILE NAME IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-profile-name") | |
REVIEWER_PROFILE_NAME = HTMLDocElements(0).innerText | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE SCORE IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-icon-alt") | |
REVIEW_SCORE = GETREVIEWSCORE(HTMLDocElements(0).innerText) | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE REVIEW TITLE IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold") | |
REVIEW_TITLE = HTMLDocElements(0).innerText | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE REVIEW DATE AND COUNTRY IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base a-color-secondary review-date") | |
Set REVIEW_TIMING = GETREVIEWCOUNTRY_DATE(HTMLDocElements(0).innerText) | |
REVIEW_DATE = REVIEW_TIMING.Item(1) | |
REVIEW_COUNTRY = REVIEW_TIMING.Item(2) | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE PRODUCT MODEL IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-mini a-link-normal a-color-secondary") | |
If HTMLDocElements.Length <> 0 Then PRODUCT_MODEL = HTMLDocElements(0).innerText | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE PROFILE BADGE IS CONTAINED | |
'CHECK FOR AVP-BADGE = Verified Purchase | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-mini a-color-state a-text-bold") | |
'CHECK FOR VINE VOICE INFO = Vine Customer Review of Free Product | |
If HTMLDocElements.Length = 0 Then Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-color-success a-text-bold") | |
'CHEDK IF THERE ARE BADGES PRESENT | |
If HTMLDocElements.Length = 0 Then | |
BADGE_INFO = "" | |
Else | |
BADGE_INFO = HTMLDocElements(0).innerText | |
End If | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE REVIEW TEXT IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base review-text review-text-content") | |
REVIEW_COMMENT = WorksheetFunction.Trim(WorksheetFunction.Clean(HTMLDocElements(0).innerText)) | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE UPVOTE INFO IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base a-color-tertiary cr-vote-text") | |
'CHECK IF THERE ARE REVIEWS UPVOTES | |
If HTMLDocElements.Length <> 0 Then UPVOTES = GETUPVOTECOUNT(HTMLDocElements(0).innerText) | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE NUMBER OF COMMENTS IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("review-comment-total aok-hidden") | |
If HTMLDocElements.Length <> 0 Then REVIEW_NUM_OF_COMMENTS = CInt(HTMLDocElements(0).innerText) | |
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE MANUFACTURER'S COMMENTS/REPLY IS CONTAINED | |
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-box a-spacing-large official-comment-container") | |
If HTMLDocElements.Length <> 0 Then | |
For Each HTMLDocElement In HTMLDocElements | |
MANUFACTURERS_REPLIES.Add HTMLDocElement.innerText | |
Next HTMLDocElement | |
End If | |
If MANUFACTURERS_REPLIES.Count <> 0 Then | |
DID_MANUFACTURER_RESPONDED = "Y" | |
MANUFACTURERS_RESPONSE = MANUFACTURERS_REPLIES(1) | |
End If | |
Application.StatusBar = "Extracting Reviews by ID " & Format(i / REVIEW_IDS.Count, "0%") & " complete" | |
Debug.Print ASIN, PROD_DESC, PRODUCT_MODEL, REVIEW_ID, REVIEW_URL, REVIEW_COUNTRY, REVIEW_DATE, REVIEWER_PROFILE_NAME, BADGE_INFO, REVIEW_SCORE, REVIEW_TITLE, UPVOTES, REVIEW_COMMENT, REVIEW_NUM_OF_COMMENTS, DID_MANUFACTURER_RESPONDED, MANUFACTURERS_RESPONSE | |
Dim REVIEW_DATA As New Collection | |
REVIEW_DATA.Add ASIN | |
REVIEW_DATA.Add PROD_DESC | |
REVIEW_DATA.Add PRODUCT_MODEL | |
REVIEW_DATA.Add REVIEW_ID | |
REVIEW_DATA.Add REVIEW_URL | |
REVIEW_DATA.Add REVIEW_COUNTRY | |
REVIEW_DATA.Add REVIEW_DATE | |
REVIEW_DATA.Add REVIEWER_PROFILE_NAME | |
REVIEW_DATA.Add BADGE_INFO | |
REVIEW_DATA.Add REVIEW_SCORE | |
REVIEW_DATA.Add REVIEW_TITLE | |
REVIEW_DATA.Add UPVOTES | |
REVIEW_DATA.Add REVIEW_COMMENT | |
REVIEW_DATA.Add REVIEW_NUM_OF_COMMENTS | |
REVIEW_DATA.Add DID_MANUFACTURER_RESPONDED | |
REVIEW_DATA.Add MANUFACTURERS_RESPONSE | |
WRITETOSHEET REVIEW_DATA | |
Set REVIEW_DATA = Nothing | |
i = i + 1 | |
Next REVIEW_ID | |
IE.Quit | |
End Function | |
Sub WRITETOSHEET(passedData As Object) | |
For i = 1 To passedData.Count | |
ActiveCell.Value = passedData(i) | |
ActiveCell.Offset(0, 1).Select | |
Next i | |
ActiveCell.Offset(1, (passedData.Count) * -1).Select | |
End Sub | |
Sub GETALLREVIEWS() | |
Dim prodUrl As String | |
prodUrl = InputBox("Please input product URL", "Scraping URL") | |
If Len(prodUrl) = 0 Then | |
MsgBox "You have not provided a valide url", vbCritical | |
Exit Sub | |
End If | |
Dim ws As Worksheet | |
Sheets.Add After:=ActiveSheet | |
Set ws = ActiveSheet | |
GETTHEREVIEWS prodUrl | |
End Sub |
No comments:
Post a Comment