Thursday, July 16, 2020

VBA: Here's How to Crawl an Amazon Product Reviews


Web Scraping, a clever approach to extract information from layers and layers of html of a web page. It's a a crude way and an alternative to some developers who don't the right access to an API. I'm sharing to an approach if you need to scrape an Amazon review of a product.

The script that I'm going to share is good only for single product URL scraping. This is to demonstrate how VBA can still be effective for those low tier automation implementation. Basically, applicable to developers still in exploring phase, students and hobbyist just want to have some fun.

I've taken a simple approach and haven't taken time to document. This is a project just came to at instance decided to write this in Excel VBA. Depending on the number reviews in that product, the script will take 2~3 mins to run. Let me know in the comments below your thoughts. 

To run the code, you need to select "GETALLREVIEWS" in your macro's list and you will be prompted with an input box where you can paste the URL of the target product listed in Amazon. The script will return to you a new spreadsheet with the list of the reviews successfully scraped.

The programming language used in this application is VBA(Visual Basic for Application). If you want to learn this coding technique, click Buy Now button below for a book to guide you with your learning.
Here's the script. Have fun
Attribute VB_Name = "Amazon_Reviews_Scrape"
Function GETPAGEREVIEWSURL(href As String, pageNum As Variant) As String
Dim splitter As Variant
splitter = Split(href, "/")
splitter(4) = "product-reviews"
If pageNum = 1 Then
splitter(6) = "ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
ElseIf pageNum = 2 Then
splitter(6) = "ref=cm_cr_arp_d_paging_btm_next_" & pageNum & "?ie=UTF8&reviewerType=all_reviews&pageNumber=" & pageNum
Else
splitter(6) = "ref=cm_cr_getr_d_paging_btm_next_" & pageNum & "?ie=UTF8&reviewerType=all_reviews&pageNumber=" & pageNum
End If
GETPAGEREVIEWSURL = Join(splitter, "/")
End Function
Function GETPROD_AMAZONPARAM(href As String) As Object
Dim param As New Collection
Dim splitter As Variant
splitter = Split(href, "/")
param.Add splitter(5)
param.Add splitter(3)
Set GETPROD_AMAZONPARAM = param
End Function
Function GETREVIEWURL(ASIN As String, ID As String) As String
GETREVIEWURL = "https://www.amazon.com/gp/customer-reviews/" & ID & "/ref=cm_cr_arp_d_rvw_ttl?ie=UTF8&ASIN=" & ASIN
End Function
Function GETREVIEWSCORE(score_string As String) As Integer
Dim splitter As Variant
splitter = Split(score_string, " out of ")
GETREVIEWSCORE = CInt(splitter(0))
End Function
Function GETREVIEWCOUNTRY_DATE(countryDateString As String) As Object
Dim revieTiming As New Collection
Dim splitter As Variant
Dim inTheSplitCheck As Variant
splitter = Split(countryDateString, " on ")
revieTiming.Add splitter(1)
inTheSplitCheck = Split(splitter(0), " in the ")
If UBound(inTheSplitCheck) = 0 Then inTheSplitCheck = Split(splitter(0), " in ")
revieTiming.Add inTheSplitCheck(1)
Set GETREVIEWCOUNTRY_DATE = revieTiming
End Function
Function GETUPVOTECOUNT(upvoteStr As String) As Integer
Dim splitter As Variant
splitter = Split(upvoteStr, " ")
If splitter(0) = "One" Then splitter(0) = 1
GETUPVOTECOUNT = CInt(splitter(0))
End Function
Function GETREVIEWSPAGESCOUNT(el As MSHTML.IHTMLElement) As Integer
Dim el_innerText As String
Dim splitter As Variant
Dim pageCount As Integer
el_innerText = el.innerText
splitter = Split(el_innerText, " of ")
splitter = Split(splitter(1), " ")
pageCount = CInt(splitter(0))
If pageCount Mod 10 > 0 Then pageCount = ((pageCount / 10) - (pageCount Mod 10 / 10)) + 1
GETREVIEWSPAGESCOUNT = pageCount
End Function
Function GETREVIEWIDS(href As String) As Object
Dim href1 As String
Dim IE As New SHDocVw.InternetExplorer, IE2 As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument, HTMLDoc2 As MSHTML.HTMLDocument, HTMLDoc3 As MSHTML.HTMLDocument
Dim HTMLDocElements As MSHTML.IHTMLElementCollection
Dim HTMLDocElement As MSHTML.IHTMLElement
Dim REVIEW_IDS As New Collection
Dim US_TOTAL_PAGE_REVIEWS As Integer
'THIS IS THE INITIAL URL CRAWL OF THE MAIN PRODUCT PAGE
IE.navigate href
IE.Visible = False
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO
Do While IE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading page main project page..."
Loop
Set HTMLDoc = IE.document
'GET THE INTERNATIONAL REVIEWS ID
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE INTERNATIONAL REVIEWS ARE CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-section review aok-relative cr-desktop-review-page-0")
If HTMLDocElements.Length <> 0 Then
Debug.Print vbCr & "International Reviews" & vbCr & "-----------------------------"
Application.StatusBar = "Extracting International Review IDs..."
'ITERATE EACH HTML ELEMENTS COLLECTED
For Each HTMLDocElement In HTMLDocElements
'COLLECT THE EXTRACTED REVIEW ID
REVIEW_IDS.Add HTMLDocElement.getAttribute("id")
Debug.Print HTMLDocElement.getAttribute("id")
Next HTMLDocElement
End If
'CAPTURE THE HIDDEN FRAMES WHERE SOME OF THE INTERNATIONAL REVIEWS ARE CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-section review aok-relative cr-desktop-review-page-1 aok-hidden")
If HTMLDocElements.Length <> 0 Then
Debug.Print vbCr & "Hidden International Reviews" & vbCr & "-----------------------------"
Application.StatusBar = "Extracting Hidden International Review IDs..."
'ITERATE EACH HTML ELEMENTS COLLECTED
For Each HTMLDocElement In HTMLDocElements
'COLLECT THE EXTRACTED REVIEW ID
REVIEW_IDS.Add HTMLDocElement.getAttribute("id")
Debug.Print HTMLDocElement.getAttribute("id")
Next HTMLDocElement
End If
IE.Quit
'START OF US REVIEWS CRAWL
'CONVERT THE INITIAL URL ABOVE AND CONVERTS IT TO THE PRODUCT REVIEW PAGE 1
'THIS IS NEEDED TO CAPTURE EXPECTED NUMBER REVIEW PAGES
href1 = GETPAGEREVIEWSURL(href, 1)
IE2.navigate href1
IE2.Visible = False
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO
Do While IE2.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading page..."
Loop
Set HTMLDoc2 = IE2.document
'GET THE EXPECTED NUMBER OF US REVIEW PAGES
Set HTMLDocElements = HTMLDoc2.getElementsByClassName("a-size-base")
For Each HTMLDocElement In HTMLDocElements
Dim x_test As String
x = HTMLDocElement.getAttribute("data-hook")
If HTMLDocElement.getAttribute("data-hook") = "cr-filter-info-review-count" Then
US_TOTAL_PAGE_REVIEWS = GETREVIEWSPAGESCOUNT(HTMLDocElement)
Exit For
End If
Next HTMLDocElement
'CRAWL THE FIRST 10 US REVIEWS
'CAPTURE THE ELEMENT WHERE THE FIRST 10 US REVIEW IDS ARE CONTAINED
Set HTMLDocElements = HTMLDoc2.getElementsByClassName("a-section review aok-relative")
Debug.Print vbCr & "First 10 US Reviews" & vbCr & "-----------------------------"
Application.StatusBar = "Extracting First 10 US Review ID's..."
'ITERATE EACH HTML ELEMENTS COLLECTED
For Each HTMLDocElement In HTMLDocElements
'COLLECT THE EXTRACTED REVIEW ID
REVIEW_IDS.Add HTMLDocElement.getAttribute("id")
Debug.Print HTMLDocElement.getAttribute("id")
Next HTMLDocElement
'TERMINATES IE
IE2.Quit
'CRAWL THE NEXT REVIEW PAGES FOR ITS REVIEW IDS
'GET THE US REVIEW IDS
For i = 2 To US_TOTAL_PAGE_REVIEWS
'TRANSFORMS THE INITIAL URL TO MATCH EACH REVIEW PAGE URL
href1 = GETPAGEREVIEWSURL(href, i)
Debug.Print vbCr & "Crawling Page " & i & "..." & vbCr & "-----------------------------"
Dim IE3 As New SHDocVw.InternetExplorer
IE3.navigate href1 '
IE3.Visible = False
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO
Do While IE3.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading Review pages..."
Loop
Set HTMLDoc3 = IE3.document
'COLLECTS THE ELEMENT WITH CLASS NAME THAT CONTAINTS THE REVIEW ID
Set HTMLDocElements = HTMLDoc3.getElementsByClassName("a-section review aok-relative")
Application.StatusBar = "Extracting Rest of the US Review ID's..."
'ITERATES EACH ELEMENTS
For Each HTMLDocElement In HTMLDocElements
REVIEW_IDS.Add HTMLDocElement.getAttribute("id")
Debug.Print HTMLDocElement.getAttribute("id")
Next HTMLDocElement
Next i
IE3.Quit
Set GETREVIEWIDS = REVIEW_IDS
End Function
Function GETTHEREVIEWS(href As String) As Object
Dim PROD_AMAZONPARAM As New Collection
Dim REVIEW_IDS As New Collection
Dim REVIEW_ID As Variant
Dim ASIN As String
Dim PROD_DESC As String
Dim DATABASE_PATH As String
Dim i As Integer
DATABASE_PATH = GET_USERDBPATH
Set REVIEW_IDS = GETREVIEWIDS(href)
Set PROD_AMAZONPARAM = GETPROD_AMAZONPARAM(href)
ASIN = PROD_AMAZONPARAM(1)
'Product Description as stated in the href
PROD_DESC = PROD_AMAZONPARAM(2)
Debug.Print vbCr & "Extracting the Reviews Info..." & vbCr & "-----------------------------"
i = 1
For Each REVIEW_ID In REVIEW_IDS
Dim href1 As String
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument
Dim HTMLDocElements As MSHTML.IHTMLElementCollection
Dim HTMLDocElement As MSHTML.IHTMLElement
Dim REVIEW_URL As String
Dim REVIEWER_PROFILE_NAME As String
Dim REVIEW_SCORE As Integer
Dim REVIEW_TIMING As New Collection
Dim REVIEW_DATE As String
Dim REVIEW_COUNTRY As String
Dim PRODUCT_MODEL As String
Dim BADGE_INFO As String
Dim REVIEW_TITLE As String
Dim REVIEW_COMMENT As String
Dim UPVOTES As Integer
Dim REVIEW_NUM_OF_COMMENTS As Integer
Dim DID_MANUFACTURER_RESPONDED As String
Dim MANUFACTURERS_RESPONSE As String
Dim MANUFACTURERS_REPLIES As New Collection
UPVOTES = 0
REVIEW_NUM_OF_COMMENTS = 0
DID_MANUFACTURER_RESPONDED = "N"
MANUFACTURERS_RESPONSE = vbNullString
REVIEW_URL = GETREVIEWURL(ASIN, CStr(REVIEW_ID))
'OPEN THE REVIEW URL
IE.navigate REVIEW_URL
'THIS IS TO MAKES THE PAGE IS LOADED BEFORE SCRAPING FOR INFO
Do While IE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading individual review page..."
Loop
Set HTMLDoc = IE.document
'GET THE REVIEW PAGE ESSENTIAL INFO
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE PROFILE NAME IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-profile-name")
REVIEWER_PROFILE_NAME = HTMLDocElements(0).innerText
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE SCORE IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-icon-alt")
REVIEW_SCORE = GETREVIEWSCORE(HTMLDocElements(0).innerText)
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE REVIEW TITLE IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold")
REVIEW_TITLE = HTMLDocElements(0).innerText
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE REVIEW DATE AND COUNTRY IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base a-color-secondary review-date")
Set REVIEW_TIMING = GETREVIEWCOUNTRY_DATE(HTMLDocElements(0).innerText)
REVIEW_DATE = REVIEW_TIMING.Item(1)
REVIEW_COUNTRY = REVIEW_TIMING.Item(2)
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE PRODUCT MODEL IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-mini a-link-normal a-color-secondary")
If HTMLDocElements.Length <> 0 Then PRODUCT_MODEL = HTMLDocElements(0).innerText
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE PROFILE BADGE IS CONTAINED
'CHECK FOR AVP-BADGE = Verified Purchase
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-mini a-color-state a-text-bold")
'CHECK FOR VINE VOICE INFO = Vine Customer Review of Free Product
If HTMLDocElements.Length = 0 Then Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-color-success a-text-bold")
'CHEDK IF THERE ARE BADGES PRESENT
If HTMLDocElements.Length = 0 Then
BADGE_INFO = ""
Else
BADGE_INFO = HTMLDocElements(0).innerText
End If
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE REVIEW TEXT IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base review-text review-text-content")
REVIEW_COMMENT = WorksheetFunction.Trim(WorksheetFunction.Clean(HTMLDocElements(0).innerText))
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE UPVOTE INFO IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-size-base a-color-tertiary cr-vote-text")
'CHECK IF THERE ARE REVIEWS UPVOTES
If HTMLDocElements.Length <> 0 Then UPVOTES = GETUPVOTECOUNT(HTMLDocElements(0).innerText)
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE NUMBER OF COMMENTS IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("review-comment-total aok-hidden")
If HTMLDocElements.Length <> 0 Then REVIEW_NUM_OF_COMMENTS = CInt(HTMLDocElements(0).innerText)
'CAPTURE THE CLASS NAME OF THE SAMPLE WHERE THE MANUFACTURER'S COMMENTS/REPLY IS CONTAINED
Set HTMLDocElements = HTMLDoc.getElementsByClassName("a-box a-spacing-large official-comment-container")
If HTMLDocElements.Length <> 0 Then
For Each HTMLDocElement In HTMLDocElements
MANUFACTURERS_REPLIES.Add HTMLDocElement.innerText
Next HTMLDocElement
End If
If MANUFACTURERS_REPLIES.Count <> 0 Then
DID_MANUFACTURER_RESPONDED = "Y"
MANUFACTURERS_RESPONSE = MANUFACTURERS_REPLIES(1)
End If
Application.StatusBar = "Extracting Reviews by ID " & Format(i / REVIEW_IDS.Count, "0%") & " complete"
Debug.Print ASIN, PROD_DESC, PRODUCT_MODEL, REVIEW_ID, REVIEW_URL, REVIEW_COUNTRY, REVIEW_DATE, REVIEWER_PROFILE_NAME, BADGE_INFO, REVIEW_SCORE, REVIEW_TITLE, UPVOTES, REVIEW_COMMENT, REVIEW_NUM_OF_COMMENTS, DID_MANUFACTURER_RESPONDED, MANUFACTURERS_RESPONSE
Dim REVIEW_DATA As New Collection
REVIEW_DATA.Add ASIN
REVIEW_DATA.Add PROD_DESC
REVIEW_DATA.Add PRODUCT_MODEL
REVIEW_DATA.Add REVIEW_ID
REVIEW_DATA.Add REVIEW_URL
REVIEW_DATA.Add REVIEW_COUNTRY
REVIEW_DATA.Add REVIEW_DATE
REVIEW_DATA.Add REVIEWER_PROFILE_NAME
REVIEW_DATA.Add BADGE_INFO
REVIEW_DATA.Add REVIEW_SCORE
REVIEW_DATA.Add REVIEW_TITLE
REVIEW_DATA.Add UPVOTES
REVIEW_DATA.Add REVIEW_COMMENT
REVIEW_DATA.Add REVIEW_NUM_OF_COMMENTS
REVIEW_DATA.Add DID_MANUFACTURER_RESPONDED
REVIEW_DATA.Add MANUFACTURERS_RESPONSE
WRITETOSHEET REVIEW_DATA
Set REVIEW_DATA = Nothing
i = i + 1
Next REVIEW_ID
IE.Quit
End Function
Sub WRITETOSHEET(passedData As Object)
For i = 1 To passedData.Count
ActiveCell.Value = passedData(i)
ActiveCell.Offset(0, 1).Select
Next i
ActiveCell.Offset(1, (passedData.Count) * -1).Select
End Sub
Sub GETALLREVIEWS()
Dim prodUrl As String
prodUrl = InputBox("Please input product URL", "Scraping URL")
If Len(prodUrl) = 0 Then
MsgBox "You have not provided a valide url", vbCritical
Exit Sub
End If
Dim ws As Worksheet
Sheets.Add After:=ActiveSheet
Set ws = ActiveSheet
GETTHEREVIEWS prodUrl
End Sub

No comments:

Post a Comment