# Lecture 22
library(tidyverse)
library(rvest)
library(RSelenium)
library(netstat)

# Scraping Amazon Reviews ----------------------------------

# Start a Selenium-driven Firefox session on a free local port and keep a
# handle on the browser client.
rD <- rsDriver(
  browser = "firefox",
  verbose = FALSE,
  port = free_port(),
  chromever = NULL
)
remDr <- rD[["client"]]

url <- "https://www.amazon.co.uk/product-reviews/B0050QB3EQ/"
remDr$navigate(url)

# Sign in. Fill in `email` and `pwd` locally before running; do not commit
# real credentials to the script.

# identify email field
field <- remDr$findElement(using = "css", "#ap_email")
email <- ""
field$sendKeysToElement(list(email))

# click the "#continue" element
click <- remDr$findElement(using = "css", "#continue")
click$clickElement()

# identify pwd field
field <- remDr$findElement(using = "css", "#ap_password")
pwd <- ""
field$sendKeysToElement(list(pwd))

# click the "#signInSubmit" element
click <- remDr$findElement(using = "css", "#signInSubmit")
click$clickElement()

# Download the review pages ---------------------------------

folder <- "Amazon/"
# showWarnings = FALSE: re-running the script with the folder already in
# place should not emit a warning.
dir.create(folder, showWarnings = FALSE)

names <- str_c("Amazon", seq(1, 10, 1), ".html")
names

# The page currently shown in the browser is saved as the first file.
output <- remDr$getPageSource(header = TRUE)
write(output[[1]], file = str_c(folder, names[1]))

for (i in seq_along(names)[-1]) {
  # Always click "next" so the browser page stays in step with `i`. If the
  # click were skipped for files that already exist, a later missing file
  # would be written with the contents of an earlier page.
  button <- remDr$findElement(using = "css", value = ".a-last")
  button$clickElement()
  Sys.sleep(3) # wait before reading the page source

  # Only write pages that are not on disk yet.
  page_file <- str_c(folder, names[i])
  if (!file.exists(page_file)) {
    output <- remDr$getPageSource(header = TRUE)
    write(output[[1]], file = page_file)
  }
  Sys.sleep(5) # pause before moving on to the next page
}

# close the connection
remDr$close()
rD$server$stop()

# Function to scrape
#
# Read one saved review page from disk and collect its review titles, texts
# and star ratings into a tibble.
#
# Arguments:
#   id    Product id. Not used inside the function body; the file that is
#         read depends only on `folder` and `page`.
#   page  Page number. The file read is str_c(folder, "Amazon", page, ".html"),
#         where `folder` is the global variable defined earlier in the script.
#
# Returns: a tibble with columns `title`, `text`, `star` and `page`.
#
# NOTE(review): every CSS selector below is the empty string "". They look
# like placeholders to be filled in with the real selectors before the
# function can extract anything -- TODO confirm the intended selectors.
amazon_reviews <- function(id, page) {
  file <- str_c(folder, "Amazon", page, ".html")
  html <- read_html(file, encoding = "utf-8")

  # Review title (UK and not-UK)
  title <- html |>
    html_elements("") |>
    html_elements("") |>
    html_text2()
  title <- c(
    title,
    html |>
      html_elements("") |>
      html_text(trim = TRUE)
  )

  # Review text (the same for UK and not-UK)
  text <- html |>
    html_elements("") |>
    html_text(trim = TRUE)

  # Review stars (UK and not-UK)
  star <- html |>
    html_elements("") |>
    html_text2()
  star <- c(
    star,
    html |>
      html_elements("") |>
      html_text2()
  )

  tibble(title, text, star, page = page)
}

# Define the product and the number of pages
id <- "B0050QB3EQ"
page <- 1:10

# Parse every saved page with amazon_reviews() and row-bind the results.
library(purrr)
data <- map_df(page, \(p) amazon_reviews(id, page = p))

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(data)
}

# add a doc_id and save
data <- data |>
  mutate(id = seq_along(text))
# NOTE(review): save() writes R's RData format even though the file is named
# ".rds"; reload it with load("data.rds"), not readRDS().
save("data", file = "data.rds")

# Check the language
# Install cld2 only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("cld2", quietly = TRUE)) {
  install.packages("cld2")
}
library(cld2)

data$title_lang <- detect_language(data$title)
data$text_lang <- detect_language(data$text)
table(Text = data$text_lang, Title = data$title_lang, useNA = "always")

# Keep reviews whose text was detected as English. filter() also drops rows
# where text_lang is NA.
data <- data |>
  filter(text_lang == "en")

# Extract the star
# The score is the first character of `star`, converted to a number.
# assumes the star text begins with the rating digit -- TODO confirm
data <- data |>
  mutate(score = as.numeric(str_sub(star, 1, 1)))
glimpse(data)

# analyse the score
data |>
  summarise(mean(score), min(score), median(score), max(score))
# summary(data$score)

# Frequency and share of each score
data |>
  count(score) |>
  mutate(p = round(n / sum(n), 2))

# Bar chart of the number of reviews per score
data |>
  ggplot(aes(x = score)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Amazon reviews' stars",
    subtitle = "The Theory That Would Not Die, by Mcgrayne",
    x = "Stars",
    y = "Number of comments"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(color = "steelblue", size = 12, face = "bold"),
    plot.subtitle = element_text(color = "steelblue2")
  )