# Lecture 22
library(tidyverse)
library(rvest)
library(RSelenium)
library(netstat)

# Scraping Amazon Reviews  ----------------------------------

# Launch a Selenium server together with a Firefox client on a free local port.
# NOTE(review): chromever = NULL presumably keeps rsDriver() from setting up
# ChromeDriver, which a Firefox session does not use - confirm in the docs.
rD <- rsDriver(
  browser = "firefox",
  port = free_port(),
  chromever = NULL,
  verbose = FALSE
)

# Browser handle through which every later command is sent
remDr <- rD$client

# Open the review listing of the product to scrape
url <- "https://www.amazon.co.uk/product-reviews/B0050QB3EQ/"
remDr$navigate(url)

# Sign in to Amazon
# Credentials are read from environment variables so that they never have to
# be written into this file. Both fall back to "" (the original placeholders)
# and can still be overwritten by hand before running the lines below.
email <- Sys.getenv("AMAZON_EMAIL", unset = "")
pwd <- Sys.getenv("AMAZON_PASSWORD", unset = "")

# identify the email field and type the address into it
field <- remDr$findElement(using = "css", "#ap_email")
field$sendKeysToElement(list(email))

# click on the "continue" button
click <- remDr$findElement(using = "css", "#continue")
click$clickElement()

# give the password form time to load before looking for its field
Sys.sleep(3)

# identify the pwd field and type the password into it
field <- remDr$findElement(using = "css", "#ap_password")
field$sendKeysToElement(list(pwd))

# click on the "sign in" button
click <- remDr$findElement(using = "css", "#signInSubmit")
click$clickElement()

# wait for the browser to land back on the review page before it is saved
Sys.sleep(3)

# Local cache for the downloaded pages. showWarnings = FALSE keeps a re-run of
# the script quiet when the folder already exists.
folder <- "Amazon/"
dir.create(folder, showWarnings = FALSE)

# One file name per review page: Amazon1.html, ..., Amazon10.html
names <- str_c("Amazon", seq_len(10), ".html")
names

# Save the page the browser is showing right now (expected to be the first
# review page) under the first file name.
output <- remDr$getPageSource(header = TRUE)
write(output[[1]], file = str_c(folder, names[1]))



# Walk through the remaining review pages and cache each one on disk.
# The "next" button is clicked on every iteration, even when the file for that
# page already exists: skipping the click would leave the browser on an
# earlier page, and the next missing file would then be filled with the wrong
# page. seq_along(names)[-1] is empty when there is only one file name,
# whereas 2:length(names) would count backwards.
for (i in seq_along(names)[-1]) {
  # identify and click the next button
  button <- remDr$findElement(using = "css", value = ".a-last")
  button$clickElement()
  Sys.sleep(3) # let the new page load

  # write the page only when it has not been cached by an earlier run
  target <- str_c(folder, names[i])
  if (!file.exists(target)) {
    output <- remDr$getPageSource(header = TRUE)
    write(output[[1]], file = target)
  }
  Sys.sleep(5) # pause before requesting the following page
}

# Shut down: end the browser session first, then stop the Selenium server
remDr$close()
rD[["server"]]$stop()


# Function to scrape

#' Parse one saved Amazon review page into a tibble
#'
#' Reads the file `Amazon<page>.html` from the directory named by the global
#' variable `folder` and extracts the review titles, texts and star ratings.
#'
#' NOTE(review): every CSS selector below is an empty string in this file.
#' They look like placeholders to be filled in; an empty selector is
#' presumably rejected by rvest, so the function cannot run until they are.
#'
#' @param id Product identifier. Not used by the function body; kept so that
#'   existing calls keep working.
#' @param page Page number. Used to build the file name and stored in the
#'   `page` column of the result.
#' @return A tibble with the columns `title`, `text`, `star` and `page`.
amazon_reviews <- function(id, page) {
  # `folder` is not an argument: it is taken from the enclosing (global)
  # environment and must be defined before this function is called.
  path <- str_c(folder, "Amazon", page, ".html")
  html <- read_html(path, encoding = "utf-8")

  # Review title (UK and not-UK)
  title_uk <- html |>
    html_elements("") |>
    html_elements("") |>
    html_text2()
  title_other <- html |>
    html_elements("") |>
    html_text(trim = TRUE)
  title <- c(title_uk, title_other)

  # Review text (the same for UK and not-UK)
  text <- html |>
    html_elements("") |>
    html_text(trim = TRUE)

  # Review stars (UK and not-UK)
  star_uk <- html |>
    html_elements("") |>
    html_text2()
  star_other <- html |>
    html_elements("") |>
    html_text2()
  star <- c(star_uk, star_other)

  # One row per review; column names are taken from the variable names
  tibble(title, text, star, page = page)
}

# Define the product and the number of pages
id <- "B0050QB3EQ"
page <- 1:10
library(purrr)

# Parse every saved page and stack the per-page tibbles row-wise
data <- map_df(page, \(p) amazon_reviews(id, page = p))

# View() needs a GUI data viewer, so only open it in an interactive session;
# this lets the script also run non-interactively.
if (interactive()) {
  View(data)
}

# add a doc_id (stored in the column `id`) and save
data <- data |>
  mutate(id = seq_along(text))
# NOTE(review): save() writes an RData-format file that must be read back with
# load(), not readRDS(), despite the ".rds" extension. Left unchanged in case
# other scripts load() this file; consider saveRDS(data, "data.rds").
save("data", file = "data.rds")

# Check the language
# Install cld2 only when it is missing, so that re-running the script does not
# re-download and re-install the package every time.
if (!requireNamespace("cld2", quietly = TRUE)) {
  install.packages("cld2")
}
library(cld2)

# Detected language of each review title and each review text
data$title_lang <- detect_language(data$title)
data$text_lang <- detect_language(data$text)

# Cross-tabulate text language against title language, NA included
table(Text = data$text_lang, Title = data$title_lang, useNA = "always")

# Keep only the reviews whose text was detected as English. Rows where
# text_lang is NA are dropped as well, since filter() keeps only TRUE.
data <- data |>
  filter(text_lang == "en")



# Extract the star
# The numeric score is the first character of the `star` string
data <- data |>
  mutate(
    score = star |>
      str_sub(1, 1) |>
      as.numeric()
  )
glimpse(data)


# analyse the score
# Mean, minimum, median and maximum of the score in a one-row table
summarise(data, mean(score), min(score), median(score), max(score))
# summary(data$score)

# Number of reviews per score and their share of the total (two decimals)
count(data, score) |>
  mutate(p = round(n / sum(n), digits = 2))

# Bar chart with the number of reviews for each star score
ggplot(data, aes(x = score)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Amazon reviews' stars",
    subtitle = "The Theory That Would Not Die, by Mcgrayne",
    x = "Stars",
    y = "Number of comments"
  ) +
  theme_bw() +
  # title and subtitle colours are set after theme_bw() so they are not reset
  theme(
    plot.title = element_text(color = "steelblue", size = 12, face = "bold"),
    plot.subtitle = element_text(color = "steelblue2")
  )