# Lecture 22
#
# Part 1 downloads several Journal of Statistical Software article pages and
# scrapes fields from the saved HTML with rvest.
# Part 2 drives a Firefox browser through RSelenium to capture pages whose
# content is produced dynamically, then parses the saved page source.
#
# NOTE: every empty string passed as a CSS selector ("") is a placeholder
# that must be filled in before the corresponding step is run.

library(tidyverse)
library(rvest)

# Multiple pages ----------------------------------------------------------

# Build the article URLs (issues 01 to 05) and the local file names.
url_vol <- "https://www.jstatsoft.org/article/view/v099i"
issue <- str_c("0", seq(1, 5, 1))
issue

url_list <- str_c(url_vol, issue)
url_list

names <- str_c("vol99_issue", seq(1, 5, 1), ".html")
names[1:5]

# Create the download folder only when it is missing, so that re-running the
# script does not raise an "already exists" warning.
folder <- "jstat/"
if (!dir.exists(folder)) {
  dir.create(folder)
}

# Download each page once; files already on disk are skipped. The pause
# between requests keeps the load on the server low.
for (i in seq_along(url_list)) {
  dest <- str_c(folder, names[i])
  if (!file.exists(dest)) {
    download.file(url_list[i], destfile = dest)
    Sys.sleep(1)
  }
}

list.files(folder)
list_files_path <- list.files(folder, full.names = TRUE)

# Preallocate one slot per downloaded file instead of growing the vectors
# inside the loop.
n_files <- length(list_files_path)
authors <- character(n_files)
title <- character(n_files)
date <- character(n_files)
abstract <- character(n_files)
doi <- character(n_files)

# Parse every saved page and extract one value per field.
# TODO: replace each css = "" placeholder with the selector for that field.
for (i in seq_along(list_files_path)) {
  html <- read_html(list_files_path[i], encoding = "UTF8")

  authors[i] <- html |>
    html_element(css = "") |>
    html_text2()

  title[i] <- html |>
    html_element(css = "") |>
    html_text2()

  date[i] <- html |>
    html_element(css = "") |>
    html_text2()

  abstract[i] <- html |>
    html_element(css = "") |>
    html_text2()

  doi[i] <- html |>
    html_element(css = "") |>
    html_text2()
}

articles <- tibble(
  authors = authors,
  title = title,
  date = date,
  abstract = abstract,
  doi = doi
)
View(articles)

# Dynamic web pages -------------------------------------------------------

# need to install JAVA https://www.java.com/en/download/

# Install the packages only when they are not available yet, so the script
# does not re-install them on every run.
if (!requireNamespace("RSelenium", quietly = TRUE)) {
  install.packages("RSelenium")
}
library(RSelenium)

if (!requireNamespace("netstat", quietly = TRUE)) {
  install.packages("netstat")
}
library(netstat)

## German parliament

# set up connection and start browser to navigate the page
rD <- rsDriver(
  browser = "firefox",
  verbose = FALSE,
  port = free_port(),
  chromever = NULL
)
remDr <- rD[["client"]]

url <- "https://www.bundestag.de/abgeordnete/"
remDr$navigate(url)

# identify and click the list button
# TODO: fill in the CSS selector of the list button.
button <- remDr$findElement(using = "css", value = "")
button$clickElement()

# save the live DOM tree
# The page source is written to "parliament.html" because that is the file
# read back below; file = "" would print to the console instead of saving.
output <- remDr$getPageSource(header = TRUE)
write(output[[1]], file = "parliament.html")

# close the connection
remDr$close()
rD$server$stop()

# TODO: fill in the CSS selector of the elements to extract.
parliament <- read_html("parliament.html", encoding = "utf-8")
parliament |>
  html_elements("") |>
  html_text2() |>
  head()

## Pew Research Statistics: Covid news on social media

# set up connection and start browser to navigate the page
rD <- rsDriver(
  browser = "firefox",
  verbose = FALSE,
  port = free_port(),
  chromever = NULL
)
remDr <- rD[["client"]]

url <- "https://www.pewresearch.org/pathways-2020/covidthreat_a/political_party/democrat_lean_dem/"
remDr$navigate(url)

# identify and clear search field
# TODO: fill in the CSS selector of the search field.
field <- remDr$findElement(using = "css", "")
field$clearElement()

search <- "Getting COVID-19 news on social media"
field$sendKeysToElement(list(search))

# click on the new tab in order to search
# TODO: fill in the CSS selector of the tab.
click <- remDr$findElement(using = "css", "")
click$clickElement()

# identify the list of political party
# TODO: fill in the CSS selector of the list.
css <- ""
list <- remDr$findElement(using = "css", value = css)
list$clickElement()

# and select u.s. adults
# TODO: fill in the CSS selector of the "U.S. adults" entry.
css <- ""
elem <- remDr$findElement(using = "css", value = css)
elem$clickElement()

# select the table output
# TODO: fill in the CSS selector of the table view.
css <- ""
elem <- remDr$findElement(using = "css", value = css)
elem$clickElement()

# save the output
output <- remDr$getPageSource(header = TRUE)
write(output[[1]], file = "covid_news.html")

# close the connection
remDr$close()
rD$server$stop()

# Parse the saved page and convert every <table> element to a data frame.
covid_news <- read_html("covid_news.html", encoding = "utf-8")
covid_news |>
  html_elements("table") |>
  html_table()