Rvest: Scrape Multiple URLs

Here's one approach using purrr and rvest. The key idea is to save the parsed page, and then extract the bits you're interested in.

library(rvest)
library(purrr)

topmovies <- read_html("http://www.imdb.com/chart/top")
links <- topmovies %>%
  html_nodes(".titleColumn") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  xml2::url_absolute("http://imdb.com") %>%
  .[1:5] # for testing

pages <- links %>% map(read_html)

title <- pages %>%
  map_chr(. %>%
            html_nodes("h1") %>%
            html_text())

rating <- pages %>%
  map_dbl(. %>%
            html_nodes("strong span") %>%
            html_text() %>%
            as.numeric())
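
To gather the pieces into one table, a minimal sketch (assuming each page yields exactly one title and one rating, which map_chr() and map_dbl() already enforce):

library(tibble)

movies <- tibble(link = links, title = title, rating = rating)
movies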

Scrape multiple URLs with rvest

You can use map() (or, in base R, lapply()) to loop over every URL element; here is an example:

library(rvest)
library(purrr)

url <- c("https://www.vox.com/", "https://www.bbc.com/")
page <- map(url, ~ read_html(.x) %>% html_nodes("p") %>% html_text())
str(page)
#List of 2
# $ : chr [1:22] "But he was acquitted on the two most serious charges he faced." "Health experts say it’s time to prepare for worldwide spread on all continents." "Wall Street is waking up to the threat of coronavirus as fears about the disease and its potential global econo"| __truncated__ "Johnson, who died Monday at age 101, did groundbreaking work in helping return astronauts safely to Earth." ...
# $ : chr [1:19] "" "\n The ex-movie mogul is handcuffed and led from cou"| __truncated__ "" "27°C" ...

The return object is a list.

PS: I've changed the second URL element because "https://www.cnn.com/" returned NULL for html_nodes("p") %>% html_text().
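
For the base R lapply() route mentioned above, a minimal equivalent sketch:

library(rvest)

# Base R equivalent: lapply() over the same url vector, returning a list of character vectors
page <- lapply(url, function(u) html_text(html_nodes(read_html(u), "p")))
str(page)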

Scrape multiple URLs from a CSV file with R

Try This:

library(rvest)

URLs <- read.csv("urls.csv")
n <- nrow(URLs)
URLs2 <- character()

# Coerce the first column of the CSV into a character vector of URLs
for (i in 1:n) {
  URLs2[i] <- as.character(URLs[i, 1])
}

df <- data.frame(Row = integer(), Title = character(), Abstract = character(),
                 stringsAsFactors = FALSE)

for (i in 1:n) {
  # If a URL is broken, read_html() errors; substitute a marker string and skip the page below
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e) "empty page")
  if (!identical(webpage, "empty page")) {
    title_data_html <- html_nodes(webpage, '.u-h1')
    title_data <- html_text(title_data_html)
    abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
    abstract_data <- html_text(abstract_data_html)
    temp <- as.data.frame(cbind(Row = match(URLs2[i], URLs2),
                                Title = title_data, Abstract = abstract_data))
    if (ncol(temp) == 3) {
      df <- rbind(df, temp)
    }
  }
}

View(df)

Edit: the code has been adjusted so that it still works when some URLs are broken (those pages are skipped). The output rows are numbered with each entry's corresponding row number in the CSV.
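
As an aside, the first loop can be collapsed into a single line (a sketch, assuming the URLs live in the first column of the CSV):

URLs2 <- as.character(URLs[[1]])  # first column of urls.csv as a character vector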


Adding 'html_attr' to a function scraping multiple URLs using rvest

I figured it out by updating the function; the updated code is below.

scrape_change_page <- function(url) {
  webpage <- xml2::read_html(url)

  # Extract the text for a CSS selector, padding to 10 entries so all columns align
  get_text <- function(css) {
    vec <- rvest::html_text(rvest::html_nodes(webpage, css), trim = TRUE)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  # Extract an attribute for a CSS selector, padded the same way
  get_attr <- function(css, attr) {
    vec <- rvest::html_attr(rvest::html_nodes(webpage, css), attr)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  dplyr::tibble(
    title = get_text('.xs-mbs'),
    date = gsub("Created", "", get_text('.symbol-clock+ span')),
    supporters = gsub(" supporters", "", get_text('.symbol-supporters+ span')),
    addressee = gsub("Petition to ", "", get_text('.xs-mbn .type-s')),
    location = get_text('.plxxs'),
    link = get_attr('.search-results .list-rule a.link-block.js-click-search-result', 'href')
  )
}
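
A usage sketch: the actual search URLs aren't given in the question, so the urls vector below is a hypothetical placeholder; map_dfr() applies the function to each URL and row-binds the resulting tibbles.

library(purrr)

# Hypothetical placeholder URLs -- substitute the real change.org search pages you want
urls <- paste0("https://www.change.org/search?q=example&offset=", c(0, 10, 20))
results <- map_dfr(urls, scrape_change_page)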

Scraping multiple sub-pages of multiple URLs

You could construct links to all the URLs using outer():

all_links <- c(t(outer(df$PostURL, paste0('&page=', 1:3), paste0)))
all_links

# [1] "www.abc.com/2315Azxc&page=1" "www.abc.com/2315Azxc&page=2" "www.abc.com/2315Azxc&page=3"
# [4] "www.abc.com/1478Bnbx&page=1" "www.abc.com/1478Bnbx&page=2" "www.abc.com/1478Bnbx&page=3"
# [7] "www.abc.com/6734Gytr&page=1" "www.abc.com/6734Gytr&page=2" "www.abc.com/6734Gytr&page=3"
#[10] "www.abc.com/8912Jqwe&page=1" "www.abc.com/8912Jqwe&page=2" "www.abc.com/8912Jqwe&page=3"

Now you can use the same lapply code to scrape each page.
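
For example, a minimal sketch of that step (the placeholder links above won't actually resolve, hence the tryCatch guard):

library(rvest)

# Read each constructed link; failures are returned as NULL instead of stopping the loop
pages <- lapply(all_links, function(x) tryCatch(read_html(x), error = function(e) NULL))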

data

df <- structure(list(PostURL = c("www.abc.com/2315Azxc", "www.abc.com/1478Bnbx",
                                 "www.abc.com/6734Gytr", "www.abc.com/8912Jqwe")),
                class = "data.frame", row.names = c(NA, -4L))

Using Rvest to scrape text, table, and combine the two from multiple pages

Consider this approach. We only need to use html_node because your code suggests that there is only one table per page to scrape.

library(tidyverse)
library(rvest)

get_title <- . %>% html_node(xpath = '//*[@id="block-zircon-content"]/a[2]') %>% html_text()
get_table <- . %>% html_node(xpath = '//*[@id="block-zircon-content"]/table') %>% html_table()

urls <- paste0("https://lsgkerala.gov.in/en/lbelection/electdmemberdet/2010/", 225:227)

tibble(urls) %>%
  mutate(
    page = map(urls, read_html),
    newcol = map_chr(page, get_title),
    data = map(page, get_table),
    page = NULL, urls = NULL
  ) %>%
  unnest(data)

Output

# A tibble: 52 x 7
newcol `Ward No.` `Ward Name` `Elected Members` Role Party Reservation
<chr> <int> <chr> <chr> <chr> <chr> <chr>
1 Thiruvananthapuram - Chemmaruthy Grama Panchayat 1 VANDIPPURA BABY P Member CPI(M) Woman
2 Thiruvananthapuram - Chemmaruthy Grama Panchayat 2 PALAYAMKUNNU SREELATHA D Member INC Woman
3 Thiruvananthapuram - Chemmaruthy Grama Panchayat 3 KOVOOR KAVITHA V Member INC Woman
4 Thiruvananthapuram - Chemmaruthy Grama Panchayat 4 SIVAPURAM ANIL. V Member INC General
5 Thiruvananthapuram - Chemmaruthy Grama Panchayat 5 MUTHANA JAYALEKSHMI S Member INC Woman
6 Thiruvananthapuram - Chemmaruthy Grama Panchayat 6 MAVINMOODU S SASIKALA NATH Member CPI(M) Woman
7 Thiruvananthapuram - Chemmaruthy Grama Panchayat 7 NJEKKADU P.MANILAL Member INC General
8 Thiruvananthapuram - Chemmaruthy Grama Panchayat 8 CHEMMARUTHY SASEENDRA President INC Woman
9 Thiruvananthapuram - Chemmaruthy Grama Panchayat 9 PANCHAYAT OFFICE PRASANTH PANAYARA Member INC General
10 Thiruvananthapuram - Chemmaruthy Grama Panchayat 10 VALIYAVILA SANJAYAN S Member INC General
# ... with 42 more rows

