Rvest: Scrape Multiple URLs

Here's one approach using purrr and rvest. The key idea is to save the parsed page, and then extract the bits you're interested in.

library(rvest)
library(purrr)

topmovies <- read_html("http://www.imdb.com/chart/top")
links <- topmovies %>%
  html_nodes(".titleColumn") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  xml2::url_absolute("http://imdb.com") %>%
  .[1:5] # for testing

pages <- links %>% map(read_html)

title <- pages %>%
  map_chr(. %>%
            html_nodes("h1") %>%
            html_text())

rating <- pages %>%
  map_dbl(. %>%
            html_nodes("strong span") %>%
            html_text() %>%
            as.numeric())
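
To gather the pieces into one table, a minimal sketch (assuming each page yields exactly one title and one rating, which map_chr() and map_dbl() already enforce):

library(tibble)

movies <- tibble(link = links, title = title, rating = rating)
movies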

Scrape multiple URLs with rvest

You can use map() (or, in base R, lapply()) to loop over every URL element; here is an example:

library(rvest)
library(purrr)

url <- c("https://www.vox.com/", "https://www.bbc.com/")
page <- map(url, ~ read_html(.x) %>% html_nodes("p") %>% html_text())
str(page)
#List of 2
# $ : chr [1:22] "But he was acquitted on the two most serious charges he faced." "Health experts say it’s time to prepare for worldwide spread on all continents." "Wall Street is waking up to the threat of coronavirus as fears about the disease and its potential global econo"| __truncated__ "Johnson, who died Monday at age 101, did groundbreaking work in helping return astronauts safely to Earth." ...
# $ : chr [1:19] "" "\n The ex-movie mogul is handcuffed and led from cou"| __truncated__ "" "27°C" ...

The return object is a list.

PS: I've changed the second URL element because "https://www.cnn.com/" returned NULL for html_nodes("p") %>% html_text().
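
For the base R lapply() route mentioned above, a minimal equivalent sketch:

library(rvest)

# Base R equivalent: lapply() over the same url vector, returning a list of character vectors
page <- lapply(url, function(u) html_text(html_nodes(read_html(u), "p")))
str(page)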

Scrape multiple URLs from a CSV file with R

Try This:

library(rvest)

URLs <- read.csv("urls.csv")
n <- nrow(URLs)
URLs2 <- character()

# Coerce the first column of the CSV into a character vector of URLs
for (i in 1:n) {
  URLs2[i] <- as.character(URLs[i, 1])
}

df <- data.frame(Row = integer(), Title = character(), Abstract = character(),
                 stringsAsFactors = FALSE)

for (i in 1:n) {
  # If a URL is broken, read_html() errors; substitute a marker string and skip the page below
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e) "empty page")
  if (!identical(webpage, "empty page")) {
    title_data_html <- html_nodes(webpage, '.u-h1')
    title_data <- html_text(title_data_html)
    abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
    abstract_data <- html_text(abstract_data_html)
    temp <- as.data.frame(cbind(Row = match(URLs2[i], URLs2),
                                Title = title_data, Abstract = abstract_data))
    if (ncol(temp) == 3) {
      df <- rbind(df, temp)
    }
  }
}

View(df)

Edit: the code has been adjusted so that it still works when some URLs are broken (those pages are skipped). The output rows are numbered with each entry's corresponding row number in the CSV.
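
As an aside, the first loop can be collapsed into a single line (a sketch, assuming the URLs live in the first column of the CSV):

URLs2 <- as.character(URLs[[1]])  # first column of urls.csv as a character vector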


Adding 'html_attr' to a function scraping multiple URLs using rvest

I figured it out by updating the function; the updated code is below.

scrape_change_page <- function(url) {
  webpage <- xml2::read_html(url)

  # Extract the text for a CSS selector, padding to 10 entries so all columns align
  get_text <- function(css) {
    vec <- rvest::html_text(rvest::html_nodes(webpage, css), trim = TRUE)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  # Extract an attribute for a CSS selector, padded the same way
  get_attr <- function(css, attr) {
    vec <- rvest::html_attr(rvest::html_nodes(webpage, css), attr)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  dplyr::tibble(
    title = get_text('.xs-mbs'),
    date = gsub("Created", "", get_text('.symbol-clock+ span')),
    supporters = gsub(" supporters", "", get_text('.symbol-supporters+ span')),
    addressee = gsub("Petition to ", "", get_text('.xs-mbn .type-s')),
    location = get_text('.plxxs'),
    link = get_attr('.search-results .list-rule a.link-block.js-click-search-result', 'href')
  )
}
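
A usage sketch: the actual search URLs aren't given in the question, so the urls vector below is a hypothetical placeholder; map_dfr() applies the function to each URL and row-binds the resulting tibbles.

library(purrr)

# Hypothetical placeholder URLs -- substitute the real change.org search pages you want
urls <- paste0("https://www.change.org/search?q=example&offset=", c(0, 10, 20))
results <- map_dfr(urls, scrape_change_page)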

Scraping multiple sub-pages of multiple URLs

You could construct links to all the URLs using outer():

all_links <- c(t(outer(df$PostURL, paste0('&page=', 1:3), paste0)))
all_links

# [1] "www.abc.com/2315Azxc&page=1" "www.abc.com/2315Azxc&page=2" "www.abc.com/2315Azxc&page=3"
# [4] "www.abc.com/1478Bnbx&page=1" "www.abc.com/1478Bnbx&page=2" "www.abc.com/1478Bnbx&page=3"
# [7] "www.abc.com/6734Gytr&page=1" "www.abc.com/6734Gytr&page=2" "www.abc.com/6734Gytr&page=3"
#[10] "www.abc.com/8912Jqwe&page=1" "www.abc.com/8912Jqwe&page=2" "www.abc.com/8912Jqwe&page=3"

Now you can use the same lapply code to scrape each page.
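
For example, a minimal sketch of that step (the placeholder links above won't actually resolve, hence the tryCatch guard):

library(rvest)

# Read each constructed link; failures are returned as NULL instead of stopping the loop
pages <- lapply(all_links, function(x) tryCatch(read_html(x), error = function(e) NULL))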

data

df <- structure(list(PostURL = c("www.abc.com/2315Azxc", "www.abc.com/1478Bnbx",
                                 "www.abc.com/6734Gytr", "www.abc.com/8912Jqwe")),
                class = "data.frame", row.names = c(NA, -4L))

Using Rvest to scrape text, table, and combine the two from multiple pages

Consider this approach. We only need to use html_node because your code suggests that there is only one table per page to scrape.

library(tidyverse)
library(rvest)

get_title <- . %>% html_node(xpath = '//*[@id="block-zircon-content"]/a[2]') %>% html_text()
get_table <- . %>% html_node(xpath = '//*[@id="block-zircon-content"]/table') %>% html_table()

urls <- paste0("https://lsgkerala.gov.in/en/lbelection/electdmemberdet/2010/", 225:227)

tibble(urls) %>%
  mutate(
    page = map(urls, read_html),
    newcol = map_chr(page, get_title),
    data = map(page, get_table),
    page = NULL, urls = NULL
  ) %>%
  unnest(data)

Output

# A tibble: 52 x 7
newcol `Ward No.` `Ward Name` `Elected Members` Role Party Reservation
<chr> <int> <chr> <chr> <chr> <chr> <chr>
1 Thiruvananthapuram - Chemmaruthy Grama Panchayat 1 VANDIPPURA BABY P Member CPI(M) Woman
2 Thiruvananthapuram - Chemmaruthy Grama Panchayat 2 PALAYAMKUNNU SREELATHA D Member INC Woman
3 Thiruvananthapuram - Chemmaruthy Grama Panchayat 3 KOVOOR KAVITHA V Member INC Woman
4 Thiruvananthapuram - Chemmaruthy Grama Panchayat 4 SIVAPURAM ANIL. V Member INC General
5 Thiruvananthapuram - Chemmaruthy Grama Panchayat 5 MUTHANA JAYALEKSHMI S Member INC Woman
6 Thiruvananthapuram - Chemmaruthy Grama Panchayat 6 MAVINMOODU S SASIKALA NATH Member CPI(M) Woman
7 Thiruvananthapuram - Chemmaruthy Grama Panchayat 7 NJEKKADU P.MANILAL Member INC General
8 Thiruvananthapuram - Chemmaruthy Grama Panchayat 8 CHEMMARUTHY SASEENDRA President INC Woman
9 Thiruvananthapuram - Chemmaruthy Grama Panchayat 9 PANCHAYAT OFFICE PRASANTH PANAYARA Member INC General
10 Thiruvananthapuram - Chemmaruthy Grama Panchayat 10 VALIYAVILA SANJAYAN S Member INC General
# ... with 42 more rows

