Scraping with rvest - Complete with NAs When Tag Is Not Present

Scraping with rvest - complete with NAs when tag is not present

If the tag is not found, rvest returns a character(0). So assuming you will find at most one current and one regular price in each div.product_price, you can use this:

pacman::p_load("rvest", "dplyr")

#' Extract the previous and the current price from one product node.
#'
#' @param node A single node (e.g. one `div.product_price`) to search in.
#' @return A one-row data.frame with the character columns `precio.antes`
#'   and `precio.actual`. A price whose tag is absent is `NA`; if a tag
#'   matches more than once, only the first match is kept.
get_prices <- function(node) {
  # When the tag is absent the extracted text is character(0).
  r.precio.antes <- html_text(html_nodes(node, "p.normal_encontrado"))
  r.precio.actual <- html_text(html_nodes(node, "div.price"))

  # A typed NA keeps both columns character in every row, so the per-node
  # results can be row-bound without type conflicts.
  first_or_na <- function(x) if (length(x) == 0) NA_character_ else x[[1]]

  data.frame(
    precio.antes = first_or_na(r.precio.antes),
    precio.actual = first_or_na(r.precio.actual),
    stringsAsFactors = FALSE
  )
}

# Parse the page once and select one container node per product.
doc <- read_html("test.html") %>% html_nodes("div.product_price")
# Build one row per product and stack them. bind_rows() is the dplyr
# replacement for rbind_all(), which is no longer available.
lapply(doc, get_prices) %>%
  bind_rows()

Edited: I misunderstood the input data, so changed the script to work with just a single html page.

Scraping with rvest with variable tags

Found the solution:

# Read the html. read_html() needs the bare URL as its argument.
pg <- read_html("https://www.bger.ch/ext/eurospider/live/de/php/aza/http/index.php?lang=de&type=simple_query&query_words=&lang=de&top_subcollection_aza=all&from_date=01.01.2017&to_date=05.01.2017&x=0&y=0")

# Replace a length-0 result with NA so that every field has a value.
or_na <- function(x) if (length(x) == 0) NA else x

xdf <- pg %>%
  html_nodes("div.ranklist_content ol li") %>% # select enclosing nodes
  # iterate over each, pulling out desired parts and coerce to data.frame
  map_df(~ list(
    link = html_nodes(.x, ".rank_title a") %>%
      html_attr("href") %>%
      or_na(),
    title = html_nodes(.x, ".rank_title a") %>%
      html_text() %>%
      or_na(),
    publication_link = html_nodes(.x, ".published_info a") %>%
      html_attr("href") %>%
      or_na(),
    publication = html_nodes(.x, ".published_info a") %>%
      html_text() %>%
      or_na(),
    court = html_nodes(.x, ".rank_data .court") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    subject = html_nodes(.x, ".rank_data .subject") %>%
      html_text(trim = TRUE) %>%
      or_na(),
    object = html_nodes(.x, ".rank_data .object") %>%
      html_text(trim = TRUE) %>%
      or_na()
  ))

It would be very nice if someone could help me to extract the title of class="published_info small normal".

Inputting NA where there are missing values when scraping with rvest

The simplest way is to select a node that encloses both of the nodes you want for each row, then iterate over them, pulling out both of the nodes you want at once. purrr::map_df is handy for not only iterating, but even combining the results into a nice tibble:

library(rvest)
library(purrr)

url <- "https://channel9.msdn.com/Events/useR-international-R-User-conferences/useR-International-R-User-2017-Conference?sort=status&direction=desc&page=14"

page <- read_html(url)

df <- page %>%
  html_nodes("article") %>%
  # One row per enclosing <article>; a piece that is not found becomes NA
  # instead of silently shortening the column.
  map_df(function(article) {
    title_text <- html_text(html_nodes(article, "h3 a"))
    length_text <- html_text(html_nodes(article, ".tile .caption"))
    list(
      title = if (length(title_text) == 0) NA else title_text,
      length = if (length(length_text) == 0) NA else length_text
    )
  })

df
#> # A tibble: 12 x 2
#> title length
#> <chr> <chr>
#> 1 Introduction to Natural Language Processing with R II 01:15:00
#> 2 Introduction to Natural Language Processing with R 01:22:13
#> 3 Solving iteration problems with purrr II 01:22:49
#> 4 Solving iteration problems with purrr 01:32:23
#> 5 Markov-Switching GARCH Models in R: The MSGARCH Package 15:55
#> 6 Interactive bullwhip effect exploration using SCperf and Shiny 16:02
#> 7 Actuarial and statistical aspects of reinsurance in R 14:15
#> 8 Transformation Forests 16:19
#> 9 Room 2.02 Lightning Talks 50:35
#> 10 R and Haskell: Combining the best of two worlds 14:45
#> 11 *GNU R* on a Programmable Logic Controller (PLC) in an Embedded-Linux Environment <NA>
#> 12 Performance Benchmarking of the R Programming Environment on Knight's Landing 19:32

How do you scrape items together so you don't lose the index?

The problem you are facing is that not every child node is present in all of the parent nodes. The best way to handle this situation is to collect all parent nodes in a list/vector and then extract the desired information from each parent using the html_node function. html_node always returns exactly one result for every parent node, even if that result is NA.

library(rvest)

# Read the page just once.
base_url <- "https://www.uchealth.com/providers"
page <- read_html(base_url)

# Every child of the provider list is one parent node.
providers <- html_children(html_nodes(page, "ul[id=providerlist]"))

# Pull the requested information out of each parent; html_node() gives one
# result per parent, so the vectors stay aligned with `providers`.
dept <- html_text(html_node(providers, "[class ^= 'department']"))
location <- html_text(html_node(providers, "[class=locations]"))

The length of providers, dept and location should all be equal.

Rvest scraping child nodes but filling missing values with NA

If I use httr, I can pass in a valid User-Agent header. I also rewrote your code to build each row with a data.frame call instead of list, so that NA is returned where a value is not present.

Swap out html_elements for html_element.

You also need to amend your xpaths to avoid getting the first node value repeated for each row.

library(tidyverse)
library(httr)

# Send an explicit User-Agent header with the request.
headers <- c("User-Agent" = "Safari/537.36")

r <- httr::GET(
  url = "https://www.sec.gov/Archives/edgar/data/1002784/000139834421003391/fp0061633_13fhr-table.xml",
  httr::add_headers(.headers = headers)
)
# Stop with the HTTP error instead of trying to parse an error response.
httr::stop_for_status(r)

r %>%
  content() %>%
  # library(tidyverse) does not attach rvest, so its functions are called
  # with an explicit namespace here.
  # select enclosing nodes
  rvest::html_elements(xpath = "//*[local-name()='infoTable']") %>%
  # iterate over each parent node, pulling out desired parts and coerce to data.frame
  # not the complete list
  map_df(
    # html_element() (singular) yields one value per parent, NA when absent;
    # the leading "." keeps each xpath relative to the current parent.
    ~ data.frame(
      name_of_issuer = rvest::html_element(.x, xpath = ".//*[local-name()='nameOfIssuer']") %>%
        rvest::html_text(),
      title_of_class = rvest::html_element(.x, xpath = ".//*[local-name()='titleOfClass']") %>%
        rvest::html_text(),
      put_or_call = rvest::html_element(.x, xpath = ".//*[local-name()='putCall']") %>%
        rvest::html_text()
    )
  )

rvest: Return NAs for empty nodes given multiple listings

The strategy in this solution is to create a list of nodes for each listing node and then search each of those nodes for the desired information, child and view limited.

Using html_node instead of html_nodes always returns exactly one value per node (even if it is just NA); this ensures the vector lengths are the same.

Also, with rvest I prefer to use the CSS syntax instead of the xpath. In most cases the CSS is easier to use than the xpath expressions.

library(rvest)

page_html <- read_html(html)
# find the listing nodes and the id of each node
listings <- html_nodes(page_html, "div.listing")
listing <- html_attr(listings, name = "id")

# Search each listing for the child-ticket and limited-view criteria.
# html_node() works on the whole node set and returns one entry per listing
# (NA text when the span is absent), so no per-node sapply() is needed and
# the result is always a character vector.
child <- html_text(html_node(listings, "span.listing_sub1"))
viewLim <- html_text(html_node(listings, "span.listing_sub3"))

# create the data frame: TRUE where the corresponding span was found
df <- data.frame(listing, child = !is.na(child), viewLim = !is.na(viewLim))

# df
# listing child viewLim
#1 listing_1 FALSE TRUE
#2 listing_2 FALSE FALSE
#3 listing_3 TRUE TRUE

webscraping with rvest: Replace missing values of html_nodes with NA

I found the solution. I used toString to convert the XML nodeset to a string, extracted every <div class="publication-editor">, and checked whether each of them has a <span class="publication-editor-affiliation">; when it does not, the combination of lapply and str_extract yields an NA.

Here is the code, for the record.

    # For each sub-category (the nodes lying between two consecutive title
    # positions), extract the affiliation of every editor block; a block
    # without an affiliation span yields NA.
    affiliations <- lapply(seq_along(titlesnodesnum)[-1], function(n) {
      first <- titlesnodesnum[n - 1] + 1 # starting node in subcategory
      last <- titlesnodesnum[n] - 1 # ending node in subcategory
      # seq_len() keeps an empty sub-category empty, whereas first:last
      # would count downwards and pick nodes outside of it.
      idx <- first + seq_len(max(last - first + 1, 0)) - 1
      markup <- toString(editorsnodes[idx])
      # One string per editor block ...
      editors <- stringr::str_extract_all(markup, "(?<=<div class=\"publication-editor\")[\\S\\s]*?(?=<div class=\"clearfix\">)")
      # ... and, within each block, the affiliation text (NA if not present).
      lapply(editors, function(x) stringr::str_extract(x, "(?<=<span class=\"publication-editor-affiliation\" itemprop=\"affiliation\">).*?(?=</span>)"))
    })

Inserting NA in blank values from web scraping

rvest::html_text has a built-in trimming option: set trim=TRUE.
After you have done this you can use e.g. ifelse to test for an empty string (=="") or use nzchar.

In full, you could do this:

# Trim the text of every matching node, then turn empty strings into NA.
htmlpage %>%
  html_nodes(".meta-wrapper") %>%
  html_text(trim = TRUE) %>%
  ifelse(. == "", NA, .)

or this:

# Trimmed text of every matching node.
res <- html_text(html_nodes(htmlpage, ".meta-wrapper"), trim = TRUE)
# Mark the empty strings as missing.
is.na(res) <- !nzchar(res)

@Richard Scriven improvement:

# Same result in a single pipeline: replace() swaps the empty strings for NA.
htmlpage %>%
  html_nodes(".meta-wrapper") %>%
  html_text(trim = TRUE) %>%
  replace(!nzchar(.), NA)

rvest data scraping replacing missing html_node with NA

You should first extract .s-item__details using html_nodes and from every node extract .NEGATIVE or .s-item__hotness using html_node (without s).

library('rvest')

pages <- 1
# Zero-row template, so `output` is still a data frame if no page is read.
output <- data.frame(header = character(), price = character(), runtime = character())
# One slot per page; filled in the loop and row-bound once afterwards
# instead of growing `output` on every iteration.
results <- vector("list", pages)

# seq_len() skips the loop for pages = 0, where 1:pages would run twice.
for (page in seq_len(pages)) {
  link <- paste0("https://www.ebay.com/b/Cell-Phones-Smartphones/9355/bn_320094?LH_BIN=1&LH_ItemCondition=1000&rt=nc&_from=R40&_pgn=", page)

  webpage <- read_html(link)

  # read the name of the item
  header <- html_nodes(webpage, ".s-item__title")
  header_text <- html_text(header)

  # get the price
  prim_html <- html_nodes(webpage, ".s-item__price")
  text_prim <- html_text(prim_html)

  # get the amount sold, which is sometimes missing: take every details
  # node first, then ask each one for its hotness node, so that an item
  # without one yields NA instead of being dropped
  item <- html_nodes(webpage, ".s-item__details")
  runtime_html <- html_node(item, ".s-item__hotness")
  text_runtime <- html_text(runtime_html)
  text_runtime[is.na(text_runtime)] <- "0"

  # combine
  results[[page]] <- data.frame(header_text, text_prim, text_runtime)

  # prints 0 to show that this iteration went through
  print(0)
}

# rbind() drops zero-row data frames, so the template only determines the
# result when nothing was scraped.
output <- do.call(rbind, c(list(output), results))

output

output

# header_text text_prim text_runtime
# 1 Google Nexus 5X H791 32GB (FACTORY UNLOCKED) 5.2" HD - Mint Green LG $44.88 42 sold
# 2 Motorola Moto Z3 Play 32GB - Unlocked - Deep Indigo - Brand New - XT1929-4 $177.02 7 watching
# 3 LG V20 -Brand New - H915 - Unlocked - Ships Express Canada $162.16 5 watching
# 4 Samsung Galaxy J3 Unlocked 5" 16GB GSM 4G LTE Android Smartphone Black SM-J320W8 $77.89 0
# 5 New ListingSamsung Galaxy A30s SM-A307GN/DS Dual Sim (FACTORY UNLOCKED) 6.4" 64GB 4GB RAM $212.43 0
# ...
# ...
# 42 Black phone 2 - 32GB - Black (Unlocked) Smartphone (Rest of World Version) $318.65 5 watching
# 43 Sagem MC939 $199.00 0
# 44 Nokia 6220 classic $199.00 0
# 45 New ListingSmart Mini Wireless HD Dual WiFi Pocket Projector 2G RAM 16G ROM Android 7.1 $353.35 0
# 46 nokia 7260 $149.25 0
# 47 smartphone $250.00 0


Related Topics



Leave a reply



Submit