Stumped on How to Scrape the Data from This Site (Using R)

Using RSelenium with PhantomJS

library(RSelenium)
library(XML)   # for htmlParse/readHTMLTable used below

pJS <- phantom()   # start a phantomjs instance
remDr <- remoteDriver(browserName = "phantomjs")
appURL <- "http://www.soccer24.com/kosovo/superliga/results/#"
remDr$open()
remDr$navigate(appURL)

If you want to press the "more data" button until it is no longer visible (i.e. all matches are presumed to be showing):

webElem <- remDr$findElement("css", "#tournament-page-results-more a")
while(webElem$isElementDisplayed()[[1]]){
  webElem$clickElement()
  Sys.sleep(5)
  webElem <- remDr$findElement("css", "#tournament-page-results-more a")
}
doc <- htmlParse(remDr$getPageSource()[[1]])

Remove the unwanted round data and use XML::readHTMLTable for simplicity:

# remove unwanted rounds html. Sometimes there are end-of-season extra games.
# These are presented in a separate table.
invisible(doc["//table/*/tr[@class='event_round']", fun = removeNodes])
appData <- readHTMLTable(doc, which = seq(length(doc["//table"])-1), stringsAsFactors = FALSE, trim = TRUE)
if(!is.data.frame(appData)){appData <- do.call(rbind, appData)}
row.names(appData) <- NULL
names(appData) <- c("blank", "Date", "hteam", "ateam", "score")
pJS$stop()
> head(appData)
blank Date hteam ateam score
1 01.04. 18:00 Ferronikeli Ferizaj 4 : 0
2 01.04. 18:00 Istogu Hajvalia 2 : 1
3 01.04. 18:00 Kosova Vushtrri Trepca Mitrovice 1 : 0
4 01.04. 18:00 Prishtina Drenica 3 : 0
5 31.03. 18:00 Besa Peje Drita 1 : 0
6 31.03. 18:00 Trepca 89 Vellaznimi 2 : 0

> tail(appData)
blank Date hteam ateam score
115 17.08. 22:00 Besa Peje Trepca 89 3 : 3
116 17.08. 22:00 Ferronikeli Hajvalia 2 : 5
117 17.08. 22:00 Trepca Mitrovice Ferizaj 1 : 0
118 17.08. 22:00 Vellaznimi Drenica 2 : 1
119 16.08. 22:00 Kosova Vushtrri Drita 0 : 1
120 16.08. 22:00 Prishtina Istogu 2 : 1

Carry out further formatting as needed.
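
For example, a minimal sketch of one such step, assuming the columns shown above: split the score into numeric home and away goals and drop the empty first column (the Date column has no year, so it is left alone here).

# split "4 : 0" style scores into numeric home/away goals
appData$hgoal <- as.numeric(sub("\\s*:.*$", "", appData$score))
appData$agoal <- as.numeric(sub("^.*:\\s*", "", appData$score))
appData$blank <- NULL   # drop the empty column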

Unable to scrape data from this site (Using R)

Below is a possible solution.

# Library used to scrape the information; version 1.7.7 (mandatory)
library(RSelenium)
driver <- rsDriver(browser=c("firefox"), port = 4567L)

# Define the client part.
remote_driver <- driver[["client"]]
remote_driver$navigate("https://www.rbcroyalbank.com/investments/gic-rates.html")

# click the relevant rates tab, then grab the rate table
remote_driver$findElement(using = "css selector", value = "#gic-nrg")$clickElement()
x <- remote_driver$findElement(using = "css selector", value = "#guaranteed-return-1 > div:nth-child(1) > table:nth-child(1)")
df <- read.table(text = gsub(' ', '\n', x$getElementText()[[1]]), header = TRUE)
df[c(-1:-46), ]   # drop the first 46 rows
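
When you're finished, it's good practice to close the browser and stop the Selenium server so the port is released again (standard RSelenium calls, not specific to this site):

# tidy up the session
remote_driver$close()
driver$server$stop()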

How to scrape data from this specific webpage and save the output in a data frame?

A slower approach than the other answer. I've added the argument trim = TRUE to html_text() to remove extra whitespace.

One issue with MealPlan is that there are a few entries with class .noprice. One way to exclude them is to use XPath in html_nodes instead of CSS selectors (a hedged sketch of that follows the code below); I don't know of a way to do it with CSS selectors alone. What I did below was extract both sets and then take the set difference.

For the price, I've used a regular expression to get rid of the extra space inside the number.

library(rvest)
library(dplyr)
library(stringr)

url <- "https://www.hotelissima.fr/s/h/ile-maurice/mahebourg/astroea-beach.html?searchType=accomodation&searchId=4&guideId=&filters=&withFlights=false&airportCode=PAR&airport=Paris&search=astroea+beach&startdate=08%2F11%2F2021&stopdate=15%2F11%2F2021&duration=7&travelers=En+couple&travelType=&rooms%5B0%5D.nbAdults=2&rooms%5B0%5D.nbChilds=0&rooms%5B0%5D.birthdates%5B0%5D=&rooms%5B0%5D.birthdates%5B1%5D=&rooms%5B0%5D.birthdates%5B2%5D=&rooms%5B0%5D.birthdates%5B3%5D=&rooms%5B0%5D.birthdates%5B4%5D="

Price <- read_html(url) %>%
html_nodes(".price") %>%
html_text(trim = TRUE) %>%
str_replace("(\\d)\\s(\\d)", "\\1\\2")

RoomType <- read_html(url) %>%
html_nodes(".room h3") %>%
html_text(trim = TRUE)

AllMealPlans <- read_html(url) %>%
html_nodes(".meal-plan-text") %>%
html_text(trim = TRUE)

MealPlansNoPrice <- read_html(url) %>%
html_nodes(".noprice .meal-plan-text") %>%
html_text(trim = TRUE)

MealPlan <- setdiff(AllMealPlans, MealPlansNoPrice)

NumberMealPlans <- length(MealPlan)
NumberRoomTypes <- length(RoomType)

MealPlanColumn <- MealPlan %>% rep(times=NumberRoomTypes)

RoomTypeColumn <- RoomType %>%
rep(each = NumberMealPlans)

bind_cols(RoomType = RoomTypeColumn, MealPlan = MealPlanColumn, Price = Price)
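
For reference, here is the XPath route mentioned above as an untested sketch. It assumes the .noprice class sits on an ancestor of each excluded .meal-plan-text node, which I haven't verified against the page:

# untested sketch: select meal-plan nodes that have no .noprice ancestor
xp <- paste0(
  "//*[contains(concat(' ', normalize-space(@class), ' '), ' meal-plan-text ')]",
  "[not(ancestor::*[contains(concat(' ', normalize-space(@class), ' '), ' noprice ')])]"
)

MealPlanXpath <- read_html(url) %>%
  html_nodes(xpath = xp) %>%
  html_text(trim = TRUE)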

Scraping data off a site using 4 URLs for one day using R

You can turn all the tables into a wide data frame with list operations:

library(rvest)
library(magrittr)
library(dplyr)

date <- 20130701
rng  <- 1:4

my_tabs <- lapply(rng, function(i) {
  url <- sprintf("http://apims.doe.gov.my/apims/hourly%d.php?date=%s", i, date)
  pg  <- read_html(url)
  pg %>% html_nodes("table") %>% extract2(1) %>% html_table(header = TRUE)
})

glimpse(plyr::join_all(my_tabs, by=colnames(my_tabs[[1]][1:2])))

## Observations: 52
## Variables:
## $ NEGERI / STATE (chr) "Johor", "Johor", "Johor", "Johor", "Kedah...
## $ KAWASAN/AREA (chr) "Kota Tinggi", "Larkin Lama", "Muar", "Pas...
## $ MASA/TIME12:00AM (chr) "63*", "53*", "51*", "55*", "37*", "48*", ...
## $ MASA/TIME01:00AM (chr) "62*", "52*", "52*", "55*", "36*", "48*", ...
## $ MASA/TIME02:00AM (chr) "61*", "51*", "53*", "55*", "35*", "48*", ...
## $ MASA/TIME03:00AM (chr) "60*", "50*", "54*", "55*", "35*", "48*", ...
## $ MASA/TIME04:00AM (chr) "59*", "49*", "54*", "54*", "34*", "47*", ...
## $ MASA/TIME05:00AM (chr) "58*", "48*", "54*", "54*", "34*", "45*", ...
## $ MASA/TIME06:00AM (chr) "57*", "47*", "53*", "53*", "33*", "45*", ...
## $ MASA/TIME07:00AM (chr) "57*", "46*", "52*", "53*", "32*", "45*", ...
## $ MASA/TIME08:00AM (chr) "56*", "45*", "52*", "52*", "32*", "44*", ...
## ...

I rarely actually load/use plyr anymore due to naming collisions with dplyr, but join_all is perfect for this situation.
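
If you prefer to avoid plyr entirely, the same wide join can be sketched with Reduce() and dplyr::full_join(), assuming all four tables share their first two columns as keys:

keys <- colnames(my_tabs[[1]])[1:2]
wide <- Reduce(function(x, y) full_join(x, y, by = keys), my_tabs)
glimpse(wide)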

It's also likely you'll need this data in long format:

plyr::join_all(my_tabs, by=colnames(my_tabs[[1]][1:2])) %>% 
tidyr::gather(masa, nilai, -1, -2) %>%
# better column names
rename(nigeri=`NEGERI / STATE`, kawasan=`KAWASAN/AREA`) %>%
# cleanup & convert time (using local timezone)
# make readings numeric; NA will sub for #
mutate(masa=gsub("MASA/TIME", "", masa),
masa=as.POSIXct(sprintf("%s %s", date, masa), format="%Y%m%d %H:%M%p", tz="Asia/Kuala_Lumpur"),
nilai=as.numeric(gsub("[[:punct:]]+", "", nilai))) -> pollut

head(pollut)
## nigeri kawasan masa nilai
## 1 Johor Kota Tinggi 2013-07-01 12:00:00 63
## 2 Johor Larkin Lama 2013-07-01 12:00:00 53
## 3 Johor Muar 2013-07-01 12:00:00 51
## 4 Johor Pasir Gudang 2013-07-01 12:00:00 55
## 5 Kedah Alor Setar 2013-07-01 12:00:00 37
## 6 Kedah Bakar Arang, Sg. Petani 2013-07-01 12:00:00 48

Using R to scrape data from a table populated possibly with javascript

The solution to this was the following.

  1. Using the source code, identify the source html for the table
  2. Navigate to the source page, and use Chrome developer tools > Network > XHR
  3. Refresh the page to find the source of the data
  4. Scrape from that source (a rough sketch follows below)

Thanks to @XR SC for providing the basic approach in his answer here: web scraping using Chrome Dev Tools.
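
As a rough illustration of step 4: once the Network > XHR tab reveals the request that actually returns the table data, you can usually hit that endpoint directly. The URL below is purely hypothetical; substitute whatever you find in the XHR list:

library(jsonlite)

# hypothetical endpoint copied from the XHR request that populates the table
tbl_data <- fromJSON("https://example.com/api/table-data?date=2017-01-01")
str(tbl_data)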

web scraping of tables generated using JavaScript

I saw no robots.txt nor a T&C but I did read through the (quite daunting) "APPLICATION TO USE RESTRICTED MICRODATA" (I forgot I had an account that can access IPUMS though I don't recall ever using it). I'm impressed at their desire to register the importance of the potentially sensitive nature of their data up front before download.

Since this metadata has no "microdata" in it (it appears the metadata is provided to help folks decide what data elements they can select) and since acquisition & use of it doesn't violate any of the stated restrictions, the following should be OK. If a rep of IPUMS sees this and disagrees, I'll gladly remove the answer and ask the SO admins to really delete it, too (for those who aren't aware, folks w/high enough rep can see deleted answers).

Now, you don't need Selenium or Splash for this but you'll need to do some post-processing of the data retrieved by the below code.

The data that builds the metadata tables is in a javascript blob in a <script> tag (use "View Source" to see it; you're going to need it later). We can use some string munging and the V8 package to get it:

library(V8)
library(rvest)
library(jsonlite)
library(stringi)

pg <- read_html("https://international.ipums.org/international-action/variables/MIGYRSBR#codes_section")

html_nodes(pg, xpath=".//script[contains(., 'Less than')]") %>%
html_text() %>%
stri_split_lines() %>%
.[[1]] -> js_lines

idx <- which(stri_detect_fixed(js_lines, '$(document).ready(function() {')) - 1

That finds the target <script> element, gets its contents, converts them to lines, and locates the line just before the jQuery $(document).ready() code begins. We can only pull out the javascript that defines the data, since the V8 engine in R isn't a full browser and can't execute the jQuery code that follows it.

We now create a "V8 context", evaluate the extracted data-definition code in that context, and retrieve the result back into R:

ctx <- v8()

ctx$eval(paste0(js_lines[1:idx], collapse="\n"))

code_data <- ctx$get("codeData")

str(code_data)
## List of 14
## $ jsonPath : chr "/international-action/frequencies/MIGYRSBR"
## $ samples :'data.frame': 6 obs. of 2 variables:
## ..$ name: chr [1:6] "br1960a" "br1970a" "br1980a" "br1991a" ...
## ..$ id : int [1:6] 2416 2417 2418 2419 2420 2651
## $ categories :'data.frame': 100 obs. of 5 variables:
## ..$ id : int [1:100] 4725113 4725114 4725115 4725116 4725117 4725118 4725119 4725120 4725121 4725122 ...
## ..$ label : chr [1:100] "Less than 1 year" "1" "2" "3" ...
## ..$ indent : int [1:100] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ code : chr [1:100] "00" "01" "02" "03" ...
## ..$ general: logi [1:100] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ longSamplesHeader : chr "<tr class=\"fullHeader grayHeader\">\n\n <th class=\"codesColumn\">Code</th>\n <th class=\"la"| __truncated__
## $ samplesHeader : chr "\n<tr class=\"fullHeader grayHeader\">\n <th class=\"codesColumn\">Code</th>\n <th class=\"labelColum"| __truncated__
## $ showCounts : logi FALSE
## $ generalWidth : int 2
## $ width : int 2
## $ interval : int 25
## $ isGeneral : logi FALSE
## $ frequencyType : NULL
## $ project_uses_survey_groups: logi FALSE
## $ variables_show_tab_1 : chr ""
## $ header_type : chr "short"

The jsonPath component suggests it uses more data in the building of the codes & frequencies tables, so we can get it, too:

code_json <- fromJSON(sprintf("https://international.ipums.org%s", code_data$jsonPath))

str(code_json, 1)
## List of 6
## $ 2416:List of 100
## $ 2417:List of 100
## $ 2418:List of 100
## $ 2419:List of 100
## $ 2420:List of 100
## $ 2651:List of 100

Those "Lists of 100" are 100 numbers each.

You'll need to look at the code in the "View Source" (as suggested above) to see how you might be able to use those two bits of data to re-create the metadata table.
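
Purely as a hedged sketch of what that re-creation might look like, assuming (unverified) that each sample's 100 values in code_json line up positionally with the 100 rows of code_data$categories:

# untested sketch: attach each sample's counts to the category table
freqs <- code_data$categories
for (i in seq_len(nrow(code_data$samples))) {
  sample_name <- code_data$samples$name[i]
  sample_id   <- as.character(code_data$samples$id[i])
  freqs[[sample_name]] <- unlist(code_json[[sample_id]])
}
head(freqs)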

I do think you'd be better off following the path @alistaire started you on, but following it fully. I saw no questions about obtaining "codes and frequencies" or "metadata" (such as this) in the forum (http://answers.popdata.org/), and I read in at least 5 places that the IPUMS staff read and answer questions in the forums and also at their info e-mail address: ipums@umn.edu.

They obviously have this metadata somewhere electronically and could likely give you a complete dump of it across all data products to avoid further scraping (which I'm guessing is your goal, since I can't imagine a scenario where one would want to go through this trouble for a single extract).


