Extract hyperlink from Excel file in R
I found a super convoluted way to extract the hyperlinks:
library(XML)
# rename file to .zip
my.zip.file <- sub("xlsx", "zip", my.excel.file)
file.copy(from = my.excel.file, to = my.zip.file)
# unzip the file
unzip(my.zip.file)
# unzipping produces a bunch of files which we can read using the XML package
# assume sheet1 has our data
xml <- xmlParse("xl/worksheets/sheet1.xml")
# finally grab the hyperlinks
hyperlinks <- xpathApply(xml, "//x:hyperlink/@display", namespaces="x")
Derived from this blogpost.
Extract hyperlink from Excel cell that has text that is hyperlinked
Here is my solution. It took some digging, and I think perhaps I should make a pull request to the project to get this as a feature.
# Reprex to read xlsx hyperlinks
# See: https://stackoverflow.com/questions/24149821/extract-hyperlink-from-excel-file-in-r
library(xlsx)
# First, make a toy spreadsheet with hyperlinks
# https://cran.r-project.org/web/packages/xlsx/xlsx.pdf
wb <- createWorkbook()
sheet1 <- createSheet(wb, "Sheet1")
rows <- createRow(sheet1, 1)
cells <- createCell(rows, colIndex=1)
## Add hyperlinks to a cell
cell <- cells[[1,1]]
address <- "https://poi.apache.org/"
setCellValue(cell, "click me!")
addHyperlink(cell, address)
saveWorkbook(wb, "sample-hyperlink.xlsx")
#;; Clear the objects
rm(list=ls())
# Read in the workbook, extract the link
# Refs:
# https://rforge.net/rJava/
# Load the workbook, get the components
wb <-
loadWorkbook("sample-hyperlink.xlsx")
sheet <-
getSheets(wb)$Sheet1
rows <-
getRows(sheet)
# Get first column (that is the second argument)
cells <-
getCells(rows, 1)
# Note that getCellValue won't get the link
getCellValue(cells[[1]])
#> [1] "click me!"
# This is how you have to extract the hyperlink
library(rJava)
# What java class is this?
str(cells[[1]])
#> Formal class 'jobjRef' [package "rJava"] with 2 slots
#> ..@ jobj :<externalptr>
#> ..@ jclass: chr "org/apache/poi/ss/usermodel/Cell"
# Look up the class: https://poi.apache.org/apidocs/dev/org/apache/poi/ss/usermodel/Cell.html
# methods:
.jmethods(cells[[1]])
#> ...
#> [12] "public org.apache.poi.ss.usermodel.Hyperlink org.apache.poi.xssf.usermodel.XSSFCell.getHyperlink()"
#> [13] "public org.apache.poi.xssf.usermodel.XSSFHyperlink org.apache.poi.xssf.usermodel.XSSFCell.getHyperlink()"
#> ...
# This is the method we want
str(J(cells[[1]], "getHyperlink"))
#> Formal class 'jobjRef' [package "rJava"] with 2 slots
#> ..@ jobj :<externalptr>
#> ..@ jclass: chr "org/apache/poi/xssf/usermodel/XSSFHyperlink"
# Get methods on the objects returned:
.jmethods(J(cells[[1]], "getHyperlink"))
#> [1] "public java.lang.String org.apache.poi.xssf.usermodel.XSSFHyperlink.getAddress()"
#> ...
# This gets the link
J(J(cells[[1]], "getHyperlink"), "getAddress")
#> [1] "https://poi.apache.org/"
Created on 2021-11-17 by the reprex package (v2.0.1)
Extract internal hyperlink from xlsx workbook
Found my own answer:
Once the xmlfile is loaded as per above, get the links via
# grab the hyperlinks, and dump the XML components by convert them to character vector
linkReferences <- tibble(
RefCell = as.character( xpathApply(xmlfile, "//x:hyperlink/@ref", namespaces="x")),
LinkCell = as.character( xpathApply(xmlfile, "//x:hyperlink/@location", namespaces="x"))
)
linkReferences
# A tibble: 5 x 2
RefCell LinkCell
<chr> <chr>
1 A2 linksTab!B3
2 A3:A5 linksTab!B3
3 A3 linksTab!D3
4 A4 linksTab!F3
5 A5 linksTab!H3
Download several Excel files from url using R
The XML
package has some good tools to process web pages, in particular, for extracting the links.
library(XML)
url <- "https://digital.nhs.uk/data-and-information/publications/statistical/adult-psychiatric-morbidity-survey/adult-psychiatric-morbidity-survey-survey-of-mental-health-and-wellbeing-england-2014"
pageContent <- readLines(url)
Links <- getHTMLLinks(pageContent)
xlsFiles <- grep("\\.xls", Links)
Links[xlsFiles]
[1] "https://files.digital.nhs.uk/excel/9/s/apms-2014-ch-02-tabs.xls"
[2] "https://files.digital.nhs.uk/excel/9/b/apms-2014-ch-03-tabs.xls"
[3] "https://files.digital.nhs.uk/excel/a/i/apms-2014-ch-04-tabs.xls"
[4] "https://files.digital.nhs.uk/excel/a/t/apms-2014-ch-05-tabs.xls"
[5] "https://files.digital.nhs.uk/excel/b/m/apms-2014-ch-06-tabs.xls"
[6] "https://files.digital.nhs.uk/excel/b/p/apms-2014-ch-07-tabs.xls"
[7] "https://files.digital.nhs.uk/excel/b/l/apms-2014-ch-08-tabs.xls"
[8] "https://files.digital.nhs.uk/excel/c/1/apms-2014-ch-09-tabs.xls"
[9] "https://files.digital.nhs.uk/excel/s/0/apms-2014-ch-10-tabs.xls"
[10] "https://files.digital.nhs.uk/excel/c/p/apms-2014-ch-11-tabs.xls"
[11] "https://files.digital.nhs.uk/6F/FB2F1B/apms-2014-ch-12-tabs.xls"
[12] "https://files.digital.nhs.uk/excel/d/g/apms-2014-ch-13-tabs.xls"
[13] "https://files.digital.nhs.uk/excel/d/r/apms-2014-ch-14-tabs.xls"
how to scrape a hyperlink by R and keep the hyperlink clickable in the output file?
This is a straight forward problem using the rvest package. The principal is to read the page, extract the desired nodes using the CSS selectors and then extracting the requested information.
The tricky part here is to isolate the links associated only with the questions and none of the others. In this case I needed 3-4 levels CSS tags to complete separation.
See the comments in the code for the step by step instructions.
library(rvest)
url<-"https://stackoverflow.com/questions/tagged/r?tab=votes&page=1&pagesize=50"
#read the page
page<-read_html(url)
#get hyperlink nodes
#the 'a' tag under a 'h3' tag under 'div' tag of class 'summary' under a 'div' tag of class 'question-summary'
nodes<-html_nodes(page, "div.question-summary div.summary h3 a")
#Get text
question<-html_text(nodes)
#get link
link<-paste0("https://stackoverflow.com", html_attr(nodes, "href"))
answer<-data.frame(question, link)
head(answer)
question link
1 How to make a great R reproducible example /questions/5963269/how-to-make-a-great-r-reproducible-example
2 How to sort a dataframe by multiple column(s) /questions/1296646/how-to-sort-a-dataframe-by-multiple-columns
3 How to join (merge) data frames (inner, outer, left, right) /questions/1299871/how-to-join-merge-data-frames-inner-outer-left-right
4 Grouping functions (tapply, by, aggregate) and the *apply family /questions/3505701/grouping-functions-tapply-by-aggregate-and-the-apply-family
5 Drop data frame columns by name /questions/4605206/drop-data-frame-columns-by-name
6 Remove rows with all or some NAs (missing values) in data.frame /questions/4862178/remove-rows-with-all-or-some-nas-missing-values-in-data-frame
Identify all excel files from a webpage with Rvest
Following the detail in the comments it seems you want the Excel files from within "04. Personal y remuneraciones".
The folders housing the Excel files are public facing and so you can simply use the parent folder URI, extract the links for the child year folders, then from those extract the Excel file links.
library(magrittr)
library(rvest)
year_folder <- "https://transparencia.uv.cl/documentos/personal/remuneraciones/contrata/"
extract_links <- function(parent_link, css_selector_list) {
links <- read_html(parent_link) %>%
html_elements(css_selector_list) %>%
html_attr("href") %>%
paste0(parent_link, .)
return(links)
}
folders_by_year <- extract_links(year_folder, 'li [href$="/"]:not([href^="/"])')
excel_files <- lapply(folders_by_year, extract_links, 'li [href$=".xls"]')
Read Excel file from a URL using the readxl package
This works for me on Windows:
library(readxl)
library(httr)
packageVersion("readxl")
# [1] ‘0.1.1’
GET(url1, write_disk(tf <- tempfile(fileext = ".xls")))
df <- read_excel(tf, 2L)
str(df)
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 20131 obs. of 8 variables:
# $ Code : chr "C115388" "C115800" "C115801" "C115802" ...
# $ Codelist Code : chr NA "C115388" "C115388" "C115388" ...
# $ Codelist Extensible (Yes/No): chr "No" NA NA NA ...
# $ Codelist Name : chr "6 Minute Walk Functional Test Test Code" "6 Minute Walk Functional Test Test Code" "6 Minute Walk Functional Test Test Code" "6 Minute Walk Functional Test Test Code" ...
# $ CDISC Submission Value : chr "SIXMW1TC" "SIXMW101" "SIXMW102" "SIXMW103" ...
# $ CDISC Synonym(s) : chr "6 Minute Walk Functional Test Test Code" "SIXMW1-Distance at 1 Minute" "SIXMW1-Distance at 2 Minutes" "SIXMW1-Distance at 3 Minutes" ...
# $ CDISC Definition : chr "6 Minute Walk Test test code." "6 Minute Walk Test - Distance at 1 minute." "6 Minute Walk Test - Distance at 2 minutes." "6 Minute Walk Test - Distance at 3 minutes." ...
# $ NCI Preferred Term : chr "CDISC Functional Test 6MWT Test Code Terminology" "6MWT - Distance at 1 Minute" "6MWT - Distance at 2 Minutes" "6MWT - Distance at 3 Minutes" ...
Related Topics
Topic Models: Cross Validation with Loglikelihood or Perplexity
What Are Examples of When Seq_Along Works, But Seq Produces Unintended Results
How to Optimize for Integer Parameters (And Other Discontinuous Parameter Space) in R
How to Remove Leading "0." in a Numeric R Variable
Why Does Dplyr's Filter Drop Na Values from a Factor Variable
Dplyr::Select() with Some Variables That May Not Exist in the Data Frame
Adding a Simple Lm Trend Line to a Ggplot Boxplot
How to Read a Text File into Gnu R with a Multiple-Byte Separator
Date-Time Differences Between Rows in R
Regex; Eliminate All Punctuation Except
How to Pass Individual 'Curvature' Arguments in 'Ggplot2' 'Geom_Curve' Function
How to Color Entire Background in Ggplot2 When Using Coord_Fixed
Extract Certain Files from .Zip
How to Replace Certain Values in a Specific Rows and Columns with Na in R