Extract Hyperlink from Excel File in R

Extract hyperlink from Excel file in R

I found a super convoluted way to extract the hyperlinks:

library(XML)

# rename file to .zip
my.zip.file <- sub("xlsx", "zip", my.excel.file)
file.copy(from = my.excel.file, to = my.zip.file)

# unzip the file
unzip(my.zip.file)

# unzipping produces a bunch of files which we can read using the XML package
# assume sheet1 has our data
xml <- xmlParse("xl/worksheets/sheet1.xml")

# finally grab the hyperlinks
hyperlinks <- xpathApply(xml, "//x:hyperlink/@display", namespaces="x")

Derived from this blogpost.

Extract hyperlink from Excel cell that has text that is hyperlinked

Here is my solution. It took some digging, and I think perhaps I should make a pull request to the project to get this as a feature.

# Reprex to read xlsx hyperlinks
# See: https://stackoverflow.com/questions/24149821/extract-hyperlink-from-excel-file-in-r
library(xlsx)

# First, make a toy spreadsheet with hyperlinks
# https://cran.r-project.org/web/packages/xlsx/xlsx.pdf

wb <- createWorkbook()
sheet1 <- createSheet(wb, "Sheet1")
rows <- createRow(sheet1, 1)
cells <- createCell(rows, colIndex=1)

## Add hyperlinks to a cell
cell <- cells[[1,1]]
address <- "https://poi.apache.org/"
setCellValue(cell, "click me!")
addHyperlink(cell, address)

saveWorkbook(wb, "sample-hyperlink.xlsx")

#;; Clear the objects
rm(list=ls())

# Read in the workbook, extract the link
# Refs:
# https://rforge.net/rJava/

# Load the workbook, get the components
wb <-
loadWorkbook("sample-hyperlink.xlsx")
sheet <-
getSheets(wb)$Sheet1
rows <-
getRows(sheet)
# Get first column (that is the second argument)
cells <-
getCells(rows, 1)

# Note that getCellValue won't get the link
getCellValue(cells[[1]])
#> [1] "click me!"

# This is how you have to extract the hyperlink
library(rJava)

# What java class is this?
str(cells[[1]])
#> Formal class 'jobjRef' [package "rJava"] with 2 slots
#> ..@ jobj :<externalptr>
#> ..@ jclass: chr "org/apache/poi/ss/usermodel/Cell"

# Look up the class: https://poi.apache.org/apidocs/dev/org/apache/poi/ss/usermodel/Cell.html

# methods:
.jmethods(cells[[1]])

#> ...
#> [12] "public org.apache.poi.ss.usermodel.Hyperlink org.apache.poi.xssf.usermodel.XSSFCell.getHyperlink()"
#> [13] "public org.apache.poi.xssf.usermodel.XSSFHyperlink org.apache.poi.xssf.usermodel.XSSFCell.getHyperlink()"
#> ...

# This is the method we want
str(J(cells[[1]], "getHyperlink"))
#> Formal class 'jobjRef' [package "rJava"] with 2 slots
#> ..@ jobj :<externalptr>
#> ..@ jclass: chr "org/apache/poi/xssf/usermodel/XSSFHyperlink"

# Get methods on the objects returned:
.jmethods(J(cells[[1]], "getHyperlink"))
#> [1] "public java.lang.String org.apache.poi.xssf.usermodel.XSSFHyperlink.getAddress()"
#> ...

# This gets the link
J(J(cells[[1]], "getHyperlink"), "getAddress")
#> [1] "https://poi.apache.org/"

Created on 2021-11-17 by the reprex package (v2.0.1)

Extract internal hyperlink from xlsx workbook

Found my own answer:
Once the xmlfile is loaded as per above, get the links via

# grab the hyperlinks, and dump the XML components by convert them to character vector
linkReferences <- tibble(
RefCell = as.character( xpathApply(xmlfile, "//x:hyperlink/@ref", namespaces="x")),
LinkCell = as.character( xpathApply(xmlfile, "//x:hyperlink/@location", namespaces="x"))
)

linkReferences
# A tibble: 5 x 2
RefCell LinkCell
<chr> <chr>
1 A2 linksTab!B3
2 A3:A5 linksTab!B3
3 A3 linksTab!D3
4 A4 linksTab!F3
5 A5 linksTab!H3

Download several Excel files from url using R

The XML package has some good tools to process web pages, in particular, for extracting the links.

library(XML)

url <- "https://digital.nhs.uk/data-and-information/publications/statistical/adult-psychiatric-morbidity-survey/adult-psychiatric-morbidity-survey-survey-of-mental-health-and-wellbeing-england-2014"
pageContent <- readLines(url)
Links <- getHTMLLinks(pageContent)
xlsFiles <- grep("\\.xls", Links)
Links[xlsFiles]

[1] "https://files.digital.nhs.uk/excel/9/s/apms-2014-ch-02-tabs.xls"
[2] "https://files.digital.nhs.uk/excel/9/b/apms-2014-ch-03-tabs.xls"
[3] "https://files.digital.nhs.uk/excel/a/i/apms-2014-ch-04-tabs.xls"
[4] "https://files.digital.nhs.uk/excel/a/t/apms-2014-ch-05-tabs.xls"
[5] "https://files.digital.nhs.uk/excel/b/m/apms-2014-ch-06-tabs.xls"
[6] "https://files.digital.nhs.uk/excel/b/p/apms-2014-ch-07-tabs.xls"
[7] "https://files.digital.nhs.uk/excel/b/l/apms-2014-ch-08-tabs.xls"
[8] "https://files.digital.nhs.uk/excel/c/1/apms-2014-ch-09-tabs.xls"
[9] "https://files.digital.nhs.uk/excel/s/0/apms-2014-ch-10-tabs.xls"
[10] "https://files.digital.nhs.uk/excel/c/p/apms-2014-ch-11-tabs.xls"
[11] "https://files.digital.nhs.uk/6F/FB2F1B/apms-2014-ch-12-tabs.xls"
[12] "https://files.digital.nhs.uk/excel/d/g/apms-2014-ch-13-tabs.xls"
[13] "https://files.digital.nhs.uk/excel/d/r/apms-2014-ch-14-tabs.xls"

how to scrape a hyperlink by R and keep the hyperlink clickable in the output file?

This is a straight forward problem using the rvest package. The principal is to read the page, extract the desired nodes using the CSS selectors and then extracting the requested information.

The tricky part here is to isolate the links associated only with the questions and none of the others. In this case I needed 3-4 levels CSS tags to complete separation.

See the comments in the code for the step by step instructions.

library(rvest)

url<-"https://stackoverflow.com/questions/tagged/r?tab=votes&page=1&pagesize=50"

#read the page
page<-read_html(url)

#get hyperlink nodes
#the 'a' tag under a 'h3' tag under 'div' tag of class 'summary' under a 'div' tag of class 'question-summary'
nodes<-html_nodes(page, "div.question-summary div.summary h3 a")

#Get text
question<-html_text(nodes)
#get link
link<-paste0("https://stackoverflow.com", html_attr(nodes, "href"))

answer<-data.frame(question, link)
head(answer)
question link
1 How to make a great R reproducible example /questions/5963269/how-to-make-a-great-r-reproducible-example
2 How to sort a dataframe by multiple column(s) /questions/1296646/how-to-sort-a-dataframe-by-multiple-columns
3 How to join (merge) data frames (inner, outer, left, right) /questions/1299871/how-to-join-merge-data-frames-inner-outer-left-right
4 Grouping functions (tapply, by, aggregate) and the *apply family /questions/3505701/grouping-functions-tapply-by-aggregate-and-the-apply-family
5 Drop data frame columns by name /questions/4605206/drop-data-frame-columns-by-name
6 Remove rows with all or some NAs (missing values) in data.frame /questions/4862178/remove-rows-with-all-or-some-nas-missing-values-in-data-frame

Identify all excel files from a webpage with Rvest

Following the detail in the comments it seems you want the Excel files from within "04. Personal y remuneraciones".

The folders housing the Excel files are public facing and so you can simply use the parent folder URI, extract the links for the child year folders, then from those extract the Excel file links.

library(magrittr)
library(rvest)

year_folder <- "https://transparencia.uv.cl/documentos/personal/remuneraciones/contrata/"

extract_links <- function(parent_link, css_selector_list) {
links <- read_html(parent_link) %>%
html_elements(css_selector_list) %>%
html_attr("href") %>%
paste0(parent_link, .)
return(links)
}

folders_by_year <- extract_links(year_folder, 'li [href$="/"]:not([href^="/"])')

excel_files <- lapply(folders_by_year, extract_links, 'li [href$=".xls"]')

Read Excel file from a URL using the readxl package

This works for me on Windows:

library(readxl)
library(httr)
packageVersion("readxl")
# [1] ‘0.1.1’

GET(url1, write_disk(tf <- tempfile(fileext = ".xls")))
df <- read_excel(tf, 2L)
str(df)
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 20131 obs. of 8 variables:
# $ Code : chr "C115388" "C115800" "C115801" "C115802" ...
# $ Codelist Code : chr NA "C115388" "C115388" "C115388" ...
# $ Codelist Extensible (Yes/No): chr "No" NA NA NA ...
# $ Codelist Name : chr "6 Minute Walk Functional Test Test Code" "6 Minute Walk Functional Test Test Code" "6 Minute Walk Functional Test Test Code" "6 Minute Walk Functional Test Test Code" ...
# $ CDISC Submission Value : chr "SIXMW1TC" "SIXMW101" "SIXMW102" "SIXMW103" ...
# $ CDISC Synonym(s) : chr "6 Minute Walk Functional Test Test Code" "SIXMW1-Distance at 1 Minute" "SIXMW1-Distance at 2 Minutes" "SIXMW1-Distance at 3 Minutes" ...
# $ CDISC Definition : chr "6 Minute Walk Test test code." "6 Minute Walk Test - Distance at 1 minute." "6 Minute Walk Test - Distance at 2 minutes." "6 Minute Walk Test - Distance at 3 minutes." ...
# $ NCI Preferred Term : chr "CDISC Functional Test 6MWT Test Code Terminology" "6MWT - Distance at 1 Minute" "6MWT - Distance at 2 Minutes" "6MWT - Distance at 3 Minutes" ...


Related Topics



Leave a reply



Submit