Convert Xml_Nodeset to Data.Frame

Convert xml_nodeset to data.frame

This will get you all the attributes from the links into a tbl_df. bind_rows gets you "fill" for free:

library(rvest)
library(dplyr)

# Fetch the page and select every <a> element.
pg <- read_html("https://en.wikipedia.org/wiki/Main_Page")
links <- html_nodes(pg, "a")

# html_attrs() is rvest's own accessor for all attributes of each node, so the
# snippet only relies on the packages it attaches (rvest and dplyr) rather than
# on xml2's xml_attrs() being on the search path.
# Each link's attributes become a one-row data frame; bind_rows() stacks them
# and fills attributes a link does not have with NA.
bind_rows(
  lapply(
    html_attrs(links),
    function(x) data.frame(as.list(x), stringsAsFactors = FALSE)
  )
)

## Source: local data frame [310 x 10]
##
## id href title class dir accesskey rel lang hreflang style
## (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
## 1 top NA NA NA NA NA NA NA NA NA
## 2 NA #mw-head NA NA NA NA NA NA NA NA
## 3 NA #p-search NA NA NA NA NA NA NA NA
## 4 NA /wiki/Wikipedia Wikipedia NA NA NA NA NA NA NA
## 5 NA /wiki/Free_content Free content NA NA NA NA NA NA NA
## 6 NA /wiki/Encyclopedia Encyclopedia NA NA NA NA NA NA NA
## 7 NA /wiki/Wikipedia:Introduction Wikipedia:Introduction NA NA NA NA NA NA NA
## 8 NA /wiki/Special:Statistics Special:Statistics NA NA NA NA NA NA NA
## 9 NA /wiki/English_language English language NA NA NA NA NA NA NA
## 10 NA /wiki/Portal:Arts Portal:Arts NA NA NA NA NA NA NA
## .. ... ... ... ... ... ... ... ... ... ...

Alternately, you could use purrr:

library(rvest)
library(purrr)

# Fetch the page, select every <a> element and collect its attributes.
pg <- read_html("https://en.wikipedia.org/wiki/Main_Page")

# html_attrs() (rvest) returns one named character vector per link, so no
# separate map() over xml2's xml_attrs() is needed. map_df() then turns each
# vector into a row and row-binds them, filling missing attributes with NA.
html_nodes(pg, "a") %>%
  html_attrs() %>%
  map_df(~as.list(.))

## # A tibble: 342 × 10
## id href title class dir accesskey rel hreflang lang style
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 top <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 <NA> #mw-head <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 <NA> #p-search <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 <NA> /wiki/Wikipedia Wikipedia <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 <NA> /wiki/Free_content Free content <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 <NA> /wiki/Encyclopedia Encyclopedia <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 7 <NA> /wiki/Wikipedia:Introduction Wikipedia:Introduction <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 8 <NA> /wiki/Special:Statistics Special:Statistics <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 9 <NA> /wiki/English_language English language <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 10 <NA> /wiki/Portal:Arts Portal:Arts <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## # ... with 332 more rows

which I think is more functionally idiomatic and an overall cleaner approach.

XML_Nodeset to Dataframe

Here is a different approach, using the JSON data provided on the site:

library(xml2)
library(jsonlite)
library(magrittr)

# Read the page, pick the <div> whose id is 'player_stats_json' (restricted by
# the XPath to position 2), take its text, parse that text as JSON, and
# transpose the resulting data frame so the fields are listed as rows.
# NOTE(review): `url` is not defined in this snippet; presumably it holds the
# address of the page being scraped — confirm.
url %>%
xml2::read_html() %>%
xml2::xml_find_all("//div[@id='player_stats_json' and position()=2]") %>%
xml2::xml_text() %>%
jsonlite::fromJSON() %>%
as.data.frame() %>% t()

# [,1]
# test 0
# ppace 90
# pshooting 88
# ppassing 92
# pdribbling 95
# pdefending 91
# pphysical 88
# acceleration 93
# sprintspeed 88
# agility 86
# balance 98
# reactions 93
# ballcontrol 97
# dribbling 95
# positioning 90
# finishing 84
# shotpower 94
# longshotsaccuracy 92
# volleys 76
# penalties 84
# interceptions 93
# headingaccuracy 70
# marking 92
# standingtackle 96
# slidingtackle 85
# vision 96
# crossing 82
# freekickaccuracy 67
# shortpassing 99
# longpassing 94
# curve 85
# jumping 84
# stamina 94
# strength 84
# aggression 93
# composure 98

Coerce elements in xml/html document to data.frame

With xml that is already in a simple structure such as this, it can easily be converted using xmlToDataFrame() from XML package as follows:

library(XML)

# Sample document: a <grp> root with two <elementx> records, each holding the
# same three sub-elements (sub_a, sub_b, sub_c).
xmlstr <-
"<grp>
<elementx>
<sub_a>a</sub_a>
<sub_b>b</sub_b>
<sub_c>c</sub_c>
</elementx>
<elementx>
<sub_a>1</sub_a>
<sub_b>2</sub_b>
<sub_c>3</sub_c>
</elementx>
</grp>"

# Convert the XML string to a data frame and print it; as the output below
# shows, each <elementx> becomes a row and each sub-element a column.
df <- xmlToDataFrame(xmlstr)
df
# sub_a sub_b sub_c
#1 a b c
#2 1 2 3

If the xml is in a file, rather than a string, then you can use this:

# xmlToDataFrame() accepts the name of an XML file directly, so there is no
# need to read the file into a character vector with readLines() first.
df <- xmlToDataFrame("test.xml")

If you want to get your xml directly from an http address, (as in the comment to your question) then this works

# Download the feed as lines of text, then convert that text to a data frame.
doc <- readLines("https://www.gov.uk/government/announcements.atom?announcement_filter_option=statements&topics%5B%5D=transport")
df2 <- xmlToDataFrame(doc)

If you are dealing with very large xml files that are slow to convert this way, this blog post describes a faster function hosted on GitHub that you could try: devtools::install_github("muschellij2/processVISION")

Parsing XML files in R:Extract dataframe from XML nodeset

Using sapply with xmlAttrs solved my problem of extracting the values under the node "Segment". This worked fine for me:

# Select every <Segment> node whose score attribute equals '30'.
# NOTE(review): `input` is not defined in this snippet; presumably it is the
# parsed XML document — confirm.
nodes <- getNodeSet(input,"//Segment[@score='30']")
# Collect the attributes of each matched node. sapply() simplifies its result
# when it can, so the shape of all_parameters depends on what the nodes hold.
all_parameters <- sapply(nodes, xmlAttrs)

How to convert html_node to list in R

The function I was searching for was html_children().

The script now reads like this:

# Select the ".project-row" node, split it into its child nodes, extract the
# text of each child, and tidy up the whitespace in every text element.
# NOTE(review): `html_out` is not defined in this snippet; presumably it is
# the parsed HTML page — confirm.
temp_list <- html_out %>%
html_node(".project-row") %>%
html_children %>%
html_text() %>%
gsub("\r\n", "", .) %>% # remove \r\n
# The pattern matches leading spaces, any space that follows another space,
# and trailing spaces; perl = TRUE is needed for the (?<= ) lookbehind.
gsub("^ *|(?<= ) | *$", "", ., perl = TRUE) # removing multiple spaces

Applying html_children converts temp_list from an object of class xml_node to an xml_nodeset. An xml_nodeset can be used similarly to a list, for example by addressing single elements via temp_list[j]. Applying html_text to the xml_nodeset then retains the list-like structure and does not concatenate the elements together.

R: convert XML data to data frame

It may not be as verbose as the XML package but xml2 doesn't have the memory leaks and is laser-focused on data extraction. I use trimws which is a really recent addition to R core.

library(xml2)

# read and parse the XML document
pg <- read_xml("http://www.ggobi.org/book/data/olive.xml")

# get all the <record>s
recs <- xml_find_all(pg, "//record")

# extract and clean all the columns: each record's text is one
# whitespace-separated string of values
vals <- trimws(xml_text(recs))

# extract and clean (if needed) the area names
labs <- trimws(xml_attr(recs, "label"))

# mine the column names from the two variable descriptions
# this XPath construct lets us grab either the <categoricalvariable> or
# <realvariable> tags and then grabs the 'name' attribute of them
cols <- xml_attr(xml_find_all(pg, "//data/variables/*[self::categoricalvariable or
self::realvariable]"), "name")

# this converts each set of <record> columns to a data frame
# after first converting each row to numeric and assigning
# names to each column (making it easier to do the matrix to data frame conv)
# the split pattern is the POSIX whitespace class rather than "\ +", because
# "\ " is not one of R's documented string escapes
dat <- do.call(rbind, lapply(strsplit(vals, "[[:space:]]+"),
                             function(x) {
                               data.frame(rbind(setNames(as.numeric(x), cols)))
                             }))

# then assign the area name column to the data frame
dat$area_name <- labs

head(dat)
## region area palmitic palmitoleic stearic oleic linoleic linolenic
## 1 1 1 1075 75 226 7823 672 NA
## 2 1 1 1088 73 224 7709 781 31
## 3 1 1 911 54 246 8113 549 31
## 4 1 1 966 57 240 7952 619 50
## 5 1 1 1051 67 259 7771 672 50
## 6 1 1 911 49 268 7924 678 51
## arachidic eicosenoic area_name
## 1 60 29 North-Apulia
## 2 61 29 North-Apulia
## 3 63 29 North-Apulia
## 4 78 35 North-Apulia
## 5 80 46 North-Apulia
## 6 70 44 North-Apulia

UPDATE

I'd probably do the last bit this way now:

library(tidyverse)

# Split each record on runs of whitespace and build one row per record.
# The values are converted with as.numeric(), as in the base R version, so the
# columns are numeric rather than character. as_tibble() replaces the
# deprecated as_data_frame().
strsplit(vals, "[[:space:]]+") %>%
  map_df(~as_tibble(as.list(setNames(as.numeric(.), cols)))) %>%
  mutate(area_name = labs)


Related Topics



Leave a reply



Submit