Convert Xml_Nodeset to Data.Frame

Convert xml_nodeset to data.frame

This will get you all the attributes from the links into a tbl_df. bind_rows gets you "fill" for free:

library(rvest)
library(dplyr)

# Fetch the page and collect every <a> node on it.
pg <- read_html("https://en.wikipedia.org/wiki/Main_Page")
links <- html_nodes(pg, "a")

# html_attrs() is rvest's own accessor, so this works even when xml2 is not
# attached. Each node's attributes become a one-row data frame; bind_rows()
# then fills attributes a node lacks with NA.
attr_rows <- lapply(html_attrs(links), function(x) {
  data.frame(as.list(x), stringsAsFactors = FALSE)
})
bind_rows(attr_rows)

## Source: local data frame [310 x 10]
## 
##       id                         href                  title class   dir accesskey   rel  lang hreflang style
##    (chr)                        (chr)                  (chr) (chr) (chr)     (chr) (chr) (chr)    (chr) (chr)
## 1    top                           NA                     NA    NA    NA        NA    NA    NA       NA    NA
## 2     NA                     #mw-head                     NA    NA    NA        NA    NA    NA       NA    NA
## 3     NA                    #p-search                     NA    NA    NA        NA    NA    NA       NA    NA
## 4     NA              /wiki/Wikipedia              Wikipedia    NA    NA        NA    NA    NA       NA    NA
## 5     NA           /wiki/Free_content           Free content    NA    NA        NA    NA    NA       NA    NA
## 6     NA           /wiki/Encyclopedia           Encyclopedia    NA    NA        NA    NA    NA       NA    NA
## 7     NA /wiki/Wikipedia:Introduction Wikipedia:Introduction    NA    NA        NA    NA    NA       NA    NA
## 8     NA     /wiki/Special:Statistics     Special:Statistics    NA    NA        NA    NA    NA       NA    NA
## 9     NA       /wiki/English_language       English language    NA    NA        NA    NA    NA       NA    NA
## 10    NA            /wiki/Portal:Arts            Portal:Arts    NA    NA        NA    NA    NA       NA    NA
## ..   ...                          ...                    ...   ...   ...       ...   ...   ...      ...   ...

Alternately, you could use purrr:

library(rvest)
library(purrr)

pg <- read_html("https://en.wikipedia.org/wiki/Main_Page")

# html_attrs() is exported by rvest itself and, given a nodeset, already
# returns the attributes of every node as a list, so no map() over the nodes
# (and no attached xml2) is needed before row-binding.
html_nodes(pg, "a") %>%
  html_attrs() %>%
  map_df(~as.list(.))

## # A tibble: 342 × 10
##       id                         href                  title class   dir accesskey   rel hreflang  lang style
##    <chr>                        <chr>                  <chr> <chr> <chr>     <chr> <chr>    <chr> <chr> <chr>
## 1    top                         <NA>                   <NA>  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 2   <NA>                     #mw-head                   <NA>  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 3   <NA>                    #p-search                   <NA>  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 4   <NA>              /wiki/Wikipedia              Wikipedia  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 5   <NA>           /wiki/Free_content           Free content  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 6   <NA>           /wiki/Encyclopedia           Encyclopedia  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 7   <NA> /wiki/Wikipedia:Introduction Wikipedia:Introduction  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 8   <NA>     /wiki/Special:Statistics     Special:Statistics  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 9   <NA>       /wiki/English_language       English language  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## 10  <NA>            /wiki/Portal:Arts            Portal:Arts  <NA>  <NA>      <NA>  <NA>     <NA>  <NA>  <NA>
## # ... with 332 more rows

which I think is more functionally idiomatic and an overall cleaner approach.

XML_Nodeset to Dataframe

Here is a different approach, using the JSON data provided on the site:

library(xml2)
library(jsonlite)
library(magrittr)

# NOTE: `url` must already hold the address of the player page; it is not
# defined anywhere in this snippet.
page <- xml2::read_html(url)

# The stats are read as JSON text out of the <div id='player_stats_json'>
# element matched by the XPath below.
stats_node <- xml2::xml_find_all(
  page,
  "//div[@id='player_stats_json' and position()=2]"
)
stats <- jsonlite::fromJSON(xml2::xml_text(stats_node))

# Transpose so that each statistic is printed on its own row.
t(as.data.frame(stats))
  
#                   [,1]
# test                 0
# ppace               90
# pshooting           88
# ppassing            92
# pdribbling          95
# pdefending          91
# pphysical           88
# acceleration        93
# sprintspeed         88
# agility             86
# balance             98
# reactions           93
# ballcontrol         97
# dribbling           95
# positioning         90
# finishing           84
# shotpower           94
# longshotsaccuracy   92
# volleys             76
# penalties           84
# interceptions       93
# headingaccuracy     70
# marking             92
# standingtackle      96
# slidingtackle       85
# vision              96
# crossing            82
# freekickaccuracy    67
# shortpassing        99
# longpassing         94
# curve               85
# jumping             84
# stamina             94
# strength            84
# aggression          93
# composure           98

Coerce elements in xml/html document to data.frame

With xml that is already in a simple structure such as this, it can easily be converted using xmlToDataFrame() from XML package as follows:

library(XML)

# Sample document: a <grp> root holding two <elementx> records, each with the
# same three child tags (sub_a, sub_b, sub_c).
xmlstr <- 
"<grp>
  <elementx>
    <sub_a>a</sub_a>
    <sub_b>b</sub_b>
    <sub_c>c</sub_c>
  </elementx>
  <elementx>
    <sub_a>1</sub_a>
    <sub_b>2</sub_b>
    <sub_c>3</sub_c>
  </elementx>
</grp>"  

# xmlToDataFrame() (XML package) is given the XML text itself; as the printed
# result below shows, each <elementx> becomes a row and each child tag a column.
df <- xmlToDataFrame(xmlstr)
# Print the resulting data frame.
df
#  sub_a sub_b sub_c
#1     a     b     c
#2     1     2     3

If the xml is in a file, rather than a string, then you can use this:

# Read the file's contents as text (one element per line) and hand them to
# xmlToDataFrame() in place of the inline string used above.
# NOTE(review): xmlToDataFrame() is documented to accept a file name as well;
# confirm whether the intermediate readLines() step is needed here.
xmlstr <- readLines("test.xml")
df <- xmlToDataFrame(xmlstr)

If you want to get your xml directly from an http address, (as in the comment to your question) then this works

# Download the Atom feed as lines of text first, then pass that text to
# xmlToDataFrame() for conversion.
doc <- readLines("https://www.gov.uk/government/announcements.atom?announcement_filter_option=statements&topics%5B%5D=transport")
df2 <- xmlToDataFrame(doc)

If you are dealing with very large xml files that are slow to convert this way, this blog post describes a faster function hosted on github that you could try: require(devtools); install_github("processVISION", "muschellij2")

Parsing XML files in R:Extract dataframe from XML nodeset

Using sapply with xmlAttrs solved my problem of extracting the values under the node "Segment". This worked fine for me:

# Select every <Segment> node whose `score` attribute equals '30'.
# `input` is the parsed XML document and is not defined in this snippet.
nodes <- getNodeSet(input, "//Segment[@score='30']")

# Apply xmlAttrs() to each selected node and let sapply() collect the results.
all_parameters <- sapply(X = nodes, FUN = xmlAttrs)

How to convert html_node to list in R

The function I was searching for was html_children().

The script now reads like this:

# Take the ".project-row" node from `html_out` (defined outside this snippet),
# split it into its child nodes and read the text of each child separately.
temp_list <- html_text(html_children(html_node(html_out, ".project-row")))

# remove \r\n
temp_list <- gsub("\r\n", "", temp_list)

# removing multiple spaces: strip leading/trailing spaces and drop any space
# that directly follows another space
temp_list <- gsub("^ *|(?<= ) | *$", "", temp_list, perl = TRUE)

Applying html_children converts temp_list from an object of class xml_node to a xml_nodeset. xml_nodeset can be used similar to a list, for example by addressing single elements via temp_list[j]. Applying then html_text on the xml_nodeset retains the list-like structure and does not concatenate the elements together.

R: convert XML data to data frame

It may not be as verbose as the XML package but xml2 doesn't have the memory leaks and is laser-focused on data extraction. I use trimws which is a really recent addition to R core.

library(xml2)

pg <- read_xml("http://www.ggobi.org/book/data/olive.xml")

# get all the <record>s
recs <- xml_find_all(pg, "//record")

# extract and clean all the columns
vals <- trimws(xml_text(recs))

# extract and clean (if needed) the area names
labs <- trimws(xml_attr(recs, "label"))

# mine the column names from the two variable descriptions
# this XPath construct lets us grab either the <categ…> or <real…> tags
# and then grabs the 'name' attribute of them
cols <- xml_attr(xml_find_all(pg, "//data/variables/*[self::categoricalvariable or
                                                      self::realvariable]"), "name")

# this converts each set of <record> columns to a data frame
# after first converting each row to numeric and assigning
# names to each column (making it easier to do the matrix to data frame conv)
# the record text is split on runs of whitespace; a POSIX character class is
# used so the pattern needs no backslash escape inside the R string
dat <- do.call(rbind, lapply(strsplit(vals, "[[:space:]]+"),
                             function(x) {
                               data.frame(rbind(setNames(as.numeric(x), cols)))
                             }))

# then assign the area name column to the data frame
dat$area_name <- labs

head(dat)
##   region area palmitic palmitoleic stearic oleic linoleic linolenic
## 1      1    1     1075          75     226  7823      672        NA
## 2      1    1     1088          73     224  7709      781        31
## 3      1    1      911          54     246  8113      549        31
## 4      1    1      966          57     240  7952      619        50
## 5      1    1     1051          67     259  7771      672        50
## 6      1    1      911          49     268  7924      678        51
##   arachidic eicosenoic    area_name
## 1        60         29 North-Apulia
## 2        61         29 North-Apulia
## 3        63         29 North-Apulia
## 4        78         35 North-Apulia
## 5        80         46 North-Apulia
## 6        70         44 North-Apulia

UPDATE

I'd probably do the last bit this way now:

library(tidyverse)

# Split each record on whitespace, convert the fields to numeric (as the
# base-R version does), name them with `cols`, and row-bind the one-row
# tibbles. as_tibble() replaces the deprecated as_data_frame().
strsplit(vals, "[[:space:]]+") %>%
  map_df(~ as_tibble(as.list(setNames(as.numeric(.), cols)))) %>%
  # then attach the area name taken from each record's label
  mutate(area_name = labs)

Convert Xml_Nodeset to Data.Frame