R XML - combining parent and child nodes(w same name) into data frame
Here is a solution to try. See comments for an explanation of the process steps:
library(xml2)
library(dplyr)
# Parse the file and collect every <Player> node.
x <- read_xml("player.xml")
Players3 <- xml_find_all(x, "//Player")

# Build a one-row data frame per player: one column per child node.
dfs <- lapply(Players3, function(node) {
  kids <- xml_children(node)
  # A column is labelled by the child's "Type" attribute when it has one,
  # otherwise by the child's tag name.
  type_attr <- xml_attr(kids, "Type")
  col_names <- ifelse(is.na(type_attr), xml_name(kids), type_attr)
  # The children's text becomes a single row of character values.
  row <- data.frame(t(xml_text(kids)), stringsAsFactors = FALSE)
  names(row) <- col_names
  row
})

# Stack the per-player rows, then attach each player's "uID" attribute.
answer <- bind_rows(dfs)
answer$UID <- xml_attr(Players3, "uID")
answer
# Name Position first_name last_name birth_date weight height jersey_num real_position
# 1 Denis Petric Goalkeeper Denis Petric 1988-05-24 83 187 1 Goalkeeper
# 2 Mathieu Michel Goalkeeper Mathieu Michel 1991-09-04 84 189 1 Goalkeeper
# real_position_side join_date country birth_place first_nationality preferred_foot UID
# 1 Unknown 2016-01-02 Slovenia <NA> <NA> <NA> p40511
# 2 Unknown 2016-08-18 France Nimes France Right p119744
Parse xml and combine child attribute
Here is a solution using the xml2 package. It is straightforward: read the "item" parent nodes and parse out the title and creator. Then, using lapply,
process each parent node to parse and merge the multiple child nodes together. Finally, merge everything together.
library(xml2)
library(dplyr)
# Read the page and select the <item> parent nodes.
# The file name must be a quoted string; a bare test.xml is looked up as an
# R object and fails with "object 'test.xml' not found".
page <- read_xml("test.xml")
items <- page %>% xml_find_all("item")

# Get title and creator (assuming one of each per parent).
title <- items %>% xml_find_first("title") %>% xml_text()
creator <- items %>% xml_find_first("creator") %>% xml_text()

# For each parent, find the repeated "tag" and "store" category nodes and
# collapse each group into a single comma-separated value.
dfs <- lapply(items, function(node) {
  tag <- node %>%
    xml_find_all(xpath = './/category[@domain="tag"]') %>%
    xml_text()
  store <- node %>%
    xml_find_all(xpath = './/category[@domain="store"]') %>%
    xml_text()
  # Keep the values as character so bind_rows() does not have to reconcile
  # factor levels on R versions before 4.0.
  data.frame(
    tag = paste(tag, collapse = ", "),
    store = paste(store, collapse = ", "),
    stringsAsFactors = FALSE
  )
})

# Combine everything into one data frame.
finalanswer <- data.frame(title, creator, bind_rows(dfs),
                          stringsAsFactors = FALSE)
XML nodes to R data frame with all higher-level node attributes as columns
You could do this:
library(xml2)
library(purrr)
library(readr)
library(rvest)
library(tibble)
# Parse the file and select every <individual> node, wherever it is nested.
individuals <- xml_find_all(read_xml("test.xml"), "//individual")
# Return the text of the nodes matched by `xpath` (evaluated relative to
# `individual`) as a character vector named by each node's tag name.
to_add <- function(individual, xpath) {
  matched <- html_nodes(individual, xpath = xpath)
  setNames(object = html_text(matched), nm = html_name(matched))
}
# Flatten one <individual> node into a single named character vector.
#
# The result holds, in order: the text of the childless siblings of the
# node's grandparent and parent, the text of the node's own children, then
# the attributes of the node itself and of its three nearest ancestors.
#
# individual: a single XML node.
# Returns a named character vector.
get_data <- function(individual) {
  # Attributes of every node matched by `xpath`, flattened to one vector.
  attrs_at <- function(xpath) {
    unlist(html_attrs(html_nodes(individual, xpath = xpath)))
  }
  out <- c(
    html_attrs(individual),
    attrs_at(".."),
    attrs_at("../.."),
    attrs_at("../../..")
  )
  xpathes <- c("../../*[not(descendant::*)]", "../*[not(descendant::*)]", "*")
  # lapply() rather than sapply(): when every query returns the same number
  # (> 1) of nodes, sapply() simplifies to a matrix and the c() below then
  # discards the element names. A list keeps the names in every case.
  leaves <- lapply(xpathes, to_add, individual = individual)
  c(unlist(leaves), out)
}
And then:
# Extract one named vector per individual, bind them as columns, and convert
# the resulting matrix to a tibble. as_tibble() replaces the deprecated
# as.tibble() alias.
lapply(individuals, get_data) %>%
  do.call(what = cbind) %>%
  as_tibble()
Parse xml to dataframe including children and attributes in R
Since you use xml2
and require various data nodes that differ across nested levels, consider XSLT, the special-purpose language (like SQL) designed to transform XML files. In R, the xslt
package, sister module to xml2
, can run XSLT 1.0 scripts. The recursive, template nature of XSLT helps avoid complex nested loops or mapping at application layer, here being R. Plus XSLT is portable (like SQL) and can be run outside of R.
While this may be a whole new concept out of left field requiring a learning curve, it cleanly flattens your XML to the 2-D structure needed for data sets. You also separate XML handling (XSLT) from data handling (R). Specifically, only Player level is retained with respective Team data migrated down (see demo).
XSLT (save as .xsl, a special .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Flattens the feed to Player elements whose data are all child elements:
     Team attributes (prefixed "team_"), Name, Position, Player attributes,
     and one element per Stat named after its Type attribute. -->
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- Wrapper elements: emit nothing themselves, just descend into children. -->
<xsl:template match="/SoccerFeed|SoccerDocument">
<xsl:apply-templates select="*"/>
</xsl:template>
<!-- Team: only its Player children are processed. -->
<xsl:template match="Team">
<xsl:apply-templates select="Player"/>
</xsl:template>
<!-- Each Team attribute becomes an element named team_<attribute name>. -->
<xsl:template match="Team/@*">
<xsl:element name="{concat('team_', name(.))}">
<xsl:value-of select="."/>
</xsl:element>
</xsl:template>
<!-- Player: copy the element, then emit the ancestor Team's attributes,
     the Name and Position children, and finally the Player's own
     attributes and Stat children (via the templates below). -->
<xsl:template match="Player">
<xsl:copy>
<xsl:apply-templates select="ancestor::Team/@*"/>
<xsl:copy-of select="Name|Position"/>
<xsl:apply-templates select="@*|Stat"/>
</xsl:copy>
</xsl:template>
<!-- Each Player attribute becomes an element with the attribute's name. -->
<xsl:template match="Player/@*">
<xsl:element name="{name(.)}">
<xsl:value-of select="."/>
</xsl:element>
</xsl:template>
<!-- Each Stat becomes an element named by its Type attribute, holding the
     Stat's text. -->
<xsl:template match="Stat">
<xsl:element name="{@Type}">
<xsl:value-of select="text()"/>
</xsl:element>
</xsl:template>
</xsl:stylesheet>
Online Demo
R (results in data frame of all character types)
library(xml2)
library(xslt)
library(dplyr)
# INPUT SOURCE
# read_xml() has no `package` argument (that belongs to system.file()), so
# the stylesheet is read by path alone.
doc <- read_xml("/path/to/Input.xml")
style <- read_xml("/path/to/Style.xsl")

# TRANSFORM
new_xml <- xml_xslt(doc, style)

# RETRIEVE Player NODES
recs <- xml_find_all(new_xml, "//Player")

# BIND EACH CHILD TEXT AND NAME TO Player DFs
# Each Player becomes a one-row data frame: column names are the child tag
# names and values are the child text.
df_list <- lapply(recs, function(r) {
  children <- xml_children(r)
  data.frame(
    rbind(setNames(xml_text(children), xml_name(children))),
    stringsAsFactors = FALSE
  )
})

# BIND ALL DFs TO SINGLE MASTER DF
final_df <- bind_rows(df_list)
Converting a nested XML document to a data frame, when information is both in attribute and text nodes
Consider binding attributes with the internal method, xmlAttrsToDataFrame
, and elements with xmlToDataFrame
, assuming only one set of user and sibling tags per record.
library(XML)
...
# Select the node sets once, then bind, column-wise: the <record> attributes,
# the <user> attributes, and the child elements of each <record>.
record_nodes <- getNodeSet(doc, path = "//record")
user_nodes <- getNodeSet(doc, path = "//user")
record_df <- cbind(
  XML:::xmlAttrsToDataFrame(record_nodes),
  XML:::xmlAttrsToDataFrame(user_nodes),
  xmlToDataFrame(doc, nodes = record_nodes)
)

# Relabel the columns so the source of each value is explicit.
names(record_df) <- c(
  "record_id", "record_size", "user_id", "user_origin",
  "record_user", "record_category", "record_rating", "record_text"
)
record_df
# record_id record_size user_id user_origin record_user record_category record_rating record_text
# 1 512 1 8412 ab Certificates 80 \nLorem ipsum dolor ...
# 2 452 2 7623 bb Certificates 70 \nUt enim ad minim ...
Melting repeated child nodes XML into a tidy data set using R
The host, ip, hostname, etc will be repeated when you add the fndvuln elements to your data.frame (try data.frame("a", 1:3)
)
# One data frame per host node. The host-level values (id, ip, hostname) are
# recycled by data.frame() across the rows produced by the fndvuln elements.
x <- lapply(vuln, function(node) {
  data.frame(
    host = xpathSApply(node, ".", xmlGetAttr, "id"),
    ip = xpathSApply(node, ".//ip", xmlValue),
    hostname = xpathSApply(node, ".//hostname", xmlValue),
    VulnID = xpathSApply(node, ".//fndvuln", xmlGetAttr, "id"),
    port = xpathSApply(node, ".//fndvuln", xmlGetAttr, "port")
  )
})
# Stack the per-host data frames.
do.call(rbind, x)
host ip hostname VulnID port
1 169274 some_IP_here Some_DNS_name_here 534 80
2 169274 some_IP_here Some_DNS_name_here 1191 22
3 169275 some_IP_here Some_DNS_name_here 5452 ip
4 169275 some_IP_here Some_DNS_name_here 5092 123
5 169275 some_IP_here Some_DNS_name_here 16157 123
xml with nested siblings to data frame in R
Considering xml
holds your example string, here's another strategy for Residential Properties with a varying number of items:
library(XML)
library(plyr)
# xml <- '<ResidentialProperty>........'
doc <- xmlParse(xml, asText = TRUE)
# Build a one-row data frame per ResidentialProperty, then let rbind.fill()
# pad the columns that are missing from some properties.
df <- rbind.fill(lapply(doc["//ResidentialProperty"], function(property) {
  node_names <- xpathSApply(property, ".//.", xmlName)
  # The entry just before each "text" entry is the name used to label it.
  label_names <- node_names[which(node_names == "text") - 1]
  text_values <- xpathSApply(property, ".//text()", xmlValue)
  as.data.frame(t(setNames(text_values, label_names)), stringsAsFactors = FALSE)
}))
df
# StreetNumber StreetName StreetSuffix StateOrProvince StatusChangeDate Latitude Longitude County SchoolDistrict View YearBuilt InteriorFeatures Name Roof Exterior
# 1 11111 111th Avenue Ct WA 2015-07-05T23:48:53.410 11.111111 -111.111111 Pierce Puyallup Territorial 1997 Bath Off Master,Dbl Pane/Storm Windw Vacant Composition Brick,Cement Planked,Wood,Wood Products
Data frame from multiple XML files
We can encapsulate your data gathering process into a function. Since each XML file seems to represent one row in your desired output dataframe, we can use purrr::map_dfr
to construct the data rowwise. I slightly modified your code. See below:
#' Read one ANZCTR trial XML file into a one-row tibble.
#'
#' @param file_name Path to an XML file whose root element is `ANZCTR_Trial`.
#' @return A one-row tibble with one character column per field listed in
#'   `xpaths`. A field that is absent from the file is `NA`; if a field occurs
#'   more than once, only its first occurrence is kept.
get_row_data <- function(file_name) {
  xml <- xml2::read_xml(file_name)
  xpaths <- c(
    ACTR_Number = "/ANZCTR_Trial/actrnumber",
    primary_sponsor_type = "/ANZCTR_Trial/sponsorship/primarysponsortype",
    primary_sponsor_name = "/ANZCTR_Trial/sponsorship/primarysponsorname",
    primary_sponsor_address = "/ANZCTR_Trial/sponsorship/primarysponsoraddress",
    primary_sponsor_country = "/ANZCTR_Trial/sponsorship/primarysponsorcountry",
    funding_source_type = "/ANZCTR_Trial/sponsorship/fundingsource/fundingtype",
    funding_source_name = "/ANZCTR_Trial/sponsorship/fundingsource/fundingname",
    funding_source_address = "/ANZCTR_Trial/sponsorship/fundingsource/fundingaddress",
    funding_source_country = "/ANZCTR_Trial/sponsorship/fundingsource/fundingcountry",
    secondary_sponsor_type = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsortype",
    secondary_sponsor_name = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsorname",
    secondary_sponsor_address = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsoraddress",
    secondary_sponsor_country = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsorcountry"
  )
  # Query each XPath on its own. A single union query returns nodes in
  # document order, so labelling its result by position mislabels columns
  # (or fails) whenever an element is missing, repeated, or ordered
  # differently from `xpaths`.
  values <- vapply(
    xpaths,
    \(xpath) xml2::xml_text(xml2::xml_find_first(xml, xpath), trim = TRUE),
    character(1)
  )
  tibble::as_tibble(as.list(values))
}
# `pattern` is a regular expression: escape the dot and anchor the end so
# only names ending in ".xml" match (".xml" alone also matches "axml.txt").
all_files <- list.files(path = getwd(), pattern = "\\.xml$", full.names = TRUE)
# Read every file and stack the resulting one-row tibbles.
purrr::map_dfr(all_files, get_row_data)
Output
# A tibble: 2 x 13
ACTR_Number primary_sponsor_~ primary_sponsor_n~ primary_sponsor_~ primary_sponsor~ funding_source_~ funding_source_~
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ACTRN12605000003673 Hospital Barwon Health "272-322 Ryrie S~ Australia Commercial sect~ Astra Zeneca
2 ACTRN12605000025639 Other Collaborat~ Australian Gastro~ "88 Mallett St\n~ Australia Commercial sect~ Roche Products ~
# ... with 6 more variables: funding_source_address <chr>, funding_source_country <chr>, secondary_sponsor_type <chr>,
# secondary_sponsor_name <chr>, secondary_sponsor_address <chr>, secondary_sponsor_country <chr>
How to parse XML attributes with parent attribute into data frame in R
The 'trick' is to get a list of all the Edge nodes, and work with xpath
from there... You can select the Track node from each Edge node using the ancestor
axis from xpath
.
libraries used
#load libraries
library( xml2 )
library( magrittr )
sample data
# Sample document parsed from an inline string: two <Track> elements, each
# holding three <Edge> elements that carry their data as attributes.
doc <- read_xml('<?xml version="1.0" encoding="UTF-8"?>
<TrackMate version="3.8.0">
<Model spatialunits="µm" timeunits="sec">
<AllTracks>
<Track name="Track_2" TRACK_ID="2" NUMBER_SPOTS="140" NUMBER_GAPS="0" >
<Edge SPOT_SOURCE_ID="960769" SPOT_TARGET_ID="960778" LINK_COST="0.08756957830926632" />
<Edge SPOT_SOURCE_ID="958304" SPOT_TARGET_ID="958308" LINK_COST="1.4003359672950089" />
<Edge SPOT_SOURCE_ID="958316" SPOT_TARGET_ID="958322" LINK_COST="1.6985623204008202" />
</Track>
<Track name="Track_145" TRACK_ID="145" NUMBER_SPOTS="141" NUMBER_GAPS="0" >
<Edge SPOT_SOURCE_ID="961623" SPOT_TARGET_ID="961628" LINK_COST="2.2678642015413755" />
<Edge SPOT_SOURCE_ID="962122" SPOT_TARGET_ID="962127" LINK_COST="38.20777704254654" />
<Edge SPOT_SOURCE_ID="961869" SPOT_TARGET_ID="961873" LINK_COST="0.2895609647324684" />
</Track>
</AllTracks>
</Model>
</TrackMate>')
code
# Collect every Edge node in the document.
edge.nodes <- xml_find_all(doc, ".//Edge")
# One row per Edge: the TRACK_ID comes from the Edge's Track ancestor, the
# other columns from the Edge's own attributes.
data.frame(
  TRACK_ID = xml_attr(xml_find_first(edge.nodes, ".//ancestor::Track"), "TRACK_ID"),
  SPOT_SOURCE_ID = xml_attr(edge.nodes, "SPOT_SOURCE_ID"),
  SPOT_TARGET_ID = xml_attr(edge.nodes, "SPOT_TARGET_ID"),
  LINK_COST = xml_attr(edge.nodes, "LINK_COST")
)
output
# TRACK_ID SPOT_SOURCE_ID SPOT_TARGET_ID LINK_COST
# 1 2 960769 960778 0.08756957830926632
# 2 2 958304 958308 1.4003359672950089
# 3 2 958316 958322 1.6985623204008202
# 4 145 961623 961628 2.2678642015413755
# 5 145 962122 962127 38.20777704254654
# 6 145 961869 961873 0.2895609647324684
Related Topics
Making Plot Functions with Ggplot and Aes_String
Partially Color Histogram in R
Different Robust Standard Errors of Logit Regression in Stata and R
Double Clustered Standard Errors for Panel Data
How to Read Data with Different Separators
Harnessing .F List Names with Purrr::Pmap
Asymmetric Expansion of Ggplot Axis Limits
Warning in Install.Packages: Unable to Move Temporary Installation
How to Paste Together the Elements of a Vector in R Without Using a Loop
What Does the Error "Arguments Imply Differing Number of Rows: X, Y" Mean
What Are the Differences Between Concatenating Strings with Cat() and Paste()
Configuration Failed Because Libcurl Was Not Found
How to Access Global/Outer Scope Variable from R Apply Function
Find Names of Columns Which Contain Missing Values
Sum Object in a Column Between an Interval Defined by Another Column