R Xml - Combining Parent and Child Nodes(W Same Name) into Data Frame

R XML - combining parent and child nodes(w same name) into data frame

Here is a solution to try. See comments for an explanation of the process steps:

library(xml2)
library(dplyr)

# Parse the document and collect every <Player> element.
x <- read_xml('player.xml')
Players3 <- xml_find_all(x, '//Player')

# Build a one-row data frame for each player.
dfs <- lapply(Players3, function(node) {
  # Fetch the child nodes once; names, attributes and text all come from them.
  kids <- xml_children(node)
  tag_names <- xml_name(kids)
  type_attr <- xml_attr(kids, "Type")

  # Label each column with the child's "Type" attribute, falling back to the
  # tag name for children that carry no such attribute.
  col_names <- ifelse(is.na(type_attr), tag_names, type_attr)

  # One row with one column per child; each cell holds the child's text.
  setNames(
    data.frame(t(xml_text(kids)), stringsAsFactors = FALSE),
    col_names
  )
})

# Stack the per-player rows, then attach each player's uID attribute.
answer <- bind_rows(dfs)
answer$UID <- xml_attr(Players3, "uID")
answer

# Name Position first_name last_name birth_date weight height jersey_num real_position
# 1 Denis Petric Goalkeeper Denis Petric 1988-05-24 83 187 1 Goalkeeper
# 2 Mathieu Michel Goalkeeper Mathieu Michel 1991-09-04 84 189 1 Goalkeeper
# real_position_side join_date country birth_place first_nationality preferred_foot UID
# 1 Unknown 2016-01-02 Slovenia <NA> <NA> <NA> p40511
# 2 Unknown 2016-08-18 France Nimes France Right p119744

Parse xml and combine child attribute

Here is a solution using the xml2 package. It is straightforward: read the "item" parent nodes and parse out the title and creator. Then, using lapply, process each parent node to parse and merge the multiple child nodes together. Finally, merge everything together.

library(xml2)
library(dplyr)
# Read the page and collect the parent <item> nodes.
# NOTE(review): `test.xml` is passed unquoted, so it must already exist as an
# R object; quote it ("test.xml") if a file name is intended.
page <- read_xml(test.xml)
items <- page %>% xml_find_all("item")

# Get title and creator (assuming one of each per parent).
title <- items %>% xml_find_first("title") %>% xml_text()
creator <- items %>% xml_find_first("creator") %>% xml_text()

# Find the multiple "tag" and "store" category nodes of each parent and
# collapse the multiples into one comma-separated value.
dfs <- lapply(items, function(node) {
  tag <- node %>%
    xml_find_all(xpath = './/category[@domain="tag"]') %>%
    xml_text()
  tag <- paste(tag, collapse = ", ")

  store <- node %>%
    xml_find_all(xpath = './/category[@domain="store"]') %>%
    xml_text()
  store <- paste(store, collapse = ", ")

  # Keep the values as character: on R < 4.0 data.frame() would otherwise
  # build factors, which bind_rows() then has to reconcile across rows.
  data.frame(tag, store, stringsAsFactors = FALSE)
})

# Combine everything into one data frame (character columns, not factors).
finalanswer <- data.frame(title, creator, bind_rows(dfs),
                          stringsAsFactors = FALSE)

Parse xml to dataframe including children and attributes in R

Since you use xml2 and require various data nodes that differ across nested levels, consider XSLT, the special-purpose language (like SQL) designed to transform XML files. In R, the xslt package, sister module to xml2, can run XSLT 1.0 scripts. The recursive, template nature of XSLT helps avoid complex nested loops or mapping at application layer, here being R. Plus XSLT is portable (like SQL) and can be run outside of R.

While this may be a whole new concept out of left field requiring a learning curve, it cleanly flattens your XML to the 2-D structure needed for data sets. You also separate XML handling (XSLT) from data handling (R). Specifically, only Player level is retained with respective Team data migrated down (see demo).

XSLT (save as .xsl, a special .xml file)

<!-- Flattens the feed so that each Player element carries its Team's
     attributes, its own attributes and its Stat values as child elements. -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>

<!-- Wrapper elements: emit nothing themselves, just descend into all child elements. -->
<xsl:template match="/SoccerFeed|SoccerDocument">
<xsl:apply-templates select="*"/>
</xsl:template>

<!-- Team: emit nothing for the team itself; process only its Player children. -->
<xsl:template match="Team">
<xsl:apply-templates select="Player"/>
</xsl:template>

<!-- Team attribute: becomes an element named "team_" + attribute name,
     holding the attribute value. -->
<xsl:template match="Team/@*">
<xsl:element name="{concat('team_', name(.))}">
<xsl:value-of select="."/>
</xsl:element>
</xsl:template>

<!-- Player: shallow-copy the element, then fill it with (in order) the
     ancestor Team's attributes, verbatim copies of Name and Position, and
     the player's own attributes and Stat children via the templates below. -->
<xsl:template match="Player">
<xsl:copy>
<xsl:apply-templates select="ancestor::Team/@*"/>
<xsl:copy-of select="Name|Position"/>
<xsl:apply-templates select="@*|Stat"/>
</xsl:copy>
</xsl:template>

<!-- Player attribute: becomes an element with the attribute's name and value. -->
<xsl:template match="Player/@*">
<xsl:element name="{name(.)}">
<xsl:value-of select="."/>
</xsl:element>
</xsl:template>

<!-- Stat: becomes an element named after its Type attribute, holding the
     Stat's text. -->
<xsl:template match="Stat">
<xsl:element name="{@Type}">
<xsl:value-of select="text()"/>
</xsl:element>
</xsl:template>
</xsl:stylesheet>

Online Demo

R (results in data frame of all character types)

library(xml2)
library(xslt)
library(dplyr)

# INPUT SOURCE
doc <- read_xml("/path/to/Input.xml")
# The stylesheet is read from a plain path. `package` is an argument of
# system.file(), not of read_xml(), so it is not passed here.
style <- read_xml("/path/to/Style.xsl")

# TRANSFORM
new_xml <- xml_xslt(doc, style)

# RETRIEVE Player NODES
recs <- xml_find_all(new_xml, "//Player")

# BIND EACH CHILD TEXT AND NAME TO Player DFs
# Each Player becomes a one-row data frame: column names are the child element
# names and cell values are the child text (all character).
df_list <- lapply(recs, function(r) {
  kids <- xml_children(r) # fetched once, used for both text and names
  data.frame(rbind(setNames(xml_text(kids), xml_name(kids))),
             stringsAsFactors = FALSE)
})

# BIND ALL DFs TO SINGLE MASTER DF
final_df <- bind_rows(df_list)

Converting a nested XML document to a data frame, when information is both in attribute and text nodes

Consider binding attributes with the internal method, xmlAttrsToDataFrame, and elements with xmlToDataFrame, assuming only one set of user and sibling tags per record.

library(XML)
...

# Bind the <record> attributes, the <user> attributes and the <record> child
# element values side by side, and label the resulting columns in one step.
record_df <- setNames(
  cbind(
    XML:::xmlAttrsToDataFrame(getNodeSet(doc, path = "//record")),
    XML:::xmlAttrsToDataFrame(getNodeSet(doc, path = "//user")),
    xmlToDataFrame(doc, nodes = getNodeSet(doc, "//record"))
  ),
  c(
    "record_id", "record_size", "user_id", "user_origin",
    "record_user", "record_category", "record_rating", "record_text"
  )
)

record_df
# record_id record_size user_id user_origin record_user record_category record_rating record_text
# 1 512 1 8412 ab Certificates 80 \nLorem ipsum dolor ...
# 2 452 2 7623 bb Certificates 70 \nUt enim ad minim ...

Parse XML files with subnodes of the same name

Here is a solution based on this question/answer: R XML - combining parent and child nodes(w same name) into data frame

library(xml2)
library(dplyr)
# Sample document: every <row> holds <column> children whose "name" attribute
# identifies the field.
page <- read_xml('<resultset>
<row>
<column name="indexpatient">2</column>
<column name="height" null="true"></column>
<column name="ParameterMeasure">Cardiac/MM/Dimension/LVIDd</column>
<column name="ParameterId">MM/LVIDd</column>
<column name="ResultIdentifier">Average</column>
<column name="ResultValue">0.05617021151</column>
</row>
<row>
<column name="indexpatient">2</column>
<column name="height" null="true"></column>
<column name="ParameterMeasure">Cardiac/MM/Dimension/LVIDd</column>
<column name="ParameterId">MM/LVIDs</column>
<column name="ResultIdentifier">Measurement No. 1</column>
<column name="ResultValue">0.05341702</column>
</row>
</resultset>')

rows <- xml_find_all(page, '//row')

# Turn each <row> into a one-row data frame.
dfs <- lapply(rows, function(node) {
  # Fetch the <column> children once; labels and values both come from them.
  cells <- xml_children(node)

  # Column labels come from the "name" attribute, cell values from the text.
  setNames(
    data.frame(t(xml_text(cells)), stringsAsFactors = FALSE),
    xml_attr(cells, "name")
  )
})

# Stack the per-row data frames into the final result.
answer <- bind_rows(dfs)
answer

# indexpatient height ParameterMeasure ParameterId ResultIdentifier ResultValue
# 1 2 Cardiac/MM/Dimension/LVIDd MM/LVIDd Average 0.05617021151
# 2 2 Cardiac/MM/Dimension/LVIDd MM/LVIDs Measurement No. 1 0.05341702
>

XML nodes to R data frame with all higher-level node attributes as columns

You could do this:

library(xml2)
library(purrr)
library(readr)
library(rvest)
library(tibble)

# Every <individual> node found anywhere in the document.
individuals <- xml_find_all(read_xml("test.xml"), "//individual")

# Select nodes relative to `individual` with `xpath` and return their text as
# a character vector named by the nodes' tag names.
to_add <- function(individual, xpath) {
  matched <- html_nodes(individual, xpath = xpath)
  setNames(object = html_text(matched), nm = html_name(matched))
}

# Collect, for one <individual> node, a single named vector made of:
#   * the text of leaf elements under its grandparent and parent, and of its
#     own child elements (named by tag), followed by
#   * its own attributes and those of its three nearest ancestors.
get_data <- function(individual) {

  # Attributes of the node itself, then parent, grandparent, great-grandparent.
  out <- c(
    individual %>% html_attrs(),
    individual %>% html_nodes(xpath = "..") %>% html_attrs() %>% unlist,
    individual %>% html_nodes(xpath = "../..") %>% html_attrs() %>% unlist,
    individual %>% html_nodes(xpath = "../../..") %>% html_attrs() %>% unlist
  )

  # `[not(descendant::*)]` keeps only elements that have no element children.
  xpathes <- c("../../*[not(descendant::*)]", "../*[not(descendant::*)]", "*")

  # lapply() + unlist() instead of sapply(): when every xpath matches the same
  # number (> 1) of nodes, sapply() simplifies to a matrix and the following
  # c() would drop the element names.
  c(unlist(lapply(xpathes, to_add, individual = individual)), out)
}

And then:

# Run get_data() on every individual, bind the resulting vectors as columns,
# and convert to a tibble. as_tibble() replaces the deprecated as.tibble().
lapply(individuals, get_data) %>%
  do.call(what = cbind) %>%
  as_tibble()

Melting repeated child nodes XML into a tidy data set using R

The host, ip, hostname, etc. will be repeated when you add the fndvuln elements to your data.frame (try data.frame("a", 1:3)).

# One data frame per element of `vuln`, with one row per <fndvuln> match.
# data.frame() recycles the shorter host/ip/hostname columns so they repeat
# on every row.
x <- lapply(vuln, function(host_node) {
  data.frame(
    host = xpathSApply(host_node, ".", xmlGetAttr, "id"),
    ip = xpathSApply(host_node, ".//ip", xmlValue),
    hostname = xpathSApply(host_node, ".//hostname", xmlValue),
    VulnID = xpathSApply(host_node, ".//fndvuln", xmlGetAttr, "id"),
    port = xpathSApply(host_node, ".//fndvuln", xmlGetAttr, "port")
  )
})

# Stack the per-host data frames.
do.call("rbind", x)
host ip hostname VulnID port
1 169274 some_IP_here Some_DNS_name_here 534 80
2 169274 some_IP_here Some_DNS_name_here 1191 22
3 169275 some_IP_here Some_DNS_name_here 5452 ip
4 169275 some_IP_here Some_DNS_name_here 5092 123
5 169275 some_IP_here Some_DNS_name_here 16157 123

xml with nested siblings to data frame in R

Considering xml holds your example string, here's another strategy for Residential Properties with a varying number of items:

library(XML)
library(plyr)
# xml <- '<ResidentialProperty>........'
doc <- xmlParse(xml, asText = TRUE)

# Build one single-row data frame per <ResidentialProperty> and stack them;
# rbind.fill() pads columns that are missing from some properties with NA.
df <- do.call(
  rbind.fill,
  lapply(doc['//ResidentialProperty'], function(property) {
    # Names of the nodes selected under this property; text nodes are
    # reported as "text".
    node_names <- xpathSApply(property, './/.', xmlName)
    # The entry just before each "text" entry is used as that text's label.
    labels <- node_names[which(node_names == "text") - 1]
    text_values <- xpathSApply(property, ".//text()", xmlValue)
    as.data.frame(t(setNames(text_values, labels)), stringsAsFactors = FALSE)
  })
)
df
# StreetNumber StreetName StreetSuffix StateOrProvince StatusChangeDate Latitude Longitude County SchoolDistrict View YearBuilt InteriorFeatures Name Roof Exterior
# 1 11111 111th Avenue Ct WA 2015-07-05T23:48:53.410 11.111111 -111.111111 Pierce Puyallup Territorial 1997 Bath Off Master,Dbl Pane/Storm Windw Vacant Composition Brick,Cement Planked,Wood,Wood Products


Related Topics



Leave a reply



Submit