R Xml - Combining Parent and Child Nodes into Data Frame

R XML - combining parent and child nodes(w same name) into data frame

Here is a solution to try. See comments for an explanation of the process steps:

library(xml2)
library(dplyr)

# Parse the document and collect every <Player> node.
x <- read_xml('player.xml')
Players3 <- xml_find_all(x, '//Player')

# Build a one-row data frame per player: one column per child node.
dfs <- lapply(Players3, function(node) {
  kids <- xml_children(node)

  # A child is labelled by its "Type" attribute when it has one,
  # otherwise by its own tag name.
  tag_names <- xml_name(kids)
  type_attr <- xml_attr(kids, "Type")
  col_names <- ifelse(is.na(type_attr), tag_names, type_attr)

  # The child text values become the single row of the data frame.
  row <- data.frame(t(xml_text(kids)), stringsAsFactors = FALSE)
  names(row) <- col_names
  row
})

# Stack the per-player rows, then append each player's uID attribute.
answer <- bind_rows(dfs)
answer[["UID"]] <- xml_attr(Players3, "uID")
answer

#             Name   Position first_name last_name birth_date weight height jersey_num real_position
# 1   Denis Petric Goalkeeper      Denis    Petric 1988-05-24     83    187          1    Goalkeeper
# 2 Mathieu Michel Goalkeeper    Mathieu    Michel 1991-09-04     84    189          1    Goalkeeper
#   real_position_side  join_date  country birth_place first_nationality preferred_foot     UID
# 1            Unknown 2016-01-02 Slovenia        <NA>              <NA>           <NA>  p40511
# 2            Unknown 2016-08-18   France       Nimes            France          Right p119744

Parse xml and combine child attribute

Here is a solution using the xml2 package. It is straightforward: read the "item" parent nodes and parse out the title and creator. Then, using lapply, process each parent node to parse and merge the multiple child nodes together. Finally, merge everything together.

library(xml2)
library(dplyr)
# Read the page and collect the parent <item> nodes.
# The file name must be a quoted string; a bare `test.xml` would be looked up
# as an R object and fail with "object 'test.xml' not found".
page <- read_xml("test.xml")
items <- page %>% xml_find_all("item")

# Get title and creator (assuming one of each per parent).
title <- items %>% xml_find_first("title") %>% xml_text()
creator <- items %>% xml_find_first("creator") %>% xml_text()

# For each parent, find the repeated "tag" and "store" category nodes and
# collapse each set into a single comma-separated value.
dfs <- lapply(items, function(node) {
  tag <- node %>%
    xml_find_all(xpath = './/category[@domain="tag"]') %>%
    xml_text()
  tag <- paste(tag, collapse = ", ")

  store <- node %>%
    xml_find_all(xpath = './/category[@domain="store"]') %>%
    xml_text()
  store <- paste(store, collapse = ", ")

  # Keep the values as character so bind_rows() never has to reconcile
  # differing factor levels (only matters on R < 4.0).
  data.frame(tag, store, stringsAsFactors = FALSE)
})

# Combine everything into one data frame.
finalanswer <- data.frame(title, creator, bind_rows(dfs))

XML nodes to R data frame with all higher-level node attributes as columns

You could do this:

library(xml2)
library(purrr)
library(readr)
library(rvest)
library(tibble)

# Parse the file and select every <individual> node, at any depth.
individuals <- xml_find_all(read_xml('test.xml'), '//individual')

# Return the text of every node matched by `xpath` (evaluated relative to
# `individual`) as a character vector named by the matched nodes' tag names.
to_add <- function(individual, xpath) {
  matched <- html_nodes(individual, xpath = xpath)
  setNames(object = html_text(matched), nm = html_name(matched))
}

# Flatten one <individual> node into a single named character vector.
#
# The result holds, in order:
#   1. the text of the leaf elements two levels up, the leaf elements one
#      level up, and the node's own children (named by tag name), then
#   2. the attributes of the node itself and of its three nearest ancestors.
#
# individual: a single node, as yielded when iterating over an xml2 node set.
get_data <- function(individual) {

  out <- c(
    individual %>% html_attrs(),
    individual %>% html_nodes(xpath = "..") %>% html_attrs() %>% unlist,
    individual %>% html_nodes(xpath = "../..") %>% html_attrs() %>% unlist,
    individual %>% html_nodes(xpath = "../../..") %>% html_attrs() %>% unlist
  )

  # `*[not(descendant::*)]` keeps only elements that have no element children.
  xpathes <- c("../../*[not(descendant::*)]", "../*[not(descendant::*)]", "*")

  # lapply() + unlist() rather than sapply(): when every XPath happens to
  # return the same number (> 1) of nodes, sapply() simplifies to a matrix and
  # the subsequent c() silently drops the element names.
  values <- unlist(lapply(xpathes, to_add, individual = individual))

  c(values, out)
}

And then:

# One named vector per individual, stacked row-wise so that each individual is
# a row and each attribute/field is a column. (cbind would put individuals in
# columns and lose the field names when converting to a tibble.)
# as_tibble() replaces as.tibble(), which is deprecated in tibble >= 2.0.
lapply(individuals, get_data) %>%
  do.call(what = rbind) %>%
  as_tibble()

Parse xml to dataframe including children and attributes in R

Since you use xml2 and require various data nodes that differ across nested levels, consider XSLT, the special-purpose language (like SQL) designed to transform XML files. In R, the xslt package, sister module to xml2, can run XSLT 1.0 scripts. The recursive, template nature of XSLT helps avoid complex nested loops or mapping at application layer, here being R. Plus XSLT is portable (like SQL) and can be run outside of R.

While this may be a whole new concept out of left field requiring a learning curve, it cleanly flattens your XML to the 2-D structure needed for data sets. You also separate XML handling (XSLT) from data handling (R). Specifically, only Player level is retained with respective Team data migrated down (see demo).

XSLT (save as .xsl, a special .xml file)

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <!-- Flattens the feed so that each Player element carries its Team's
       attributes, its own attributes, and its Stat values as child elements. -->
  <xsl:output indent="yes"/>
  <xsl:strip-space elements="*"/>

  <!-- Top-level wrappers: just descend into their child elements. -->
  <xsl:template match="/SoccerFeed|SoccerDocument">
      <xsl:apply-templates select="*"/>
  </xsl:template>

  <!-- Team: only its Player children are processed. -->
  <xsl:template match="Team">
      <xsl:apply-templates select="Player"/>
  </xsl:template>

  <!-- Each Team attribute becomes an element named team_<attribute name>
       whose content is the attribute value. -->
  <xsl:template match="Team/@*">
    <xsl:element name="{concat('team_', name(.))}">
      <xsl:value-of select="."/>      
    </xsl:element>
  </xsl:template>

  <!-- Player: copy the element, then emit (in this order) the ancestor Team's
       attributes, copies of Name and Position, and finally the Player's own
       attributes and Stat children via their templates. -->
  <xsl:template match="Player">
    <xsl:copy>
      <xsl:apply-templates select="ancestor::Team/@*"/>
      <xsl:copy-of select="Name|Position"/>
      <xsl:apply-templates select="@*|Stat"/>
    </xsl:copy>
  </xsl:template>

  <!-- Each Player attribute becomes an element with the attribute's name. -->
  <xsl:template match="Player/@*">
    <xsl:element name="{name(.)}">
      <xsl:value-of select="."/>      
    </xsl:element>
  </xsl:template>

  <!-- Each Stat becomes an element named after its Type attribute,
       containing the Stat's text. -->
  <xsl:template match="Stat">
    <xsl:element name="{@Type}">
      <xsl:value-of select="text()"/>     
    </xsl:element>
  </xsl:template>
</xsl:stylesheet>

Online Demo

R (results in data frame of all character types)

library(xml2)
library(xslt)
library(dplyr)

# INPUT SOURCE
# Both the data and the stylesheet are plain XML files read with read_xml().
# (`package = "xslt"` is an argument of system.file(), used in the xslt
# package examples to locate bundled files; it has no meaning for read_xml().)
doc <- read_xml("/path/to/Input.xml")
style <- read_xml("/path/to/Style.xsl")

# TRANSFORM
new_xml <- xml_xslt(doc, style)

# RETRIEVE Player NODES
recs <- xml_find_all(new_xml, "//Player")

# BIND EACH CHILD TEXT AND NAME TO Player DFs
# Each Player yields a one-row data frame: column names are the child tag
# names, values are the child text (all character).
df_list <- lapply(recs, function(r) {
  children <- xml_children(r)
  data.frame(
    rbind(setNames(xml_text(children), xml_name(children))),
    stringsAsFactors = FALSE
  )
})

# BIND ALL DFs TO SINGLE MASTER DF
final_df <- bind_rows(df_list)

Converting a nested XML document to a data frame, when information is both in attribute and text nodes

Consider binding attributes with the internal method, xmlAttrsToDataFrame, and elements with xmlToDataFrame, assuming only one set of user and sibling tags per record.

library(XML)
...

# Column-bind three pieces: the attributes of each <record>, the attributes of
# each <user>, and the child elements of each <record>.
record_df <- cbind(
  XML:::xmlAttrsToDataFrame(getNodeSet(doc, path = '//record')),
  XML:::xmlAttrsToDataFrame(getNodeSet(doc, path = '//user')),
  xmlToDataFrame(doc, nodes = getNodeSet(doc, "//record"))
)

# Give the combined columns descriptive names.
names(record_df) <- c(
  "record_id", "record_size", "user_id", "user_origin",
  "record_user", "record_category", "record_rating", "record_text"
)

record_df
#   record_id record_size user_id user_origin record_user record_category record_rating              record_text
# 1       512           1    8412          ab                Certificates            80  \nLorem ipsum dolor ...
# 2       452           2    7623          bb                Certificates            70   \nUt enim ad minim ...

Melting repeated child nodes XML into a tidy data set using R

The host, ip, hostname, etc will be repeated when you add the fndvuln elements to your data.frame (try data.frame("a", 1:3))

# Build one data frame per host node. The host-level values are recycled
# by data.frame() across that host's fndvuln entries.
x <- lapply(vuln, function(host_node) {
  host_id   <- xpathSApply(host_node, ".", xmlGetAttr, "id")
  host_ip   <- xpathSApply(host_node, ".//ip", xmlValue)
  host_name <- xpathSApply(host_node, ".//hostname", xmlValue)
  vuln_id   <- xpathSApply(host_node, ".//fndvuln", xmlGetAttr, "id")
  vuln_port <- xpathSApply(host_node, ".//fndvuln", xmlGetAttr, "port")

  data.frame(
    host = host_id,
    ip = host_ip,
    hostname = host_name,
    VulnID = vuln_id,
    port = vuln_port
  )
})

# Stack the per-host data frames into one.
do.call(rbind, x)
    host           ip            hostname VulnID port
1 169274 some_IP_here Some_DNS_name_here     534   80
2 169274 some_IP_here Some_DNS_name_here    1191   22
3 169275 some_IP_here Some_DNS_name_here    5452   ip
4 169275 some_IP_here Some_DNS_name_here    5092  123
5 169275 some_IP_here Some_DNS_name_here   16157  123

xml with nested siblings to data frame in R

Considering xml holds your example string, here's another strategy for Residential Properties with a varying number of items:

library(XML)
library(plyr) 
# xml <- '<ResidentialProperty>........'
doc <- xmlParse(xml, asText = TRUE)

# One single-row data frame per ResidentialProperty; rbind.fill() pads the
# columns that a given property does not have.
df <- do.call(
  rbind.fill,
  lapply(doc['//ResidentialProperty'], function(property) {
    # Names of every node under the property, text nodes included ("text").
    node_names <- xpathSApply(property, './/.', xmlName)
    # The entry just before each text node is used as that value's label.
    text_pos <- which(node_names == "text")
    labels <- node_names[text_pos - 1]
    texts <- xpathSApply(property, ".//text()", xmlValue)
    as.data.frame(t(setNames(texts, labels)), stringsAsFactors = FALSE)
  })
)
df
#   StreetNumber StreetName StreetSuffix StateOrProvince        StatusChangeDate  Latitude   Longitude County SchoolDistrict        View YearBuilt                     InteriorFeatures   Name        Roof                                Exterior
# 1        11111      111th    Avenue Ct              WA 2015-07-05T23:48:53.410 11.111111 -111.111111 Pierce       Puyallup Territorial      1997 Bath Off Master,Dbl Pane/Storm Windw Vacant Composition Brick,Cement Planked,Wood,Wood Products

Data frame from multiple XML files

We can encapsulate your data gathering process into a function. Since each XML file seems to represent one row in your desired output dataframe, we can use purrr::map_dfr to construct the data rowwise. I slightly modified your code. See below:

# Read one trial XML file and return its sponsorship fields as a one-row tibble.
#
# file_name: path to an XML file whose root element is ANZCTR_Trial.
# Returns a tibble with one character column per entry of `xpaths`, named by
# that entry. A field absent from the file is NA; a field that occurs several
# times is collapsed into one string separated by "; ".
get_row_data <- function(file_name) {
  xml <- xml2::read_xml(file_name)
  read_text <- \(x, xpath) rvest::html_elements(x, xpath = xpath) |> rvest::html_text(TRUE)
  xpaths <- c(
    ACTR_Number = "/ANZCTR_Trial/actrnumber",
    primary_sponsor_type = "/ANZCTR_Trial/sponsorship/primarysponsortype",
    primary_sponsor_name = "/ANZCTR_Trial/sponsorship/primarysponsorname",
    primary_sponsor_address = "/ANZCTR_Trial/sponsorship/primarysponsoraddress",
    primary_sponsor_country = "/ANZCTR_Trial/sponsorship/primarysponsorcountry",
    funding_source_type = "/ANZCTR_Trial/sponsorship/fundingsource/fundingtype",
    funding_source_name = "/ANZCTR_Trial/sponsorship/fundingsource/fundingname",
    funding_source_address = "/ANZCTR_Trial/sponsorship/fundingsource/fundingaddress",
    funding_source_country = "/ANZCTR_Trial/sponsorship/fundingsource/fundingcountry",
    secondary_sponsor_type = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsortype",
    secondary_sponsor_name = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsorname",
    secondary_sponsor_address = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsoraddress",
    secondary_sponsor_country = "/ANZCTR_Trial/sponsorship/secondarysponsor/sponsorcountry"
  )
  # Query each XPath on its own. A single union query ("a | b | ...") returns
  # nodes in document order and in unknown number, so assigning
  # names(xpaths) positionally would mislabel or fail whenever a node is
  # missing, repeated, or ordered differently in the file.
  x <- vapply(
    xpaths,
    \(xpath) {
      found <- read_text(xml, xpath)
      if (length(found) == 0L) NA_character_ else paste(found, collapse = "; ")
    },
    character(1)
  )
  # vapply() keeps names(xpaths); namespace as_tibble() like the other calls
  # so the function does not depend on tibble being attached.
  tibble::as_tibble(as.list(x))
}

# `pattern` is a regular expression: escape the dot and anchor at the end so
# only files whose names end in ".xml" are picked up (".xml" would also match
# names such as "notes_xml.txt").
all_files <- list.files(pattern = "\\.xml$", path = getwd(), full.names = TRUE)
# Build one row per file and bind the rows into a single data frame.
purrr::map_dfr(all_files, get_row_data)

Output

# A tibble: 2 x 13
  ACTR_Number         primary_sponsor_~ primary_sponsor_n~ primary_sponsor_~ primary_sponsor~ funding_source_~ funding_source_~
  <chr>               <chr>             <chr>              <chr>             <chr>            <chr>            <chr>           
1 ACTRN12605000003673 Hospital          Barwon Health      "272-322 Ryrie S~ Australia        Commercial sect~ Astra Zeneca    
2 ACTRN12605000025639 Other Collaborat~ Australian Gastro~ "88 Mallett St\n~ Australia        Commercial sect~ Roche Products ~
# ... with 6 more variables: funding_source_address <chr>, funding_source_country <chr>, secondary_sponsor_type <chr>,
#   secondary_sponsor_name <chr>, secondary_sponsor_address <chr>, secondary_sponsor_country <chr>

How to parse XML attributes with parent attribute into data frame in R

The 'trick' is to get a list of all the Edge nodes and work with XPath from there. You can select the Track node for each Edge node using the ancestor axis in XPath.

libraries used

#load libraries
library( xml2 )
library( magrittr )

sample data

# Inline sample document parsed from a string: two <Track> elements, each
# holding three <Edge> elements whose data live entirely in attributes.
doc <- read_xml('<?xml version="1.0" encoding="UTF-8"?>
  <TrackMate version="3.8.0">
    <Model spatialunits="µm" timeunits="sec">
      <AllTracks>
      <Track name="Track_2" TRACK_ID="2" NUMBER_SPOTS="140" NUMBER_GAPS="0" >
        <Edge SPOT_SOURCE_ID="960769" SPOT_TARGET_ID="960778" LINK_COST="0.08756957830926632" />
          <Edge SPOT_SOURCE_ID="958304" SPOT_TARGET_ID="958308" LINK_COST="1.4003359672950089" />
            <Edge SPOT_SOURCE_ID="958316" SPOT_TARGET_ID="958322" LINK_COST="1.6985623204008202" />
              </Track>
              <Track name="Track_145" TRACK_ID="145" NUMBER_SPOTS="141" NUMBER_GAPS="0" >
                <Edge SPOT_SOURCE_ID="961623" SPOT_TARGET_ID="961628" LINK_COST="2.2678642015413755" />
                  <Edge SPOT_SOURCE_ID="962122" SPOT_TARGET_ID="962127" LINK_COST="38.20777704254654" />
                    <Edge SPOT_SOURCE_ID="961869" SPOT_TARGET_ID="961873" LINK_COST="0.2895609647324684" />
                      </Track>
                      </AllTracks>
                      </Model>
                      </TrackMate>')

code

# Collect every <Edge> element in the document.
edge.nodes <- xml_find_all(doc, ".//Edge")

# One row per edge: the TRACK_ID comes from the edge's enclosing <Track>,
# the remaining columns from the edge's own attributes.
data.frame(
  TRACK_ID = xml_attr(
    xml_find_first(edge.nodes, ".//ancestor::Track"),
    "TRACK_ID"
  ),
  SPOT_SOURCE_ID = xml_attr(edge.nodes, "SPOT_SOURCE_ID"),
  SPOT_TARGET_ID = xml_attr(edge.nodes, "SPOT_TARGET_ID"),
  LINK_COST = xml_attr(edge.nodes, "LINK_COST")
)

output

#   TRACK_ID SPOT_SOURCE_ID SPOT_TARGET_ID           LINK_COST
# 1        2         960769         960778 0.08756957830926632
# 2        2         958304         958308  1.4003359672950089
# 3        2         958316         958322  1.6985623204008202
# 4      145         961623         961628  2.2678642015413755
# 5      145         962122         962127   38.20777704254654
# 6      145         961869         961873  0.2895609647324684