How to Split Data Frame by Column Names in R

split data frame with recurring column names

One way is to locate the rows that repeat the name of the first column and split the data frame at those rows. We can use group_split to split the data into a list of data frames, map_at to remove the first row from every piece except the first (that row is the repeated column header), and type.convert to convert the columns to their appropriate classes.

library(dplyr)
library(purrr)

# Rows whose A value equals the first column name ("A") are repeated header
# rows; the running count of those rows gives each section its own id.
temp <- df %>%
  group_split(id = cumsum(A == names(.)[1]) + 1) %>%
  # Every piece except the first begins with a header row: drop it.
  map_at(-1, tail, -1) %>%
  # as.is is given explicitly because R >= 4.0 warns when it is omitted.
  map(type.convert, as.is = TRUE)
temp

#[[1]]
# A tibble: 3 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.668 0.411 0.553 0.477 1
#2 0.794 0.821 0.530 0.732 1
#3 0.108 0.647 0.789 0.693 1

#[[2]]
# A tibble: 6 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.724 0.783 0.023 0.478 2
#2 0.861 0.099 0.407 0.332 2
#3 0.438 0.316 0.913 0.651 2
#4 0.245 0.519 0.294 0.258 2
#5 0.07 0.662 0.459 0.479 2
#6 0.766 0.839 0.892 0.961 2

#[[3]]
# A tibble: 4 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.084 0.347 0.864 0.435 3
#2 0.875 0.334 0.39 0.713 3
#3 0.339 0.476 0.777 0.4 3
#4 0.084 0.347 0.864 0.435 3

Using the same logic in base R, we can do

# Section id: running count of the repeated header rows (A equal to "A").
df$id <- cumsum(df$A == names(df)[1]) + 1
temp <- split(df, df$id)
# Every piece except the first begins with a header row: drop it.
temp[-1] <- lapply(temp[-1], tail, -1)
# as.is is given explicitly because R >= 4.0 warns when it is omitted.
temp <- lapply(temp, type.convert, as.is = TRUE)

If you need them as separate data frames in the global environment, run:

# Label the pieces df1, df2, ... and copy each one into the global environment.
temp <- setNames(temp, paste0("df", seq_along(temp)))
list2env(temp, envir = .GlobalEnv)

data

# Example input: 15 rows and four factor columns (A, B, C, D). Rows 4 and 11
# hold the column names themselves ("A", "B", "C", "D"), i.e. the header is
# repeated inside the data, which is why every column is a factor and not
# numeric.
df <- structure(list(A = structure(c(7L, 10L, 3L, 13L, 8L, 11L, 6L, 
4L, 1L, 9L, 13L, 2L, 12L, 5L, 2L), .Label = c("0.070", "0.084",
"0.108", "0.245", "0.339", "0.438", "0.668", "0.724", "0.766",
"0.794", "0.861", "0.875", "A"), class = "factor"), B = structure(c(5L,
11L, 8L, 13L, 10L, 1L, 2L, 7L, 9L, 12L, 13L, 4L, 3L, 6L, 4L), .Label = c("0.099",
"0.316", "0.334", "0.347", "0.411", "0.476", "0.519", "0.647",
"0.662", "0.783", "0.821", "0.839", "B"), class = "factor"),
C = structure(c(7L, 6L, 9L, 13L, 1L, 4L, 12L, 2L, 5L, 11L,
13L, 10L, 3L, 8L, 10L), .Label = c("0.023", "0.294", "0.390",
"0.407", "0.459", "0.530", "0.553", "0.777", "0.789", "0.864",
"0.892", "0.913", "C"), class = "factor"), D = structure(c(5L,
11L, 9L, 13L, 6L, 2L, 8L, 1L, 7L, 12L, 13L, 4L, 10L, 3L,
4L), .Label = c("0.258", "0.332", "0.400", "0.435", "0.477",
"0.478", "0.479", "0.651", "0.693", "0.713", "0.732", "0.961",
"D"), class = "factor")), class = "data.frame", row.names = c(NA, -15L))

Split Data Frame Into N Data Frames Based On Column Names

We can use split.default, grouping the columns by the part of each column name that precedes the first "-":

# Group the columns by whatever precedes the first "-" in their names.
split.default(df1, f = sub("-.*", "", names(df1)))

Or if we know there would be only one character before -

# Group the columns by the first character of their names.
split.default(df1, f = substr(names(df1), start = 1, stop = 1))
#$A
# A-DIODE A-DIODE.1
#1 1.2 1.3
#2 0.4 0.6

#$B
# B-DIODE B-ACC1 B-ACC2 B-ANA0 B-ANA1 B-BRICKID B-CC0 B-CC1 B-DIGDN B-DIGDP B-DN1 B-DN2 B-DP1 B-DP2 B-SCL B-SDA B-USB0DN B-USB0DP
#1 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3 3.1
#2 0.8 1.0 1.2 1.4 1.6 1.8 2 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4 4.2
# B-USB1DN B-USB1DP B-ACC1.1 B-ACC2.1 B-ANA0.1 B-ANA1.1 B-BRICKID.1 B-CC0.1 B-CC1.1 B-DIGDN.1 B-DIGDP.1 B-DN1.1 B-DN2.1 B-DP1.1 B-DP2.1
#1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1 4.2 4.3 4.4 4.5 4.6
#2 4.4 4.6 4.8 5.0 5.2 5.4 5.6 5.8 6 6.2 6.4 6.6 6.8 7.0 7.2
# B-SCL.1 B-SDA.1 B-USB0DN.1 B-USB0DP.1 B-USB1DN.1 B-USB1DP.1 B-NA B-ACC2PWRLKG_0v4 B-ACC2PWRLKG_0v4.1 B-P_IN_Leak
#1 4.7 4.8 4.9 5 5.1 5.2 5.3 5.4 5.5 5.6
#2 7.4 7.6 7.8 8 8.2 8.4 8.6 8.8 9.0 9.2

Splitting a dataframe into multiple dataframes based on the column name in R

You could use split.default

# A new group starts right after each REG* column: the running count of REG
# columns, shifted one position to the right (hence the leading 0 and the
# dropped last name), is the group id of every column.
split.default(df, f = c(0, cumsum(grepl("^REG", head(names(df), -1)))))

$`0`
a b c REG01
x 2844.8 1430.9 906.2 1871.0
y 10232.5 29263.6 6019.1 69618.7
z 20150.6 26334.5 6848.6 45032.2

$`1`
d e REG02
x 2106.0 1818.8 1364.5
y 29929.6 232371.1 57561.7
z 58626.1 42713.6 20656.4

$`2`
f REG03
x 520.4 1821.4
y 46754.9 43862.3
z 9036.9 51876.1

R - split dataframe into list by column while retaining a second column, then rename list elements by col name

Get the dataframe in long format and use split

library(tidyverse)

# Stack the score* columns into name/value pairs, cut the long table into one
# piece per original column, then drop the now-constant name column.
df %>%
  pivot_longer(cols = starts_with("score")) %>%
  split(.$name) %>%
  map(function(piece) select(piece, -name))

#$score1
# A tibble: 6 x 2
# state value
# <chr> <dbl>
#1 a 1.58
#2 a 0.567
#3 b -0.313
#4 b 0.756
#5 c 0.236
#6 c 1.05

#$score2
# A tibble: 6 x 2
# state value
# <chr> <dbl>
#1 a 9.93
#2 a 9.96
#3 b 12.2
#4 b 9.41
#5 c 9.40
#6 c 9.97

You can also use group_split and avoid the map step but it doesn't give the list names (score1, score2) in the output.

# Same reshaping, but group_split() does the splitting and, with
# .keep = FALSE, leaves the grouping column out of each piece.
group_split(
  pivot_longer(df, cols = starts_with("score")),
  name,
  .keep = FALSE
)

Match column names with another dataframe and split into separate dataframes

Try the following code:

library(dplyr)
library(tidyr)
# Reshape to long format and look up the site of every sample name
data <- df %>%
  pivot_longer(everything()) %>%
  left_join(keys, by = c("name" = "sample_name"))
# One data frame per site, without the site_name column used for splitting
List <- split(data, data$site_name)
List <- lapply(List, function(site_df) {
  site_df$site_name <- NULL
  site_df
})
# Copy every element of the list into the global environment
list2env(List, envir = .GlobalEnv)

Outputs:

List
$chic_1
# A tibble: 2 x 2
name value
<chr> <dbl>
1 chic56.345 0.6
2 chic56.345 1.2

$tex_1
# A tibble: 2 x 2
name value
<chr> <dbl>
1 tex21.222 0.5
2 tex21.222 0.8

$wa_1
# A tibble: 2 x 2
name value
<chr> <dbl>
1 wa34.907 1.12
2 wa34.907 0.9

How to split a list of data frames based on its column names?

Create groups that you get by matching the names with match(), then use split().

# For each data frame, count how many of its column names occur in `names`.
# NOTE(review): `names` is assumed to be a character vector of reference column
# names defined earlier; confirm against the question.
# vapply() pins the result to one integer per data frame, unlike sapply().
f <- vapply(
  mylist,
  function(x) sum(!is.na(match(names(x), names))),
  integer(1)
)
# split() orders the groups by ascending count, so the labels below assume the
# smaller count belongs to "listB" and the larger one to "listA".
listNew <- setNames(split(mylist, f), c("listB", "listA"))

Yielding

> str(listNew)
List of 2
$ listB:List of 1
..$ :'data.frame': 15 obs. of 3 variables:
.. ..$ v1 : int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ v2 : int [1:15] 20 21 22 23 24 25 26 27 28 29 ...
.. ..$ v3b: Factor w/ 15 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...
$ listA:List of 2
..$ :'data.frame': 15 obs. of 3 variables:
.. ..$ v1: int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ v2: int [1:15] 20 21 22 23 24 25 26 27 28 29 ...
.. ..$ v3: Factor w/ 15 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ :'data.frame': 15 obs. of 3 variables:
.. ..$ v1: int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ v2: int [1:15] 20 21 22 23 24 25 26 27 28 29 ...
.. ..$ v3: Factor w/ 15 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...

R split column names with different occurrences of delimiter into strings and assign unique strings/string counts to a new dataframe

I think if you split at the "underscore, digit, underscore" it provides a solution to your statement above. This does eliminate the digit and the associated information. Does this matter?

names <- c("strainA_1_batch1", "strainA_2_batch2", "strainB_1_batch1",
           "strainC_1_batch2", "strainC_2_batch2",
           "strainD_a_1_batch1", "strainD_b_1_batch1")

# Split at "underscore, digits, underscore"; \\d+ (rather than a single \\d)
# also handles replicate numbers with more than one digit, e.g. "_10_".
splitList <- strsplit(names, "_\\d+_")

# Every name must yield exactly one strain part and one batch part.
stopifnot(lengths(splitList) == 2L)

# Stack the (strain, batch) pairs row-wise and convert to a data frame.
df <- data.frame(do.call(rbind, splitList))
names(df) <- c("Strain", "Batch")
df

# Report how often each strain and each batch occurs.
table(df$Strain)
table(df$Batch)

Another option is to replace the underscore on either side of the digit with a " " (or other character) and then split on the space.

# Turn "_<digits>_" into " <digits> " so the names can then be split on spaces;
# \\d+ also covers replicate numbers with more than one digit.
names <- gsub("_(\\d+)_", " \\1 ", names)


Related Topics



Leave a reply



Submit