Combining Multiple Identically-Named Columns in R

Combining Multiple Identically-Named Columns in R

As suggested by @VincentZoonekynd it is not a good idea to have multiple columns with the same name.

Anyway, you could do in this way:

df <- data.frame(A=c(1,4,8),B=c(2,3,7),C=c(8,2,1),D=c(2,9,3),E=c(3,9,8))
names(df) <- c('123','256', '192', '123', '256')

df <- t(df) # transpose the data.frame
aggr <- by(df, INDICES=row.names(df), FUN=colSums) # collapse the rows with the same name
aggr <- as.data.frame(do.call(cbind,aggr)) # convert by() result to a data.frame

or, in one line:

aggr <- as.data.frame(do.call(cbind, by(t(df),INDICES=names(df),FUN=colSums)))

merge/combine columns with same name but incomplete data

Here's an approach that involves melting your data, merging the molten data, and using dcast to get it back to a wide form. I've added comments to help understand what is going on.

## Required packages
library(data.table)
library(reshape2)

dcast.data.table(
merge(
## melt the first data.frame and set the key as ID and variable
setkey(melt(as.data.table(df1), id.vars = "ID"), ID, variable),
## melt the second data.frame
melt(as.data.table(df2), id.vars = "ID"),
## you'll have 2 value columns...
all = TRUE)[, value := ifelse(
## ... combine them into 1 with ifelse
is.na(value.x), value.y, value.x)],
## This is your reshaping formula
ID ~ variable, value.var = "value")
# ID hello world football baseball hockey soccer
# 1: 1 2 3 43 6 7 4
# 2: 2 5 1 24 32 2 5
# 3: 3 10 8 2 23 8 23
# 4: 4 4 17 5 15 5 12
# 5: 5 9 7 12 23 3 43

Combining columns based on partial column name

Suppose your dataframe DF looks like this

  OTS_SM0_1 OTS_SM0_2 OTS_SM0_3 OTS_SM0_4 OTS_SM0_5 OTS_SM0_6 OTS_SM0_7 OTS_SM0_8 OTS_SM0_9 OTS_SM1_x1_4 OTS_SM1_x1_6 OTS_SM1_x1_7 OTS_SM1_x1_8 OTS_SM1_x2_4 OTS_SM1_x2_6 OTS_SM1_x2_7 OTS_SM1_x2_8 OTS_SM1_x3_4 OTS_SM1_x3_6 OTS_SM1_x3_7 OTS_SM1_x3_8 OTS_SM1_x4_4 OTS_SM1_x4_6 OTS_SM1_x4_7 OTS_SM1_x4_8 OTS_SM1_x5_4 OTS_SM1_x5_6 OTS_SM1_x5_7 OTS_SM1_x5_8 OTS_SM1_x6_4 OTS_SM1_x6_6 OTS_SM1_x6_7 OTS_SM1_x6_8 OTS_SM1_x7_4 OTS_SM1_x7_6 OTS_SM1_x7_7 OTS_SM1_x7_8 OTS_SM1_x8_4 OTS_SM1_x8_6 OTS_SM1_x8_7 OTS_SM1_x8_8 OTS_SM1_x9_4 OTS_SM1_x9_6 OTS_SM1_x9_7 OTS_SM1_x9_8 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
2 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108

Here is a dplyr approach. We use id to preserve row relationships. After the first pivot, we remove the characters followed by the last "_" only if they are digits. Last, we summarize the rows into a single cell for each group of id and variable (name in this case) and convert the dataframe from long to wide.

library(dplyr)
library(tidyr)

DF %>%
mutate(id = row_number()) %>%
pivot_longer(-id) %>%
group_by(id, name = sub("(_\\d+)?$", "", name)) %>%
summarize(value = paste0(value, collapse = ";"), .groups = "drop") %>%
pivot_wider() %>%
select(-id)

Output (I used a different print method to show you all the columns. The default print method may render something different on your screen, but the underlying object is the same.)

                 OTS_SM0  OTS_SM1_x1  OTS_SM1_x2  OTS_SM1_x3  OTS_SM1_x4  OTS_SM1_x5  OTS_SM1_x6  OTS_SM1_x7  OTS_SM1_x8  OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1;2;3;4;5;6;7;8;9 10;11;12;13 14;15;16;17 18;19;20;21 22;23;24;25 26;27;28;29 30;31;32;33 34;35;36;37 38;39;40;41 42;43;44;45 46 47 48 49 50 51 52 53 54
2 2;4;6;8;10;12;14;16;18 20;22;24;26 28;30;32;34 36;38;40;42 44;46;48;50 52;54;56;58 60;62;64;66 68;70;72;74 76;78;80;82 84;86;88;90 92 94 96 98 100 102 104 106 108

However, if you want to do some analysis of the variables, having values nested in a single cell is usually not helpful. Perhaps you would also like to consider a slightly different representation of the data.

library(dplyr)
library(tidyr)

DF %>%
rename_with(~sub("(_\\d+)?$", "`\\1", .)) %>%
mutate(id = row_number()) %>%
pivot_longer(-id, names_to = c(".value", NA), names_pattern = "(.+)`(_\\d+)?")

, which gives

   id OTS_SM0 OTS_SM1_x1 OTS_SM1_x2 OTS_SM1_x3 OTS_SM1_x4 OTS_SM1_x5 OTS_SM1_x6 OTS_SM1_x7 OTS_SM1_x8 OTS_SM1_x9 OTS_SM2_x1 OTS_SM2_x2 OTS_SM2_x3 OTS_SM2_x4 OTS_SM2_x5 OTS_SM2_x6 OTS_SM2_x7 OTS_SM2_x8 OTS_SM2_x9
1 1 1 10 14 18 22 26 30 34 38 42 46 47 48 49 50 51 52 53 54
2 1 2 11 15 19 23 27 31 35 39 43 NA NA NA NA NA NA NA NA NA
3 1 3 12 16 20 24 28 32 36 40 44 NA NA NA NA NA NA NA NA NA
4 1 4 13 17 21 25 29 33 37 41 45 NA NA NA NA NA NA NA NA NA
5 1 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 1 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
7 1 7 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
8 1 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
9 1 9 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
10 2 2 20 28 36 44 52 60 68 76 84 92 94 96 98 100 102 104 106 108
11 2 4 22 30 38 46 54 62 70 78 86 NA NA NA NA NA NA NA NA NA
12 2 6 24 32 40 48 56 64 72 80 88 NA NA NA NA NA NA NA NA NA
13 2 8 26 34 42 50 58 66 74 82 90 NA NA NA NA NA NA NA NA NA
14 2 10 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
15 2 12 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
16 2 14 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
17 2 16 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
18 2 18 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

Data

structure(list(OTS_SM0_1 = c(1, 2), OTS_SM0_2 = c(2, 4), OTS_SM0_3 = c(3, 
6), OTS_SM0_4 = c(4, 8), OTS_SM0_5 = c(5, 10), OTS_SM0_6 = c(6,
12), OTS_SM0_7 = c(7, 14), OTS_SM0_8 = c(8, 16), OTS_SM0_9 = c(9,
18), OTS_SM1_x1_4 = c(10, 20), OTS_SM1_x1_6 = c(11, 22), OTS_SM1_x1_7 = c(12,
24), OTS_SM1_x1_8 = c(13, 26), OTS_SM1_x2_4 = c(14, 28), OTS_SM1_x2_6 = c(15,
30), OTS_SM1_x2_7 = c(16, 32), OTS_SM1_x2_8 = c(17, 34), OTS_SM1_x3_4 = c(18,
36), OTS_SM1_x3_6 = c(19, 38), OTS_SM1_x3_7 = c(20, 40), OTS_SM1_x3_8 = c(21,
42), OTS_SM1_x4_4 = c(22, 44), OTS_SM1_x4_6 = c(23, 46), OTS_SM1_x4_7 = c(24,
48), OTS_SM1_x4_8 = c(25, 50), OTS_SM1_x5_4 = c(26, 52), OTS_SM1_x5_6 = c(27,
54), OTS_SM1_x5_7 = c(28, 56), OTS_SM1_x5_8 = c(29, 58), OTS_SM1_x6_4 = c(30,
60), OTS_SM1_x6_6 = c(31, 62), OTS_SM1_x6_7 = c(32, 64), OTS_SM1_x6_8 = c(33,
66), OTS_SM1_x7_4 = c(34, 68), OTS_SM1_x7_6 = c(35, 70), OTS_SM1_x7_7 = c(36,
72), OTS_SM1_x7_8 = c(37, 74), OTS_SM1_x8_4 = c(38, 76), OTS_SM1_x8_6 = c(39,
78), OTS_SM1_x8_7 = c(40, 80), OTS_SM1_x8_8 = c(41, 82), OTS_SM1_x9_4 = c(42,
84), OTS_SM1_x9_6 = c(43, 86), OTS_SM1_x9_7 = c(44, 88), OTS_SM1_x9_8 = c(45,
90), OTS_SM2_x1 = c(46, 92), OTS_SM2_x2 = c(47, 94), OTS_SM2_x3 = c(48,
96), OTS_SM2_x4 = c(49, 98), OTS_SM2_x5 = c(50, 100), OTS_SM2_x6 = c(51,
102), OTS_SM2_x7 = c(52, 104), OTS_SM2_x8 = c(53, 106), OTS_SM2_x9 = c(54,
108)), row.names = c(NA, -2L), class = "data.frame")

R merge columns in the same file with identical column names and remove duplicates

rm(list = ls())
cat("\014")
dev.off()

dd <- read.table("~/Downloads/test.txt" , header = F, sep = "\t")

names(dd) <- as.matrix(dd[1, ])
dd <- dd[-1, ]

oo <- lapply(unique(colnames(dd)) , function(colName){
i <- colName
colNu <- which(colnames(dd) == i)
dim(dd[,colNu])
unique(as.character(unlist(dd[,colNu])))

})

names(oo) <- unique(colnames(dd))


Related Topics



Leave a reply



Submit