Unlisting Columns by Groups

Unlisting columns by groups

This is a perfect case for tidyr:

library(tidyr)
library(dplyr)
dat %>% unnest(logs)

Unlist data frame column preserving information from other column

Here, the idea is to first get the length of each list element using sapply and then use rep to replicate the col1 with that length

 l1 <- sapply(myDataFrame$col2, length)
unlist.col1 <- rep(myDataFrame$col1, l1)
unlist.col1
#[1] "A" "A" "A" "A" "B" "B" "B" "C" "C" "C" "C" "C" "D" "D"

Or as suggested by @Ananda Mahto, the above could be also done with vapply

   with(myDataFrame, rep(col1, vapply(col2, length, 1L)))
#[1] "A" "A" "A" "A" "B" "B" "B" "C" "C" "C" "C" "C" "D" "D"

How to 'unlist' a column in a data.table

Promoting my comment to an answer. Using:

dt1[,.(colB = unlist(colB)), by = setdiff(names(dt1), 'colB')]

gives:

   colA colC colD colB
1: A1 C1 D1 B1
2: A2 C2 D2 B2a
3: A2 C2 D2 B2b
4: A3 C3 D3 B3

Or as an alternative (a slight variation of @Frank's proposal):

dt1[rep(dt1[,.I], lengths(colB))][, colB := unlist(dt1$colB)][]

Is it possible to unlist() listed data.frame while keeping other columns from data.frame?

The issue is related to df3 having a numeric for V3 while df6 is a character for V3. You can:

  1. Skip importing either df3$V3 or df6$V3
  2. Rename one of those variables

Also, to get rid of the warnings, you could create your data.frames with stringsAsFactors = FALSE or you could use tibble() instead of data.frame() as that's the default behavior of a tibble.

Edit: to better do option 2, you can use the code below to add a prefix to each variable.

my.list2 <- lapply(my.list, function(x) sapply(x, function(y) paste0(class(y), names(y))))
, function(x)
{
x%>%
rename_if(is.numeric, ~paste0('num', .x))%>%
rename_if(is.character, ~paste0('char', .x))%>%
rename_if(is.factor, ~paste0('fact', .x))
}
)

This is option 2 and it works with only the factor warnings:

df1<-data.frame(V1=c(sample(900:970,6)),
V2=c(sample(LETTERS[1:6],6)))

df2<-data.frame(V1=sample(750:780,6),
V2=sample(LETTERS[8:16],6))

df3<-data.frame(V1=sample(200:250,6),
V2=sample(LETTERS[10:20],6),
V4=sample(2300:5821,6)) #used to be V3

df4<-data.frame(V1=sample(396:480,6),
V2=sample(LETTERS,6))

df5<-data.frame(V1=sample(50:100,6),
V2=sample(LETTERS,6))

df6<-data.frame(V1=sample(200:250,6),
V2=sample(LETTERS,6),
V3=sample(letters,6))

my.list <- list(df1,df2,df3,df4,df5,df6)

mydf<-data.frame(
files=c("C:/Folder1/Data/File1.xlsx","C:/Folder1/Data/File2.xlsx",
"C:/Folder1/Data/File3.xlsx","C:/Folder2/Data/File1.xlsx",
"C:/Folder2/Data/File2.xlsx","C:/Folder2/Data/File3.xlsx"))

mydf$data<-my.list

unnest(mydf, data)

files V1 V2 V4 V3
1 C:/Folder1/Data/File1.xlsx 951 A NA <NA>
2 C:/Folder1/Data/File1.xlsx 932 F NA <NA>
3 C:/Folder1/Data/File1.xlsx 908 B NA <NA>
4 C:/Folder1/Data/File1.xlsx 953 C NA <NA>
5 C:/Folder1/Data/File1.xlsx 929 E NA <NA>
6 C:/Folder1/Data/File1.xlsx 928 D NA <NA>
7 C:/Folder1/Data/File2.xlsx 778 K NA <NA>
8 C:/Folder1/Data/File2.xlsx 771 H NA <NA>
9 C:/Folder1/Data/File2.xlsx 757 M NA <NA>
10 C:/Folder1/Data/File2.xlsx 773 P NA <NA>
11 C:/Folder1/Data/File2.xlsx 759 N NA <NA>
12 C:/Folder1/Data/File2.xlsx 765 O NA <NA>
13 C:/Folder1/Data/File3.xlsx 236 M 3964 <NA>
14 C:/Folder1/Data/File3.xlsx 214 O 5241 <NA>
...truncated

Unlist a list within a data frame

Using dplyr

The one column to multi-column splitting could be performed by first grouping by each date/julian,
splitting by comma, unlisting and conversion to data.frame row

DF=data.frame(julian=27548,pot_density=paste(c(28.4698015312665,28.4704581365093,28.4706453687),collapse=","),stringsAsFactors=FALSE)

DF

# julian pot_density
#1 27548 28.4698015312665,28.4704581365093,28.4706453687

newDF=DF %>%
dplyr::group_by(julian) %>%
dplyr::do(.,data.frame(matrix(unlist(strsplit(.$pot_density,",")),nrow=1)) ) %>%
as.data.frame()

newDF
# julian X1 X2 X3
#1 27548 28.4698015312665 28.4704581365093 28.4706453687

Updated dataset:

For your updated dataset, you can use unlist in the dplyr::do function to achieve the desired rectangular data.frame structure

Note that since the length of the 10 vectors is different you will have have NA values in some columns as a result and the number of
columns is maximum of the length of 10 vectors i.e. 85 in this case

sapply(dens$pot_density,length)
# 001 002 003 004 005 006 007 008 009 010
# 82 83 83 84 84 84 83 84 81 85

newDens=dens %>%
dplyr::group_by(julian) %>%
dplyr::do(.,data.frame(matrix(unlist(.$pot_density),nrow=1)) ) %>%
as.data.frame()

head(newDens)

# julian X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
# 1 27548 28.46980 28.47046 28.47065 28.47137 28.47160 28.47132 28.47194 28.47164 28.47194 28.47244 28.47294 28.47309 28.47288 28.47310
# 2 27555 28.38543 28.38806 28.39109 28.40103 28.41289 28.42243 28.43977 28.44522 28.44582 28.44576 28.44620 28.44655 28.44736 28.44999
# 3 27562 28.42227 28.42382 28.42443 28.42479 28.42547 28.42512 28.42507 28.42421 28.42399 28.42443 28.42443 28.42527 28.42468 28.42435
# 4 27569 28.36748 28.36746 28.36878 28.37012 28.37175 28.37264 28.37423 28.37468 28.37515 28.37497 28.37476 28.37457 28.37728 28.37809
# 5 27576 28.45268 28.45358 28.45255 28.45186 28.45280 28.45318 28.45289 28.45554 28.45886 28.46282 28.47461 28.47840 28.48306 28.48473
# 6 27583 28.38106 28.38130 28.38171 28.38312 28.38544 28.38715 28.38908 28.39021 28.39180 28.39539 28.39802 28.40041 28.40451 28.40721
# X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29
# 1 28.47376 28.47400 28.47423 28.47386 28.47432 28.47427 28.47501 28.47566 28.47508 28.47558 28.47528 28.47529 28.47534 28.47544 28.47639
# 2 28.45058 28.45241 28.45280 28.45279 28.45242 28.45243 28.45257 28.45329 28.45320 28.45371 28.45462 28.45493 28.45548 28.45603 28.45587
# 3 28.42487 28.42552 28.42493 28.42470 28.42469 28.42460 28.42477 28.42491 28.42480 28.42508 28.42372 28.42351 28.42387 28.42422 28.42406
# 4 28.37909 28.38681 28.39588 28.39948 28.39925 28.40067 28.40377 28.40711 28.40835 28.40862 28.40971 28.40957 28.40982 28.41094 28.41169
# 5 28.48765 28.48769 28.48837 28.48860 28.48843 28.48838 28.48861 28.48836 28.48877 28.48880 28.48866 28.48823 28.48845 28.48855 28.48994
# 6 28.40871 28.41264 28.41362 28.41674 28.42094 28.42703 28.43125 28.43744 28.44327 28.44861 28.45126 28.45242 28.45129 28.45123 28.45237
# X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44
# 1 28.47697 28.47747 28.47791 28.47790 28.47803 28.47818 28.47809 28.47788 28.47792 28.47816 28.47811 28.47780 28.47793 28.47787 28.47827
# 2 28.45652 28.45807 28.45933 28.45977 28.46019 28.46051 28.46100 28.46136 28.46127 28.46158 28.46191 28.46233 28.46503 28.46818 28.46907
# 3 28.42378 28.42353 28.42367 28.42346 28.42334 28.42329 28.42322 28.42341 28.42340 28.42404 28.42429 28.42447 28.42436 28.42467 28.42444
# 4 28.41317 28.41636 28.42153 28.42233 28.42240 28.42396 28.42525 28.42537 28.42547 28.42604 28.42650 28.42772 28.42909 28.43003 28.43136
# 5 28.48980 28.49084 28.49092 28.49133 28.49152 28.49214 28.49238 28.49239 28.49209 28.49234 28.49233 28.49241 28.49260 28.49329 28.49381
# 6 28.45339 28.45486 28.45551 28.45618 28.45713 28.45885 28.46112 28.46298 28.46378 28.46390 28.46541 28.46610 28.46873 28.47033 28.47103
# X45 X46 X47 X48 X49 X50 X51 X52 X53 X54 X55 X56 X57 X58 X59
# 1 28.47823 28.47792 28.47776 28.47791 28.47761 28.47762 28.47767 28.47726 28.47711 28.47687 28.47735 28.47686 28.47717 28.47754 28.47722
# 2 28.46970 28.46996 28.47060 28.47118 28.47135 28.47296 28.47468 28.47556 28.47722 28.47915 28.47984 28.48097 28.48108 28.48111 28.48129
# 3 28.42407 28.42406 28.42438 28.42469 28.42493 28.42558 28.42621 28.42715 28.42722 28.42752 28.42765 28.42871 28.43017 28.43179 28.43443
# 4 28.43191 28.43311 28.43480 28.43596 28.43673 28.43921 28.44073 28.44223 28.44265 28.44352 28.44448 28.44501 28.44557 28.44542 28.44716
# 5 28.49392 28.49363 28.49379 28.49324 28.49351 28.49468 28.49558 28.49573 28.49635 28.49718 28.49854 28.49863 28.49868 28.50034 28.50022
# 6 28.47487 28.47718 28.48008 28.48167 28.48309 28.48441 28.48660 28.48846 28.49091 28.49313 28.49472 28.49529 28.49647 28.49742 28.49801
# X60 X61 X62 X63 X64 X65 X66 X67 X68 X69 X70 X71 X72 X73 X74
# 1 28.47667 28.47667 28.47652 28.47673 28.47643 28.47634 28.47668 28.47667 28.47647 28.47646 28.47658 28.47668 28.47670 28.47662 28.47669
# 2 28.48107 28.48153 28.48151 28.48126 28.48123 28.48145 28.48140 28.48129 28.48126 28.48143 28.48146 28.48141 28.48154 28.48136 28.48149
# 3 28.43473 28.43517 28.43597 28.43618 28.43668 28.43699 28.43688 28.43797 28.43824 28.43870 28.43863 28.43913 28.44231 28.44498 28.44527
# 4 28.44915 28.45137 28.45238 28.45239 28.45248 28.45288 28.45327 28.45397 28.45462 28.45597 28.45791 28.45879 28.46073 28.46372 28.46587
# 5 28.50005 28.49965 28.50040 28.50208 28.50291 28.50319 28.50352 28.50502 28.50531 28.50528 28.50728 28.50742 28.51176 28.51329 28.51406
# 6 28.49889 28.49929 28.50054 28.50253 28.50502 28.50801 28.51164 28.51209 28.51328 28.51400 28.51412 28.51475 28.51701 28.51829 28.51830
# X75 X76 X77 X78 X79 X80 X81 X82 X83 X84 X85
# 1 28.47676 28.47684 28.47698 28.47675 28.47646 28.47656 28.47680 28.47696 NA NA NA
# 2 28.48138 28.48152 28.48168 28.48117 28.48099 28.48080 28.48079 28.48080 28.48076 NA NA
# 3 28.44555 28.44556 28.44747 28.44876 28.44821 28.44815 28.45076 28.45319 28.45599 NA NA
# 4 28.46681 28.46803 28.46890 28.46963 28.47142 28.47305 28.47342 28.47462 28.47682 28.47908 NA
# 5 28.51426 28.51492 28.51440 28.51432 28.51416 28.51403 28.51449 28.51467 28.51461 28.51462 NA
# 6 28.51946 28.52087 28.52085 28.52243 28.52380 28.52534 28.52637 28.52713 28.52725 28.52821 NA

Error when unlisting columns in a data frame

You can apply unlist to each individual element of the column numbers instead of the whole column:

DF$numbers <- lapply(DF$numbers, unlist)

DF
# letters numbers value
#1 A 1, 2 0.440
#2 B 1 0.540
#3 C 1 0.210
#4 D 2 0.102

DF$numbers[1]
#[[1]]
#[1] 1 2

Or paste the elements as a single string if you want an atomic vector column:

DF$numbers <- sapply(DF$numbers, toString)
DF
# letters numbers value
#1 A 1, 2 0.44
#2 B 1 0.54
#3 C 1 0.21
#4 D 2 0.102

DF$numbers[1]
#[1] "1, 2"

class(DF$numbers)
# [1] "character"

Unlist/unnest list column into several columns

It is easier to do this rowSums i.e. divide the 'product1' by the rowSums on the columns that starts with key word 'product'. Instead of doing rowwise with c_across, this is vectorized and should be fast as well

library(dplyr)
dat %>%
mutate(sum_product = product1/rowSums(select(., starts_with('product'))))

NOTE: There is a mixing of base R code (apply) and the tidyverse option with across which doesn't seem to be the optimal way


If we need to do this for all the 'product' columns, create a sum column first with mutate and then use across on the columns that starts with 'product' to divide the column by 'Sum_col'

dat %>%
mutate(Sum_col = rowSums(select(., starts_with('product'))),
across(starts_with('product'),
~ ./Sum_col, .names = '{.col}_sum_product')) %>%
select(-Sum_col)

-output

#ysRespNum  product1 product2 product3 product1_sum_product product2_sum_product product3_sum_product
#1 1 23.766555 13.46907 24.32327 0.3860783 0.2187998 0.3951219
#2 2 30.071773 15.98740 11.39922 0.5233660 0.2782431 0.1983909
#3 3 18.224328 11.03880 20.67063 0.3649701 0.2210688 0.4139610
#4 4 30.140839 19.78984 19.62087 0.4333597 0.2845348 0.2821054
#5 5 8.915628 30.75021 24.29150 0.1393996 0.4807925 0.3798079
#6 6 23.791981 11.14885 21.72450 0.4198684 0.1967490 0.3833826

Or using base R

nm1 <- startsWith(names(dat), 'product')
dat[paste0('sum_product', seq_along(nm1))] <- dat[nm1]/rowSums(dat[nm1])

unlist a column with a list of dates in dplyr

Coerce the result of seq.POSIXt() into a data frame and make a list of that...

x <- seq(from= as.POSIXct('2011-01-01 14:00:00'),length.out=100,by = "hour")
y <- seq(from= as.POSIXct('2012-01-01 14:00:00'),length.out=100,by = "hour")
df <- data.frame(x,y)

library(dplyr)
library(tidyr)

df %>%
rowwise() %>%
mutate(sequence = list(data.frame(seq.POSIXt(x, y, "10 min")))) %>%
unnest(sequence)

# # A tibble: 5,256,100 x 3
# x y seq.POSIXt.x..y...10.min..
# <dttm> <dttm> <dttm>
# 1 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 14:00:00
# 2 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 14:10:00
# 3 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 14:20:00
# 4 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 14:30:00
# 5 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 14:40:00
# 6 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 14:50:00
# 7 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 15:00:00
# 8 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 15:10:00
# 9 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 15:20:00
# 10 2011-01-01 14:00:00 2012-01-01 14:00:00 2011-01-01 15:30:00
# # ... with 5,256,090 more rows


Related Topics



Leave a reply



Submit