R Dataframe with Varied Column Lengths

R dataframe with varied column lengths

I'll give some little pointers here. See Tyler's answer a few questions back for a couple links to materials for getting started:
convert data.frame column format from character to factor

1) The objects you're making with c() are called vectors, and that is a particular kind of object in R- the most basic and useful kind.

2) A data.frame is a kind of list where all the elements of the list are stuck together as columns and must be of the same length. The columns can be of different data types (classes).

3) lists are the most versatile kind of object in R- the elements of a list can be anything- any size, any class. This appears to be what you're asking for.

So for instance:

    mylist <- list(vec1 = c(1:10), vec2 = c(1:5))
    mylist
    $vec1
    [1] 1 2 3 4 5 6 7 8 9 10
    $vec2
    [1] 1 2 3 4 5

There are different ways to get back at the elements of mylist, e.g.

    mylist$vec1
    mylist[1]
    mylist[[1]]
    mylist["vec1"]
    mylist[["vec1"]]

and likely more! Find a tutorial by searching for 'R beginner tutorial' and power through it. Have fun!

Joining different length data frames with different columns in R

Seems like you're just looking for a simple left_join. This can be done via dplyr with

left_join(df2, df1)

which keeps every row of df2 and attaches the columns of df1 wherever the two match in the timestamp column; rows of df2 without a match get NA in the added columns. (This drops all of the extra observations in df1.)

A base R implementation is:

merge(x = df2, y = df1, by = "timestamp", all.x = TRUE)

How to add a column of different length to a data frame in R?

This is because you have duplicate values in df2. You could do:

library(tidyverse)
df1 <- tibble::tribble(
~ID, ~Borrower,
1 , "A",
2 , "A",
3 , "A"
)

df2 <- tibble::tribble(
~ID, ~Borrower, ~Category,
1L, "A", "X",
1L, "A", "X",
1L, "A", "X",
2L, "A", "X",
2L, "A", "X",
2L, "A", "X",
3L, "A", "X",
3L, "A", "X",
3L, "A", "X"
)

df1 %>%
left_join(distinct(df2))

Joining, by = c("ID", "Borrower")
# A tibble: 3 x 3
ID Borrower Category
<dbl> <chr> <chr>
1 1 A X
2 2 A X
3 3 A X

rbind a dataframe column containing list of different lengths

Update after clarification:
Another option is to use stri_list2matrix from stringi, which is very fast.

library(stringi)

op <- as.data.frame(stri_list2matrix(c(IP$V1), byrow = TRUE))
op$.id <- seq_along(IP$V1)

Another solution is to use lapply together with setDT and transpose from data.table (so it is not purely base R), which is also quite fast (though with a lot of variability, as seen in the benchmark image).

op3 <-
as.data.frame(transpose(setDT(lapply(
c(IP$V1), "length<-", max(lengths(c(IP$V1)))
))))
op3$.id <- seq_along(IP$V1)

Another base R solution is to use sapply, which is also fairly fast (though a little slower than lapply).

op2 <- as.data.frame(t(sapply(c(IP$V1), "length<-", max(lengths(c(IP$V1))))))
op2$.id <- seq_along(IP$V1)

Output

# A tibble: 3 × 380
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 M M M M M M M M M M M M M M M M M M M
2 D D D D D D D D D D D D D D D D D D D
3 D D D D D D D D D D D D D D D D D D D
# … with 361 more variables: V20 <chr>, V21 <chr>, V22 <chr>, V23 <chr>, V24 <chr>, V25 <chr>, V26 <chr>, V27 <chr>,
# V28 <chr>, V29 <chr>, V30 <chr>, V31 <chr>, V32 <chr>, V33 <chr>, V34 <chr>, V35 <chr>, V36 <chr>, V37 <chr>,
# V38 <chr>, V39 <chr>, V40 <chr>, V41 <chr>, V42 <chr>, V43 <chr>, V44 <chr>, V45 <chr>, V46 <chr>, V47 <chr>,
# V48 <chr>, V49 <chr>, V50 <chr>, V51 <chr>, V52 <chr>, V53 <chr>, V54 <chr>, V55 <chr>, V56 <chr>, V57 <chr>,
# V58 <chr>, V59 <chr>, V60 <chr>, V61 <chr>, V62 <chr>, V63 <chr>, V64 <chr>, V65 <chr>, V66 <chr>, V67 <chr>,
# V68 <chr>, V69 <chr>, V70 <chr>, V71 <chr>, V72 <chr>, V73 <chr>, V74 <chr>, V75 <chr>, V76 <chr>, V77 <chr>,
# V78 <chr>, V79 <chr>, V80 <chr>, V81 <chr>, V82 <chr>, V83 <chr>, V84 <chr>, V85 <chr>, V86 <chr>, V87 <chr>, …

Benchmark

library (tidyverse)

bm <- microbenchmark::microbenchmark(
r2evans = {IP$V1 <- lapply(IP$V1, `length<-`, max(lengths(IP$V1)));
out2 <- data.frame(do.call(rbind, IP$V1));
out2$.id <- seq_along(IP$V1)},
RduU = {plyr::ldply(IP$V1, rbind)},
tidyr = {IP %>%
unnest_wider(V1, names_sep = "_")},
stringi = {op <- as.data.frame(stri_list2matrix(c(IP$V1), byrow=TRUE)); op$.id <- seq_along(IP$V1)},
sapply = {op2 <- as.data.frame(t(sapply(c(IP$V1), "length<-", max(lengths(c(IP$V1)))))); op2$.id <- seq_along(IP$V1)},
lapply = {op3 <- as.data.frame(transpose(setDT(lapply(c(IP$V1), "length<-", max(lengths(c(IP$V1)))))));
op3$.id <- seq_along(IP$V1)},
times = 100
)

microbenchmark:::autoplot(bm)

Sample Image

Unit: microseconds
expr min lq mean median uq max neval
r2evans 1503.602 1640.0915 1799.95612 1747.6035 1872.3480 3092.314 100
RduU 1764.108 2003.0560 2150.63791 2086.5735 2232.9945 4152.803 100
tidyr 15108.671 15938.5185 17209.04116 16487.6840 17480.8740 33108.209 100
stringi 747.871 819.4205 875.45533 853.2315 913.2410 1569.510 100
sapply 1056.223 1173.0940 1294.82064 1255.7130 1337.3275 2450.791 100
lapply 939.044 1078.7225 1335.96819 1139.3605 1236.4150 13476.396 100

First Answer: You can use data.table, as it will be faster than plyr or tidyr.

library(data.table)

setDT(IP)[, list(V1 = as.character(unlist(V1)))] %>%
as.data.frame()

Benchmark

library (dplyr)

microbenchmark::microbenchmark(
data.table = setDT(IP)[, list(V1 = as.character(unlist(V1)))] %>%
as.data.frame(),
tidyr = tidyr::unnest(IP, cols = c(V1)),
plyr = plyr::ldply(IP$V1, rbind)
)

Unit: microseconds
expr min lq mean median uq max neval
data.table 588.723 679.6965 768.05463 745.360 808.5615 1465.043 100
tidyr 2631.968 2833.8095 3269.19794 3054.737 3393.4345 12726.122 100
plyr 1173.735 1290.8645 1379.57338 1335.448 1412.0445 2027.333 100

Binding dataframes with different length to an Excel file in separate columns

You can use rowr::cbind.fill:

### Creating a sample dataset ###
df1 <- data.frame(col = c(1, 2, 3))
df2 <- data.frame(col = c(5, 6))
df3 <- data.frame(col = c(4, 4, 9, 10))
df <- list(df1, df2, df3)

### Loading required library ###
library(rowr) ## Not available for R 4.0.2

### binding the columns for the list of dataframes ###
#### using do.call to apply cbind.fill on a list of dataframes
df.e <- do.call(cbind.fill, c(df, fill=NA))

### writing to csv or excel file ###
#### setting NA-string to "" to have empty cells
#### setting writing row.names to false
write.csv(df.e, "D:\\test.csv", na = "", row.names = FALSE)

Or as @akrun suggested, in base we can do something like this:

mx <- max(sapply(df, nrow)) 

do.call(cbind, lapply(df, function(x) {rbind(x, x[seq_len(mx) > nrow(x),, drop = FALSE])}))

Sample Image

Create a Data Frame of Unequal Lengths

Sorry this isn't exactly what you asked, but I think there may be another way to get what you want.

First, if the vectors are different lengths, the data isn't really tabular, is it? How about just saving it to different CSV files? You might also try text formats that allow storing multiple objects (JSON, XML).

If you feel the data really is tabular, you could pad on NAs:

> x = 1:5
> y = 1:12
> max.len = max(length(x), length(y))
> x = c(x, rep(NA, max.len - length(x)))
> y = c(y, rep(NA, max.len - length(y)))
> x
[1] 1 2 3 4 5 NA NA NA NA NA NA NA
> y
[1] 1 2 3 4 5 6 7 8 9 10 11 12

If you absolutely must make a data.frame with unequal columns you could subvert the check, at your own peril:

> x = 1:5
> y = 1:12
> df = list(x=x, y=y)
> attributes(df) = list(names = names(df),
row.names=1:max(length(x), length(y)), class='data.frame')
> df
x y
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
6 <NA> 6
7 <NA> 7
[ reached getOption("max.print") -- omitted 5 rows ]]
Warning message:
In format.data.frame(x, digits = digits, na.encode = FALSE) :
corrupt data frame: columns will be truncated or padded with NAs

Sum of columns from different data frames (different lengths) in R

Here is how we could do it:

#data:
df1 <- tibble(x1 = c(1,4,4,6))
df2 <- tibble(x2 = c(1,4,4,6,6,6,8))
df3 <- tibble(x3 = c(1,4,4,6,6))

# 1. construct a list
df_list <- list(df1, df2, df3)

#install.packages("qpcR")
library(qpcR)

# 2. Use `cbind.na` from the qpcR package to pad the shorter columns with `NA`, so all columns have the same length:
df_result <- do.call(qpcR:::cbind.na, df_list)

# 3. Use `rowSums` to sum
df_result$final <- rowSums(df_result, na.rm = TRUE)

# 4. save as dataframe
final_x <- data.frame(final = df_result[,4])

# 5. call result:
final_x
  final
1 3
2 12
3 12
4 18
5 12
6 6
7 8

Bind different length data frames by columns from bottom

First, assess the maximum length of the columns to bind.

lenMax <- max(length(a[,1]), length(b[,1]), length(c[,1]))

Then, use this length to fill columns of a new data.frame with NAs so they fit.

data.frame(a = c(rep(NA, lenMax - length(a[,1])), a[,1]), 
b = c(rep(NA, lenMax - length(b[,1])), b[,1]),
c = c(rep(NA, lenMax - length(c[,1])), c[,1]))

# a b c
# 1 NA 1.8374224 NA
# 2 NA 0.3436815 0.03719874
# 3 NA -1.3600955 -1.92311898
# 4 -0.6290858 0.5358145 0.41087971


Related Topics



Leave a reply



Submit