Expand Data Frame

expand a data frame in R

Many options are available to get the desired result, but the OP seems keen on using tidyr::expand. One solution is:

library(dplyr)
library(tidyr)

# Cross every existing (Var1, Var2) pair with ID = 1:2, then sort by ID.
# The result is still grouped by Var1 and Var2.
df %>%
  group_by(Var1, Var2) %>%
  expand(ID = 1:2) %>%
  arrange(ID)

# # A tibble: 8 x 3
# # Groups: Var1, Var2 [4]
# Var1 Var2 ID
# <chr> <chr> <int>
# 1 a a 1
# 2 a b 1
# 3 b a 1
# 4 b b 1
# 5 a a 2
# 6 a b 2
# 7 b a 2
# 8 b b 2

Data:

# Example input with two character columns. The data rows carry one more
# field than the header, so read.table() uses the leading 1-4 as row names.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Var1 Var2
1 a a
2 a b
3 b a
4 b b"
)

How to expand data frame based on values?

One "non-tidyverse" way:

# Sample input: each row carries an (x, y) pair plus the inclusive integer
# range from..to that the row is later expanded over.
xdf <- data.frame(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6),
  stringsAsFactors = FALSE
)

# Expand every row of xdf into one row per integer in from[i]:to[i] and stack
# the per-row pieces into a single data frame with columns x, y and z.
# seq_len() is used instead of 1:nrow(xdf) so that a zero-row xdf yields no
# iterations rather than visiting the non-existent rows 1 and 0.
do.call(rbind.data.frame, lapply(seq_len(nrow(xdf)), function(i) {
  data.frame(
    x = xdf$x[i],
    y = xdf$y[i],
    z = xdf$from[i]:xdf$to[i],
    stringsAsFactors = FALSE
  )
}))

One "tidyverse" way:

library(tidyverse)

# Same sample input as a tibble. tibble() replaces the deprecated
# data_frame() constructor.
xdf <- tibble(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6)
)

# Process xdf one row at a time; each row becomes a small tibble with one
# line per integer in from:to. tibble() replaces the deprecated data_frame().
xdf %>%
  rowwise() %>%
  do(tibble(x = .$x, y = .$y, z = .$from:.$to))

Another "tidyverse" way that has not been benchmarked below:

# Row by row, cross-join the current row with its from:to sequence
# (merge() with by = NULL produces the Cartesian product).
rowwise(xdf) %>%
  do(
    merge(
      as_tibble(.),
      tibble(z = .$from:.$to),
      by = NULL
    )
  ) %>%
  select(-from, -to) # Omit this line if you want to keep all original columns.

Since you asked about performance:

library(microbenchmark)

# Benchmark input for the data.table variant.
xdt1 <- data.table::data.table(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6)
)

# Benchmark input for the data-frame-based variants (base, tidyverse, plyr).
xdf1 <- data.frame(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6),
  stringsAsFactors = FALSE
)

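data.table operations often modify in place, so to keep a level playing field, make a copy of each data frame/table before doing the operation (for a data.table a true copy requires data.table::copy(), since plain assignment with <- only creates another reference to the same table).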

That time penalty is ~100 nanoseconds on most modern systems.

# Benchmark four ways of expanding each input row over from:to, 1000 runs
# per expression. Each expression starts from its own working object so the
# shared inputs xdt1 / xdf1 stay the same for every run.
microbenchmark(

  data.table = {
    # `<-` only creates a second reference to a data.table and `:=` updates
    # by reference, so an explicit copy() is needed to leave xdt1 untouched.
    xdt2 <- data.table::copy(xdt1)
    # Number of output rows each input row expands to.
    xdt2[, diff := (to - from) + 1]
    # Repeat every row `diff` times, then number the repeats from..to.
    xdt2 <- xdt2[rep(seq_len(.N), diff)]
    xdt2[, z := seq(from, to), by = .(x, y, from, to)]
    xdt2[, c("x", "y", "z")]
  },

  base = {
    xdf2 <- xdf1
    # seq_len() avoids iterating over c(1, 0) if the input has no rows.
    do.call(rbind.data.frame, lapply(seq_len(nrow(xdf2)), function(i) {
      data.frame(
        x = xdf2$x[i],
        y = xdf2$y[i],
        z = xdf2$from[i]:xdf2$to[i],
        stringsAsFactors = FALSE
      )
    }))
  },

  tidyverse = {
    xdf2 <- xdf1
    # tibble() replaces the deprecated data_frame() constructor.
    dplyr::rowwise(xdf2) %>%
      dplyr::do(dplyr::tibble(x = .$x, y = .$y, z = .$from:.$to))
  },

  plyr = {
    xdf2 <- xdf1
    # mdply() calls the function once per row, passing columns as arguments.
    plyr::mdply(
      xdf2,
      function(x, y, from, to) data.frame(x, y, z = seq(from, to))
    )[c("x", "y", "z")]
  },

  times = 1000

)
## Unit: microseconds
## expr min lq mean median uq max neval
## data.table 920.361 1072.9265 1257.2321 1178.832 1280.2660 10628.552 1000
## base 677.069 761.3145 884.4136 825.472 915.8985 5366.515 1000
## tidyverse 15926.127 17231.5015 19201.4798 17994.919 20014.4140 166901.570 1000
## plyr 1938.838 2196.4205 2448.5314 2322.949 2501.5075 5735.255 1000

Expand data.frame by adding new column

One way with lapply:

# For every element of df$x, bind that value (as column z) to all of df2,
# then stack the resulting pieces row-wise into one data frame.
do.call(
  rbind,
  lapply(df$x, function(val) cbind(z = val, df2))
)
# z y
#1 1871 1
#2 1871 2
#3 1871 3
#4 1872 1
#5 1872 2
#6 1872 3

lapply iterates over df$x and cbinds the whole df2 to each element of df$x. do.call combines everything together in one data.frame.

Expand a data frame by group

Use tidyr::pivot_wider() with the names_glue argument as follows.

  • Store the names of all variables to be pivoted (even 500 of them) in a vector, say cols
  • Use values_from = all_of(cols) as argument in pivot_wider
# Value columns to spread; each gets one output column per distinct X,
# named "<X>-<column>" by names_glue.
cols <- c("X1", "X2", "X5")
df %>%
  pivot_wider(
    id_cols = grp,
    names_from = X,
    values_from = all_of(cols),
    names_glue = "{X}-{.value}"
  )

# A tibble: 2 x 10
grp `1-X1` `2-X1` `5-X1` `1-X2` `2-X2` `5-X2` `1-X5` `2-X5` `5-X5`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2020_01_19 23 13 23 47 45 41 3 54 21
2 2020_01_20 65 39 43 32 52 76 19 12 90

If you want to use all columns except first two, use this

# Same reshape, but take the values from every column other than grp and X.
df %>%
  pivot_wider(
    id_cols = grp,
    names_from = X,
    values_from = !c(grp, X),
    names_glue = "{X}-{.value}"
  )

# A tibble: 2 x 10
grp `1-X1` `2-X1` `5-X1` `1-X2` `2-X2` `5-X2` `1-X5` `2-X5` `5-X5`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2020_01_19 23 13 23 47 45 41 3 54 21
2 2020_01_20 65 39 43 32 52 76 19 12 90

However, if you want to rearrange columns as shown in expected outcome, you may use names_vary = 'slowest' in pivot_wider function of tidyr 1.2.0.

Expand data frame and add a new variable

With tidyr you can use expand - this will expand your data frame to all combinations of values with your sequence of 1 to 3:

library(tidyverse)

# Within each Location/year/group1/Value combination, create one row for
# every group2 value in 1:3.
df %>%
  group_by(Location, year, group1, Value) %>%
  expand(group2 = 1:3)

Output

   Location  year group1 Value group2
<fct> <dbl> <int> <fct> <int>
1 a 2020 1 x 1
2 a 2020 1 x 2
3 a 2020 1 x 3
4 a 2020 2 y 1
5 a 2020 2 y 2
6 a 2020 2 y 3
...

Your approach looks close, and I suppose you could just add on group2 like this:

# Repeat every row of df three times in a row, then add group2, whose values
# 1:3 are recycled down the tripled rows.
cbind(
  df[rep(seq_len(nrow(df)), each = 3), ],
  group2 = 1:3
)

Expand the Dataframe - Adding Rows not Columns

You can use tidyr::expand_grid()

library(tidyr)

# Pair every element of fs$KRS with each of 1:6.
expand_grid(
  KRS = fs$KRS,
  V2 = 1:6
)

tidyr::expand():

# Same idea using tidyr::expand() on the data frame itself.
tidyr::expand(fs, KRS, V2 = 1:6)

Or even expand.grid() from base R

# Base R equivalent; the unnamed inputs become columns Var1 and Var2.
expand.grid(
  fs$KRS,
  1:6
)

Expand and then fill a dataframe

Check out the fill() function from tidyr (part of the tidyverse).

Using your example, but inducing the NA's you mention, df5 should be what you're looking for here.

library( tidyverse )
# Three per-id price series with irregular years.
df1 <- data.frame(
  id = 1,
  year = c(2014, 2019, 2021),
  price = c(100, 110, 120)
)
df2 <- data.frame(
  id = 2,
  year = c(2016, 2019, 2021),
  price = c(200, 210, 220)
)
df3 <- data.frame(
  id = 3,
  year = c(2014, 2015, 2019, 2020),
  price = c(300, 310, 320, 330)
)

list1 <- list(df1, df2, df3)

# Full id x year grid (2014-2021 for each of the three ids); NA marks the
# prices that still have to be filled in.
id <- rep(c(1, 2, 3), each = 8)
year <- rep(seq(2014, 2021), times = 3)
price <- c(
  100, NA, NA, NA, NA, 110, NA, 120,
  NA, NA, 200, NA, NA, 210, 210, 220,
  300, 310, 310, 310, 310, 320, 330, 330
)
df4 <- data.frame(id, year, price)
# Fill missing prices within each id: carry the last known value down first,
# then fill any remaining leading gaps upwards.
df5 <- df4 %>%
  group_by(id) %>%
  fill(price, .direction = "downup")


Related Topics



Leave a reply



Submit