expand a data frame in R
Many options are available to get the desired result, but the OP seems keen on using `tidyr::expand`. One solution is:
library(dplyr)
library(tidyr)
# Within each (Var1, Var2) group, create one row per ID value (1 and 2),
# then order the rows by ID. The result stays grouped by Var1 and Var2.
df %>%
  group_by(Var1, Var2) %>%
  expand(ID = seq_len(2L)) %>%
  arrange(ID)
# # A tibble: 8 x 3
# # Groups: Var1, Var2 [4]
# Var1 Var2 ID
# <chr> <chr> <int>
# 1 a a 1
# 2 a b 1
# 3 b a 1
# 4 b b 1
# 5 a a 2
# 6 a b 2
# 7 b a 2
# 8 b b 2
Data:
# Example input: four rows covering every combination of "a" and "b" in
# Var1 and Var2. The header has one fewer field than the data lines, so the
# leading field of each data line is used as the row name.
df <- utils::read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Var1 Var2
1 a a
2 a b
3 b a
4 b b"
)
How to expand data frame based on values?
One "non-tidyverse" way:
# Example input: each row carries a label `x`, a value `y`, and an inclusive
# range `from`..`to` that is expanded into one output row per integer.
xdf <- data.frame(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6),
  stringsAsFactors = FALSE
)

# Build one small data frame per input row (x and y are recycled across the
# from:to sequence), then stack them. seq_len() is used instead of 1:nrow()
# so that a zero-row `xdf` yields no iterations rather than indices 1 and 0.
do.call(rbind.data.frame, lapply(seq_len(nrow(xdf)), function(i) {
  data.frame(
    x = xdf$x[i],
    y = xdf$y[i],
    z = xdf$from[i]:xdf$to[i],
    stringsAsFactors = FALSE
  )
}))
One "tidyverse" way:
library(tidyverse)
# Same example input as a tibble. tibble() is used because data_frame() is
# deprecated.
xdf <- tibble(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6)
)

# Process one row at a time; for each row emit a tibble in which x and y are
# repeated for every integer in from:to.
rowwise(xdf) %>%
  do(tibble(x = .$x, y = .$y, z = .$from:.$to))
Another "tidyverse" way that has not been benchmarked below:
# Row by row, cross-join the row itself with its from:to sequence
# (merge() with by = NULL pairs every row of one input with every row of the
# other).
xdf %>%
  rowwise() %>%
  do(merge(as_tibble(.), tibble(z = .$from:.$to), by = NULL)) %>%
  select(-c(from, to)) # Omit this line if you want to keep all original columns.
Since you asked about performance:
library(microbenchmark)
# data.table version of the benchmark input.
xdt1 <- data.table::data.table(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6)
)
# data.frame version of the benchmark input (strings kept as character).
xdf1 <- data.frame(
  x = c("a", "b", "c"),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6),
  stringsAsFactors = FALSE
)
`data.table` operations often modify in place, so to keep a level playing field, make a copy of each data frame/table before doing the operation.
That time penalty is ~100 nanoseconds on most modern systems.
# Compare four ways of expanding each input row into one row per integer in
# from:to. Every expression starts from its own copy of the input so that no
# approach sees data already modified by an earlier iteration.
microbenchmark(
  data.table = {
    # Plain assignment would only create a second reference to xdt1, and `:=`
    # would then add the `diff` column to xdt1 itself; copy() makes a real copy.
    xdt2 <- data.table::copy(xdt1)
    # Number of output rows each input row expands to.
    xdt2[, diff := (to - from) + 1]
    # Repeat each row `diff` times, then number the repeats from..to per group.
    xdt2 <- xdt2[rep(seq_len(.N), diff)]
    xdt2[, z := seq(from, to), by = .(x, y, from, to)]
    xdt2[, c("x", "y", "z")]
  },
  base = {
    xdf2 <- xdf1
    # seq_len() avoids the 1:0 trap when the input has no rows.
    do.call(rbind.data.frame, lapply(seq_len(nrow(xdf2)), function(i) {
      data.frame(
        x = xdf2$x[i],
        y = xdf2$y[i],
        z = xdf2$from[i]:xdf2$to[i],
        stringsAsFactors = FALSE
      )
    }))
  },
  tidyverse = {
    xdf2 <- xdf1
    # tibble() replaces the deprecated data_frame() constructor.
    dplyr::rowwise(xdf2) %>%
      dplyr::do(dplyr::tibble(x = .$x, y = .$y, z = .$from:.$to))
  },
  plyr = {
    xdf2 <- xdf1
    plyr::mdply(
      xdf2,
      function(x, y, from, to) data.frame(x, y, z = seq(from, to))
    )[c("x", "y", "z")]
  },
  times = 1000
)
## Unit: microseconds
## expr min lq mean median uq max neval
## data.table 920.361 1072.9265 1257.2321 1178.832 1280.2660 10628.552 1000
## base 677.069 761.3145 884.4136 825.472 915.8985 5366.515 1000
## tidyverse 15926.127 17231.5015 19201.4798 17994.919 20014.4140 166901.570 1000
## plyr 1938.838 2196.4205 2448.5314 2322.949 2501.5075 5735.255 1000
Expand data.frame by adding new column
One way with lapply
:
# Pair every value of df$x with all rows of df2 (the value goes into a column
# named z), then stack the pieces into a single data frame.
do.call(
  rbind,
  lapply(df$x, function(val) cbind(z = val, df2))
)
# z y
#1 1871 1
#2 1871 2
#3 1871 3
#4 1872 1
#5 1872 2
#6 1872 3
`lapply` iterates over `df$x` and `cbind`s the whole `df2` to each element of `df$x`. `do.call` combines everything together in one data.frame.
Expand a data frame by group
Use `tidyr::pivot_wider` with the `names_glue` argument as follows.
- Store the names of all variables (even 500) to be pivoted in a vector, say `cols`
- Use `values_from = all_of(cols)` as an argument in `pivot_wider`
# Columns whose values are spread into the wide layout.
cols <- c("X1", "X2", "X5")

# One row per grp; each new column is named "<X value>-<source column>".
df %>%
  pivot_wider(
    id_cols = grp,
    names_from = X,
    values_from = all_of(cols),
    names_glue = "{X}-{.value}"
  )
# A tibble: 2 x 10
grp `1-X1` `2-X1` `5-X1` `1-X2` `2-X2` `5-X2` `1-X5` `2-X5` `5-X5`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2020_01_19 23 13 23 47 45 41 3 54 21
2 2020_01_20 65 39 43 32 52 76 19 12 90
If you want to use all columns except first two, use this
# Same reshape, but take values from every column except grp and X.
df %>%
  pivot_wider(
    id_cols = grp,
    names_from = X,
    values_from = !c(grp, X),
    names_glue = "{X}-{.value}"
  )
# A tibble: 2 x 10
grp `1-X1` `2-X1` `5-X1` `1-X2` `2-X2` `5-X2` `1-X5` `2-X5` `5-X5`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2020_01_19 23 13 23 47 45 41 3 54 21
2 2020_01_20 65 39 43 32 52 76 19 12 90
However, if you want to rearrange columns as shown in expected outcome, you may use names_vary = 'slowest'
in pivot_wider
function of tidyr 1.2.0.
Expand data frame and add a new variable
With tidyr
you can use expand
- this will expand your data frame to all combinations of values with your sequence of 1 to 3:
library(tidyverse)
# For every existing (Location, year, group1, Value) combination, create one
# row per group2 value 1, 2 and 3.
df %>%
  group_by(Location, year, group1, Value) %>%
  expand(group2 = seq_len(3L))
Output
Location year group1 Value group2
<fct> <dbl> <int> <fct> <int>
1 a 2020 1 x 1
2 a 2020 1 x 2
3 a 2020 1 x 3
4 a 2020 2 y 1
5 a 2020 2 y 2
6 a 2020 2 y 3
...
Your approach looks close, and I suppose you could just add on group2
like this:
# Repeat each row of df three times in place, then attach group2 = 1, 2, 3,
# which is recycled down the repeated rows.
cbind(
  df[rep(seq_len(nrow(df)), each = 3), ],
  group2 = seq_len(3L)
)
Expand the Dataframe - Adding Rows not Columns
You can use tidyr::expand_grid()
library(tidyr)
# Pair each KRS value in fs with the integers 1 to 6.
expand_grid(KRS = fs$KRS, V2 = seq_len(6L))
Or `tidyr::expand()`:
# Combine the KRS column of fs with a new column V2 running from 1 to 6.
fs %>%
  tidyr::expand(KRS, V2 = seq_len(6L))
Or even expand.grid()
from base R
# Name the arguments so the columns come out as KRS and V2 (as in the tidyr
# variants) rather than the default Var1 and Var2. Note that expand.grid()
# varies the first column fastest.
expand.grid(KRS = fs$KRS, V2 = 1:6)
Expand and then fill a dataframe
Check out the `fill()` function from the tidyverse.
Using your example, but inducing the NAs you mention, `df5` should be what you're looking for here.
library( tidyverse )
# Three small per-id price tables with gaps in their year coverage.
# The data frames are built directly instead of going through a cbind() matrix.
year <- c(2014, 2019, 2021)
price <- c(100, 110, 120)
df1 <- data.frame(id = 1, year, price)

year <- c(2016, 2019, 2021)
price <- c(200, 210, 220)
df2 <- data.frame(id = 2, year, price)

year <- c(2014, 2015, 2019, 2020)
price <- c(300, 310, 320, 330)
df3 <- data.frame(id = 3, year, price)

list1 <- list(df1, df2, df3)

# Expanded layout: every id paired with every year from 2014 to 2021, with NA
# in the price slots that are to be filled.
id <- rep(c(1, 2, 3), each = 8)
year <- rep(seq(2014, 2021), 3)
price <- c(
  100, NA, NA, NA, NA, 110, NA, 120,
  NA, NA, 200, NA, NA, 210, 210, 220,
  300, 310, 310, 310, 310, 320, 330, 330
)
df4 <- data.frame(id, year, price)

# Fill the gaps within each id: carry the last known price downward first,
# then fill any leading NAs upward ("downup"). The grouping is dropped
# afterwards so later operations on df5 are not silently applied per id.
df5 <- df4 %>%
  group_by(id) %>%
  fill(price, .direction = "downup") %>%
  ungroup()
Related Topics
How to Sum Data.Frame Column Values
Arranging Rows in Custom Order Using Dplyr
Convert String of Anyformat into Dd-Mm-Yy Hh:Mm:Ss in R
Convert Factor to Date Class for Multiple Columns
Significance Level Added to Matrix Correlation Heatmap Using Ggplot2
R Define Dimensions of Empty Data Frame
Write.Table Writes Unwanted Leading Empty Column to Header When Has Rownames
How to Use Random Forests in R with Missing Values
Split or Separate Uneven/Unequal Strings with No Delimiter
R How to Remove Rows in a Data Frame Based on the First Character of a Column
R Subtract Value for the Same Id (From the First Id That Shows)
Removing One Table from Another in R
Factor with Comma and Percentage to Numeric
Embedding Googlevis Charts into a Web Site
Given Start Date and End Date, Reshape/Expand Data for Each Day Between (Each Day on a Row)