Adding Multiple Lag Variables Using Dplyr and for Loops

R/dplyr: Using a loop to create lags and calculate cumulative sums based on column names

If I understand you correctly, the following should work:

Reproducible sample data (with 3 variables for summing):

set.seed(123)
df = data.frame(
id = c("a", "a", "a", "b", "b"),
date = seq(as.Date("2015-12-01"), as.Date("2015-12-05"), by="days"),
v1 = sample(seq(1, 20), 5),
v2 = sample(seq(1, 20), 5),
v3 = sample(seq(1, 20), 5)
)

> df
id date v1 v2 v3
1 a 2015-12-01 6 1 20
2 a 2015-12-02 15 11 9
3 a 2015-12-03 8 17 13
4 b 2015-12-04 16 10 10
5 b 2015-12-05 17 8 2

Group by id, sort by date (in case they aren't in sequence), & mutate for all named variables between the two named ones (v1:v3 in this case):

df %>%
group_by(id) %>%
arrange(date) %>%
mutate_at(vars(v1:v3), funs(Cum = cumsum(lag(., default = 0)))) %>%
ungroup()

# A tibble: 5 x 8
# Groups: id [2]
id date v1 v2 v3 v1_Cum v2_Cum v3_Cum
<fctr> <date> <int> <int> <int> <int> <int> <int>
1 a 2015-12-01 6 1 20 0 0 0
2 a 2015-12-02 15 11 9 6 1 20
3 a 2015-12-03 8 17 13 21 12 29
4 b 2015-12-04 16 10 10 0 0 0
5 b 2015-12-05 17 8 2 16 10 10

Looping all Variables in Data.table to create n-lags

There is a much simpler way to create the additional lag columns. The n parameter to data-table's shift() function is defined as

Non-negative integer vector denoting the offset to lead or lag the
input by. To create multiple lead/lag vectors, provide multiple values
to n

So,

DT[, shift(baz, 0:3)]

returns

    V1 V2 V3 V4
1: 3 NA NA NA
2: 6 3 NA NA
3: 9 6 3 NA
4: 12 9 6 3
5: 15 12 9 6
6: 18 15 12 9
7: 21 18 15 12
8: 24 21 18 15
9: 27 24 21 18
10: 30 27 24 21

Now, the OP has requested to shift each variable and to name the new columns according to the amount of shift. This can be accomplished by

DT[, unlist(lapply(.SD, shift, n = 0:3), recursive = FALSE)]
    foo1 foo2 foo3 foo4 bar1 bar2 bar3 bar4 baz1 baz2 baz3 baz4
1: 1 NA NA NA 2 NA NA NA 3 NA NA NA
2: 2 1 NA NA 4 2 NA NA 6 3 NA NA
3: 3 2 1 NA 6 4 2 NA 9 6 3 NA
4: 4 3 2 1 8 6 4 2 12 9 6 3
5: 5 4 3 2 10 8 6 4 15 12 9 6
6: 6 5 4 3 12 10 8 6 18 15 12 9
7: 7 6 5 4 14 12 10 8 21 18 15 12
8: 8 7 6 5 16 14 12 10 24 21 18 15
9: 9 8 7 6 18 16 14 12 27 24 21 18
10: 10 9 8 7 20 18 16 14 30 27 24 21

Data

For comparison, the sample data of Matt's answer is used

library(data.table)
DT <- data.table(foo = seq_len(10),
bar = seq_len(10)*2L,
baz = seq_len(10)*3L)

How to loop lapply to create LAG terms over multiple variables in R

data.table and Map to handle the looping:

vars <- c("b","c")
rpv <- rep(1:2, each=length(vars))
df[, paste(vars, "lag", rpv, sep="_") := Map(shift, .SD, rpv), by=a, .SDcols=vars]

# a b c b_lag_1 c_lag_1 b_lag_2 c_lag_2
#1: x 10.863180 393.9568 NA NA NA NA
#2: x 6.139258 537.9199 10.863180 393.9568 NA NA
#3: x 11.896448 483.8036 6.139258 537.9199 10.86318 393.9568
#4: y 18.079188 509.6136 NA NA NA NA
#5: y 5.463224 233.6991 18.079188 509.6136 NA NA
#6: y 6.363724 869.8406 5.463224 233.6991 18.07919 509.6136

Leads and Lags in for loop

The as.name needs eval to return the value of the column (assuming 'crisisdata' is data.table)

library(data.table)
for (var in crisis_variables){

# add lags
crisisdata[, (paste0("l",1:4, "_", var)):=
shift(eval(as.name(var)),1:4), by = country]

# add leads
crisisdata[, (paste0("f",0:4, "_", var)):=
shift(eval(as.name(var)),0:-4), by = country]
}

How to use a loop with mutate dplyr

I think this is the perfect case to use purrr::accumulate2().

purrr::accumulate() is often used to calculate conditional cumulative sums. It takes a function as the second argument. This function should have 2 arguments: the cumulative output co, and the currently evaluated value x.

purrr::accumulate2() allows us to use a second variable to iterate on, and here we use lag(check) as lx. The tricky part is that this second variable should be one item shorter, as it does not matter for the initial value.

Here is the code, matching your expected output.

library(tidyverse)

df = structure(list(id = c(8, 8, 8, 8, 8, 8, 8, 8, 8),
check = c(0, 1, 1, 0, 0, 1, 0, 0, 0),
count_x = c(0, 1, 2, 2, 2, 3, 3, 3, 3)),
row.names = c(NA, -9L), class = "data.frame")

df %>%
mutate(
count_y = accumulate2(check, lag(check)[-1], function(co, x, lx){
case_when(
x==0 ~ co,
x==1 & lx==0 ~ 1,
x==1 & lx==1 ~ co+1,
TRUE ~ 999 #error value in case of unexpected input
)
})
)
#> id check count_x count_y
#> 1 8 0 0 0
#> 2 8 1 1 1
#> 3 8 1 2 2
#> 4 8 0 2 2
#> 5 8 0 2 2
#> 6 8 1 3 1
#> 7 8 0 3 1
#> 8 8 0 3 1
#> 9 8 0 3 1

Created on 2021-05-05 by the reprex package (v2.0.0)

How to create a lag variable within each group?

You could do this within data.table

 library(data.table)
data[, lag.value:=c(NA, value[-.N]), by=groups]
data
# time groups value lag.value
#1: 1 a 0.02779005 NA
#2: 2 a 0.88029938 0.02779005
#3: 3 a -1.69514201 0.88029938
#4: 1 b -1.27560288 NA
#5: 2 b -0.65976434 -1.27560288
#6: 3 b -1.37804943 -0.65976434
#7: 4 b 0.12041778 -1.37804943

For multiple columns:

nm1 <- grep("^value", colnames(data), value=TRUE)
nm2 <- paste("lag", nm1, sep=".")
data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
data
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132

Update

From data.table versions >= v1.9.5, we can use shift with type as lag or lead. By default, the type is lag.

data[, (nm2) :=  shift(.SD), by=groups, .SDcols=nm1]
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132

If you need the reverse, use type=lead

nm3 <- paste("lead", nm1, sep=".")

Using the original dataset

  data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
# time groups value value1 value2 lead.value lead.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884
#3: 3 b -0.8356286 -0.3053884 -0.01619026 NA NA
#4: 1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432
#5: 2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999
#7: 4 a 0.4874291 -2.2146999 0.91897737 NA NA
# lead.value2
#1: -0.04493361
#2: -0.01619026
#3: NA
#4: 0.82122120
#5: 0.59390132
#6: 0.91897737
#7: NA

data

 set.seed(1)
data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
value = rnorm(7), value1=rnorm(7), value2=rnorm(7))

R: Create variable using iteratively updated values of previous row (=lag) similar to cumsum (depreciation)

You could use Reduce with accumulate = T:

Reduce(function(prev,value)  delta * prev + ifelse(is.na(value),0,value), x=df$value[-1], init = ifelse(is.na(df$value[1]),0,df$value[1]), accumulate = T)

[1] 1.000000 2.940000 2.763600 5.597784 9.261917

With data.table:

library(data.table)
setDT(df)

df[,output:=Reduce(function(prev,value) delta * prev + ifelse(is.na(value),0,value), x=value[-1], init = ifelse(is.na(value[1]),0,value[1]), accumulate = T)]



Related Topics



Leave a reply



Submit