How to Create Lag Variables

How to create lag variables

In base R the function lag() is useful for time series objects. Here you have a dataframe and the situation is somewhat different.

You could try the following, which I admit is not very elegant:

# Shift the whole column at once instead of indexing row by row.
# Indexing with df2$pm10[x - 1] at x = 1 gives numeric(0), which makes
# sapply() return a list rather than a numeric vector; padding with NA
# keeps both new columns plain numeric vectors of length nrow(df2).
df2$l1pm10 <- c(tail(df2$pm10, -1), NA)   # value from the next row
df2$l1pm102 <- c(NA, head(df2$pm10, -1))  # value from the previous row
#> df2
#   var1     pm10   l1pm10  l1pm102
#1     1 26.95607       NA       NA
#2     2       NA 32.83869 26.95607
#3     3 32.83869 39.95607       NA
#4     4 39.95607       NA 32.83869
#5     5       NA 40.95607 39.95607
#6     6 40.95607 33.95607       NA
#7     7 33.95607 28.95607 40.95607
#8     8 28.95607 32.34877 33.95607
#9     9 32.34877       NA 28.95607
#10   10       NA       NA 32.34877

An alternative is to use the Lag() function (with a capital "L") from the Hmisc package:

library(Hmisc)
# A negative shift takes the value from the following row,
# a positive shift takes the value from the preceding row.
df2[["l1pm10"]] <- Lag(df2[["pm10"]], shift = -1)
df2[["l1pm102"]] <- Lag(df2[["pm10"]], shift = 1)
#> df2
#   var1     pm10   l1pm10  l1pm102
#1     1 26.95607       NA       NA
#2     2       NA 32.83869 26.95607
#3     3 32.83869 39.95607       NA
#4     4 39.95607       NA 32.83869
#5     5       NA 40.95607 39.95607
#6     6 40.95607 33.95607       NA
#7     7 33.95607 28.95607 40.95607
#8     8 28.95607 32.34877 33.95607
#9     9 32.34877       NA 28.95607
#10   10       NA       NA 32.34877

How can I use a loop to create lag variables?

Good news: you need just one loop over 4 possibilities, as 0 3 12 18 and 0 90 360 540 are paired.

* One pass per offset: creates var0, var3, var12 and var18, each an
* indicator of whether date is later than date2 plus 30 * x.
foreach x of numlist 0 3 12 18 {
    generate var`x' = date > (date2 + 30 * `x')
}

foreach requires either in or of following the macro name, so your code fails at that point. There is also no construct of the form foreach ... & foreach ...; perhaps you are borrowing syntax from another language or just guessing.

How to create a lag variable within each group?

You could do this within data.table

 library(data.table)
 # Lag `value` by one row within each group. c(NA, value) takes the type of
 # `value`, so dropping its last element leaves a correctly typed NA in front
 # even when a group has a single row (c(NA, value[-.N]) would then be a bare
 # logical NA, which does not match a numeric column).
 data[, lag.value := head(c(NA, value), -1L), by = groups]
 data
 #   time groups       value   lag.value
 #1:    1      a  0.02779005          NA
 #2:    2      a  0.88029938  0.02779005
 #3:    3      a -1.69514201  0.88029938
 #4:    1      b -1.27560288          NA
 #5:    2      b -0.65976434 -1.27560288
 #6:    3      b -1.37804943 -0.65976434
 #7:    4      b  0.12041778 -1.37804943

For multiple columns:

# Columns to lag (every column whose name starts with "value") and the
# names of the new lagged columns ("lag.<name>").
nm1 <- grep("^value", colnames(data), value = TRUE)
nm2 <- paste0("lag.", nm1)
# Lag each selected column by one row within each group. Prepending NA to the
# full column and dropping the last element keeps the column's own type, so a
# one-row group yields a typed NA instead of a bare logical NA.
data[, (nm2) := lapply(.SD, function(x) head(c(NA, x), -1L)),
     by = groups, .SDcols = nm1]
 data
#    time groups      value     value1      value2  lag.value lag.value1
#1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
#2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
#3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
#4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
#5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
#6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
#7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
#    lag.value2
#1:          NA
#2:  1.12493092
#3: -0.04493361
#4:          NA
#5:  0.94383621
#6:  0.82122120
#7:  0.59390132

Update

From data.table versions >= v1.9.5, we can use shift with type as lag or lead. By default, the type is lag.

# shift() with its defaults spelled out: one-row lag of every .SDcols column, per group.
data[, (nm2) := shift(.SD, n = 1L, type = "lag"), by = groups, .SDcols = nm1]
#   time groups      value     value1      value2  lag.value lag.value1
#1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
#2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
#3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
#4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
#5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
#6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
#7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
#    lag.value2
#1:          NA
#2:  1.12493092
#3: -0.04493361
#4:          NA
#5:  0.94383621
#6:  0.82122120
#7:  0.59390132

If you need the reverse (the next value instead of the previous one), use type = "lead".

# Names for the lead columns: "lead.<name>" for every column in nm1.
nm3 <- paste0("lead.", nm1)

Using the original dataset

  # One-row lead of every .SDcols column, computed separately for each group.
  data[, (nm3) := shift(.SD, n = 1L, type = "lead"), by = groups, .SDcols = nm1]
  #  time groups      value     value1      value2 lead.value lead.value1
  #1:    1      b -0.6264538  0.7383247  1.12493092  0.1836433   0.5757814
  #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.8356286  -0.3053884
  #3:    3      b -0.8356286 -0.3053884 -0.01619026         NA          NA
  #4:    1      a  1.5952808  1.5117812  0.94383621  0.3295078   0.3898432
  #5:    2      a  0.3295078  0.3898432  0.82122120 -0.8204684  -0.6212406
  #6:    3      a -0.8204684 -0.6212406  0.59390132  0.4874291  -2.2146999
  #7:    4      a  0.4874291 -2.2146999  0.91897737         NA          NA
 #   lead.value2
 #1: -0.04493361
 #2: -0.01619026
 #3:          NA
 #4:  0.82122120
 #5:  0.59390132
 #6:  0.91897737
 #7:          NA

data

 set.seed(1)
 # Example table: group "b" has 3 time points, group "a" has 4;
 # the three value columns are drawn in order from rnorm().
 data <- data.table(
   time   = c(1:3, 1:4),
   groups = rep(c("b", "a"), times = c(3, 4)),
   value  = rnorm(7),
   value1 = rnorm(7),
   value2 = rnorm(7)
 )

Create lagged variables for consecutive time points only using R

You could use ifelse, testing whether diff(time) is equal to 1. If so, write the lag. If not, write an NA.

base %>%
  group_by(id) %>%
  mutate(
    # Keep the previous x only where time advanced by exactly 1 within the id;
    # the first row of each id gets a step of 0 and therefore NA.
    lag1_x = ifelse(
      c(0, diff(time)) == 1,
      lag(x),
      NA
    )
  ) %>%
  as.data.frame()
#>    id time        x   lag1_x
#> 1   1    1 1.852343       NA
#> 2   1    2 2.710538 1.852343
#> 3   1    3 2.700785 2.710538
#> 4   1    4 2.588489 2.700785
#> 5   1    7 3.252223       NA
#> 6   1    8 2.108079 3.252223
#> 7   1   10 3.435683       NA
#> 8   2    3 1.762462       NA
#> 9   2    4 2.775732 1.762462
#> 10  2    6 3.377396       NA
#> 11  2    9 3.133336       NA
#> 12  2   10 3.804190 3.133336
#> 13  2   11 2.942893 3.804190
#> 14  2   14 3.503608       NA

R create lag variable in unbalance time series dataframe

We can use an ifelse statement. So, if the rows are 1 day apart (here I convert to a time format using lubridate), then return lag value and if not return NA.

library(tidyverse)
library(lubridate)

df %>%
  arrange(ID, TIME) %>%
  group_by(ID) %>%
  # Keep the previous VALUE only when the previous row of the same ID is
  # exactly one day earlier; otherwise (including the first row) use NA.
  mutate(LAG = ifelse(ymd(TIME) - lag(ymd(TIME)) == 1, lag(VALUE), NA)) %>%
  # Drop the grouping so later steps are not silently computed per ID.
  ungroup()

Output

      ID     TIME VALUE   LAG
   <int>    <int> <dbl> <dbl>
 1     1 20110127  3.23 NA   
 2     1 20110128  4.57  3.23
 3     1 20110130  3.22 NA   
 4     1 20110131  4.33  3.22
 5     1 20110201  1.22  4.33
 6     1 20110202  5.23  1.22
 7     2 20110127  0.86 NA   
 8     2 20110128  7.55  0.86
 9     2 20110129  1.24  7.55
10     2 20110201  3.69 NA

Data

# Example input: 10 rows, two IDs, TIME stored as integer yyyymmdd.
data.frame(
  ID = rep(1:2, times = c(6L, 4L)),
  TIME = c(
    20110127L, 20110128L, 20110130L, 20110131L, 20110201L, 20110202L,
    20110127L, 20110128L, 20110129L, 20110201L
  ),
  VALUE = c(3.23, 4.57, 3.22, 4.33, 1.22, 5.23, 0.86, 7.55, 1.24, 3.69)
)

How to Create Lag Variables