How to Create Lag Variables

How to create lag variables

In base R the function lag() is useful for time series objects. Here you have a dataframe and the situation is somewhat different.

You could try the following, which I admit is not very elegant:

df2$l1pm10 <- sapply(1:nrow(df2), function(x) df2$pm10[x+1])
df2$l1pm102 <- sapply(1:nrow(df2), function(x) df2$pm10[x-1])
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877

An alternative consists in using the Lag() function (with capital "L") from the Hmiscpackage:

library(Hmisc)
df2$l1pm10 <- Lag(df2$pm10, -1)
df2$l1pm102 <- Lag(df2$pm10, +1)
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877

How can I use a loop to create lag variables?

Good news: you need just one loop over 4 possibilities, as 0 3 12 18 and 0 90 360 540 are paired.

foreach x in 0 3 12 18 { 
gen var`x' = date > (date2 + 30 * `x')

}

foreach requires either in or of following the macro name, so your code fails at that point. There is also no construct foreach ... & foreach ....: perhaps you are using syntax from elsewhere or just guessing there.

How to create a lag variable within each group?

You could do this within data.table

 library(data.table)
data[, lag.value:=c(NA, value[-.N]), by=groups]
data
# time groups value lag.value
#1: 1 a 0.02779005 NA
#2: 2 a 0.88029938 0.02779005
#3: 3 a -1.69514201 0.88029938
#4: 1 b -1.27560288 NA
#5: 2 b -0.65976434 -1.27560288
#6: 3 b -1.37804943 -0.65976434
#7: 4 b 0.12041778 -1.37804943

For multiple columns:

nm1 <- grep("^value", colnames(data), value=TRUE)
nm2 <- paste("lag", nm1, sep=".")
data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
data
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132

Update

From data.table versions >= v1.9.5, we can use shift with type as lag or lead. By default, the type is lag.

data[, (nm2) :=  shift(.SD), by=groups, .SDcols=nm1]
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132

If you need the reverse, use type=lead

nm3 <- paste("lead", nm1, sep=".")

Using the original dataset

  data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
# time groups value value1 value2 lead.value lead.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884
#3: 3 b -0.8356286 -0.3053884 -0.01619026 NA NA
#4: 1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432
#5: 2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999
#7: 4 a 0.4874291 -2.2146999 0.91897737 NA NA
# lead.value2
#1: -0.04493361
#2: -0.01619026
#3: NA
#4: 0.82122120
#5: 0.59390132
#6: 0.91897737
#7: NA

data

 set.seed(1)
data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
value = rnorm(7), value1=rnorm(7), value2=rnorm(7))

Create lagged variables for consecutive time points only using R

You could use ifelse, testing whether diff(time) is equal to 1. If so, write the lag. If not, write an NA.

base %>%
group_by(id) %>%
mutate(lag1_x = ifelse(c(0, diff(time)) == 1, lag(x, n = 1, default = NA), NA)) %>%
as.data.frame()
#> id time x lag1_x
#> 1 1 1 1.852343 NA
#> 2 1 2 2.710538 1.852343
#> 3 1 3 2.700785 2.710538
#> 4 1 4 2.588489 2.700785
#> 5 1 7 3.252223 NA
#> 6 1 8 2.108079 3.252223
#> 7 1 10 3.435683 NA
#> 8 2 3 1.762462 NA
#> 9 2 4 2.775732 1.762462
#> 10 2 6 3.377396 NA
#> 11 2 9 3.133336 NA
#> 12 2 10 3.804190 3.133336
#> 13 2 11 2.942893 3.804190
#> 14 2 14 3.503608 NA

R create lag variable in unbalance time series dataframe

We can use an ifelse statement. So, if the rows are 1 day apart (here I convert to a time format using lubridate), then return lag value and if not return NA.

library(tidyverse)
library(lubridate)

df %>%
arrange(ID, TIME) %>%
group_by(ID) %>%
mutate(LAG = ifelse(ymd(TIME) - lag(ymd(TIME)) == 1, lag(VALUE), NA))

Output

      ID     TIME VALUE   LAG
<int> <int> <dbl> <dbl>
1 1 20110127 3.23 NA
2 1 20110128 4.57 3.23
3 1 20110130 3.22 NA
4 1 20110131 4.33 3.22
5 1 20110201 1.22 4.33
6 1 20110202 5.23 1.22
7 2 20110127 0.86 NA
8 2 20110128 7.55 0.86
9 2 20110129 1.24 7.55
10 2 20110201 3.69 NA

Data

structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), 
TIME = c(20110127L, 20110128L, 20110130L, 20110131L, 20110201L,
20110202L, 20110127L, 20110128L, 20110129L, 20110201L), VALUE = c(3.23,
4.57, 3.22, 4.33, 1.22, 5.23, 0.86, 7.55, 1.24, 3.69)), class = "data.frame", row.names = c(NA,
-10L))


Related Topics



Leave a reply



Submit