How to create lag variables
In base R the function lag()
is useful for time series objects. Here you have a dataframe and the situation is somewhat different.
You could try the following, which I admit is not very elegant:
df2$l1pm10 <- sapply(1:nrow(df2), function(x) df2$pm10[x+1])
df2$l1pm102 <- sapply(1:nrow(df2), function(x) df2$pm10[x-1])
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877
An alternative consists in using the Lag()
function (with capital "L") from the Hmisc
package:
library(Hmisc)
df2$l1pm10 <- Lag(df2$pm10, -1)
df2$l1pm102 <- Lag(df2$pm10, +1)
#> df2
# var1 pm10 l1pm10 l1pm102
#1 1 26.95607 NA NA
#2 2 NA 32.83869 26.95607
#3 3 32.83869 39.95607 NA
#4 4 39.95607 NA 32.83869
#5 5 NA 40.95607 39.95607
#6 6 40.95607 33.95607 NA
#7 7 33.95607 28.95607 40.95607
#8 8 28.95607 32.34877 33.95607
#9 9 32.34877 NA 28.95607
#10 10 NA NA 32.34877
How can I use a loop to create lag variables?
Good news: you need just one loop over 4 possibilities, as 0 3 12 18 and 0 90 360 540 are paired.
foreach x in 0 3 12 18 {
gen var`x' = date > (date2 + 30 * `x')
}
foreach
requires either in
or of
following the macro name, so your code fails at that point. There is also no construct foreach
... & foreach ....
: perhaps you are using syntax from elsewhere or just guessing there.
How to create a lag variable within each group?
You could do this within data.table
library(data.table)
data[, lag.value:=c(NA, value[-.N]), by=groups]
data
# time groups value lag.value
#1: 1 a 0.02779005 NA
#2: 2 a 0.88029938 0.02779005
#3: 3 a -1.69514201 0.88029938
#4: 1 b -1.27560288 NA
#5: 2 b -0.65976434 -1.27560288
#6: 3 b -1.37804943 -0.65976434
#7: 4 b 0.12041778 -1.37804943
For multiple columns:
nm1 <- grep("^value", colnames(data), value=TRUE)
nm2 <- paste("lag", nm1, sep=".")
data[, (nm2):=lapply(.SD, function(x) c(NA, x[-.N])), by=groups, .SDcols=nm1]
data
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132
Update
From data.table
versions >= v1.9.5
, we can use shift
with type
as lag
or lead
. By default, the type is lag
.
data[, (nm2) := shift(.SD), by=groups, .SDcols=nm1]
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132
If you need the reverse, use type=lead
nm3 <- paste("lead", nm1, sep=".")
Using the original dataset
data[, (nm3) := shift(.SD, type='lead'), by = groups, .SDcols=nm1]
# time groups value value1 value2 lead.value lead.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884
#3: 3 b -0.8356286 -0.3053884 -0.01619026 NA NA
#4: 1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432
#5: 2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999
#7: 4 a 0.4874291 -2.2146999 0.91897737 NA NA
# lead.value2
#1: -0.04493361
#2: -0.01619026
#3: NA
#4: 0.82122120
#5: 0.59390132
#6: 0.91897737
#7: NA
data
set.seed(1)
data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))),
value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
Create lagged variables for consecutive time points only using R
You could use ifelse
, testing whether diff(time)
is equal to 1. If so, write the lag. If not, write an NA
.
base %>%
group_by(id) %>%
mutate(lag1_x = ifelse(c(0, diff(time)) == 1, lag(x, n = 1, default = NA), NA)) %>%
as.data.frame()
#> id time x lag1_x
#> 1 1 1 1.852343 NA
#> 2 1 2 2.710538 1.852343
#> 3 1 3 2.700785 2.710538
#> 4 1 4 2.588489 2.700785
#> 5 1 7 3.252223 NA
#> 6 1 8 2.108079 3.252223
#> 7 1 10 3.435683 NA
#> 8 2 3 1.762462 NA
#> 9 2 4 2.775732 1.762462
#> 10 2 6 3.377396 NA
#> 11 2 9 3.133336 NA
#> 12 2 10 3.804190 3.133336
#> 13 2 11 2.942893 3.804190
#> 14 2 14 3.503608 NA
R create lag variable in unbalance time series dataframe
We can use an ifelse
statement. So, if the rows are 1 day apart (here I convert to a time format using lubridate
), then return lag
value
and if not return NA
.
library(tidyverse)
library(lubridate)
df %>%
arrange(ID, TIME) %>%
group_by(ID) %>%
mutate(LAG = ifelse(ymd(TIME) - lag(ymd(TIME)) == 1, lag(VALUE), NA))
Output
ID TIME VALUE LAG
<int> <int> <dbl> <dbl>
1 1 20110127 3.23 NA
2 1 20110128 4.57 3.23
3 1 20110130 3.22 NA
4 1 20110131 4.33 3.22
5 1 20110201 1.22 4.33
6 1 20110202 5.23 1.22
7 2 20110127 0.86 NA
8 2 20110128 7.55 0.86
9 2 20110129 1.24 7.55
10 2 20110201 3.69 NA
Data
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
TIME = c(20110127L, 20110128L, 20110130L, 20110131L, 20110201L,
20110202L, 20110127L, 20110128L, 20110129L, 20110201L), VALUE = c(3.23,
4.57, 3.22, 4.33, 1.22, 5.23, 0.86, 7.55, 1.24, 3.69)), class = "data.frame", row.names = c(NA,
-10L))
Related Topics
How to Install the R Package Rgl on Ubuntu 9.10, Using R Version 2.12.1
Why Are Lubridate Functions So Slow When Compared with As.Posixct
R: Replace Na with Item from Vector
Error in R Gbm Function When Cv.Folds > 0
Extract Time Series of a Point ( Lon, Lat) from Netcdf in R
How to Use Superscript with Ggplot2
Constrain Multiple Sliderinput in Shiny to Sum to 100
How to Use 'Assign()' or 'Get()' on Specific Named Column of a Dataframe
Plotting Functions on Top of Datapoints in R
How to Insert Pictures into Each Individual Bar in a Ggplot Graph
Different Font Faces and Sizes Within Label Text Entries in Ggplot2
Trycatch with Parlapply (Parallel Package) in R
Calculate Mean by Group Using Dplyr Package
Remove a Character from the Entire Data Frame