How to Create a Lag Variable Within Each Group

How to create a lag variable within each group?

You can do this with data.table:

 library(data.table)
# Within each group, move `value` down by one row: the first row of the group
# gets NA and the group's last value is dropped.
data[, lag.value := c(NA, head(value, -1L)), by = groups]
# Print the table to inspect the new column.
data
# time groups value lag.value
#1: 1 a 0.02779005 NA
#2: 2 a 0.88029938 0.02779005
#3: 3 a -1.69514201 0.88029938
#4: 1 b -1.27560288 NA
#5: 2 b -0.65976434 -1.27560288
#6: 3 b -1.37804943 -0.65976434
#7: 4 b 0.12041778 -1.37804943

For multiple columns:

# Columns to lag (names starting with "value") and the names of their lags.
nm1 <- grep("^value", colnames(data), value = TRUE)
nm2 <- paste0("lag.", nm1)
# Lag every selected column within its group: NA first, last value dropped.
data[,
  (nm2) := lapply(.SD, function(col) c(NA, head(col, -1L))),
  by = groups,
  .SDcols = nm1
]
data
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132

Update

From data.table versions >= v1.9.5, we can use shift() with type = "lag" or type = "lead". By default, the type is "lag".

# shift() with its defaults spelled out: lag each .SDcols column by one row
# within every group and store the results under the names in nm2.
data[, (nm2) := shift(.SD, n = 1L, type = "lag"), by = groups, .SDcols = nm1]
# time groups value value1 value2 lag.value lag.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 NA NA
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247
#3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814
#4: 1 a 1.5952808 1.5117812 0.94383621 NA NA
#5: 2 a 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432
#7: 4 a 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406
# lag.value2
#1: NA
#2: 1.12493092
#3: -0.04493361
#4: NA
#5: 0.94383621
#6: 0.82122120
#7: 0.59390132

If you need the reverse (the next value instead of the previous one), use type = "lead".

# Names for the lead columns: "lead." followed by each source column name.
nm3 <- paste0("lead.", nm1)

Using the original dataset

  # Lead each .SDcols column by one row within every group (the last row of a
  # group gets NA) and store the results under the names in nm3.
  data[, (nm3) := shift(.SD, n = 1L, type = 'lead'), by = groups, .SDcols = nm1]
# time groups value value1 value2 lead.value lead.value1
#1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814
#2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884
#3: 3 b -0.8356286 -0.3053884 -0.01619026 NA NA
#4: 1 a 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432
#5: 2 a 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406
#6: 3 a -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999
#7: 4 a 0.4874291 -2.2146999 0.91897737 NA NA
# lead.value2
#1: -0.04493361
#2: -0.01619026
#3: NA
#4: 0.82122120
#5: 0.59390132
#6: 0.91897737
#7: NA

Data used in the multiple-column examples:

 set.seed(1)
# Seven rows: group "b" has times 1-3 and group "a" has times 1-4; the three
# value columns are drawn with rnorm() in the order value, value1, value2.
data <- data.table(
  time = c(seq_len(3), seq_len(4)),
  groups = rep(c("b", "a"), times = c(3, 4)),
  value = rnorm(7),
  value1 = rnorm(7),
  value2 = rnorm(7)
)

R: Group-by lag variable generating different within-group lag values

We can create a subset of the data frame with one row per unique combination of qy and id, create the lag columns value_t1 and value_t2 on that subset, and then merge them back into the original data frame.

library(dplyr)
library(data.table)
library(lubridate)

# Example data: three quarter dates, each carrying the same five ids
set.seed(123)

v2 <- sample(1:100, 15)
df <- data.frame(
  qy = rep(c('2016-01-01', '2016-04-01', '2016-10-01'), each = 5),
  id = rep(c('a', 'a', 'b', 'b', 'c'), times = 3),
  value_t = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  value2_t = v2
)
# Parse the quarter strings as dates, then order the rows by id and quarter
df$qy <- ymd(df$qy)
df <- arrange(df, id, qy)

# Keep one row per (id, qy), lag value_t by one and by two rows within each
# id, drop the original value columns, and attach the lag columns to every
# row of the original data frame
df2 <- df %>%
  distinct(id, qy, .keep_all = TRUE) %>%
  group_by(id) %>%
  mutate(
    value_t1 = lag(value_t, n = 1L),
    value_t2 = lag(value_t, n = 2L)
  ) %>%
  ungroup() %>%
  select(-value_t, -value2_t) %>%
  left_join(df, ., by = c("qy", "id"))

df2
# qy id value_t value2_t value_t1 value_t2
# 1 2016-01-01 a 0 29 NA NA
# 2 2016-01-01 a 0 79 NA NA
# 3 2016-04-01 a 1 5 0 NA
# 4 2016-04-01 a 1 50 0 NA
# 5 2016-10-01 a 0 87 1 0
# 6 2016-10-01 a 0 98 1 0
# 7 2016-01-01 b 1 41 NA NA
# 8 2016-01-01 b 1 86 NA NA
# 9 2016-04-01 b 0 83 1 NA
# 10 2016-04-01 b 0 51 1 NA
# 11 2016-10-01 b 1 60 0 1
# 12 2016-10-01 b 1 94 0 1
# 13 2016-01-01 c 0 91 NA NA
# 14 2016-04-01 c 0 42 0 NA
# 15 2016-10-01 c 1 9 0 0

Create lag variable with conditions and group by id

Is there a way to return for a fiscal year the last monthly stock price at previous fiscal year?

# One row per distinct (id, fiscal year) with its start and end dates.
out <- unique(dt[, .(id, fyear, fyear_start, fyear_end)])

# For every row of `out`, look up `dt` at the day before the fiscal year
# starts, rolling back to the latest earlier prc_month_end, and keep dt's prc.
out[, prc_end := {
  dt[
    .(id = .SD$id, prc_month_end = .SD$fyear_start - 1L),
    on = .(id, prc_month_end),
    roll = TRUE,
    x.prc
  ]
}]

id fyear fyear_start fyear_end prc_end
1: 59328 2001 2001-01-01 2001-12-31 NA
2: 59328 2002 2002-01-01 2002-12-31 31.45
3: 59328 2003 2003-01-01 2003-12-31 15.57
4: 61241 2001 2000-07-01 2001-06-30 NA
5: 61241 2002 2001-07-01 2002-06-30 15.86
6: 61241 2003 2002-07-01 2003-06-30 6.46

This is a rolling update join. For each row of the table out:

  • Construct the lookup vectors .(id, fyear_start - 1) using .SD = out, the subset of data
  • Lookup rows of dt, "rolling" the last vector, fyear_start - 1, to the nearest earlier date
  • Take matched values of x.prc, the prc column from dt

The notation x.* comes from the x[i] join/lookup syntax. For more details, see ?data.table.

lag multiple variables grouped by columns

Use groupby with shift once per lag, then concat the shifted columns onto the original frame:

# Lags to compute; each one adds a set of columns prefixed 'lag<n>'.
lag = [1, 2]
# Select the columns with a list (double brackets). Indexing a groupby with a
# bare tuple of names was deprecated in pandas 1.0 and raises in pandas 2.0.
df = pd.concat(
    [df] + [
        df.groupby('grp')[['var1', 'var2', 'var3']].shift(x).add_prefix('lag' + str(x))
        for x in lag
    ],
    axis=1,
)

R - Computations with lag variable by group

There are a couple of problems in your code. First, create your data.frame like this:

# Build the data frame directly from the vectors, one column per vector.
data.df <- data.frame(origin, dest, year, type, a, b)

This will retain the class of all the vectors. Note that if you don't want origin and dest to be factors, just use the argument stringsAsFactors = FALSE in the data.frame() function (from R 4.0.0 onwards this is already the default).

Next, create your new variable as follows:

# Order the rows by year first so that lag() picks up the previous year's `a`
# within each origin/dest/type group, then compute the change in `a` times `b`.
data.df2 <- data.df %>%
  arrange(year) %>%
  group_by(origin, dest, type) %>%
  mutate(new_var = (a - lag(a)) * b) %>%
  ungroup()

Here, new_var is the variable that you want. You are right in that dplyr doesn't know that the lagged value is from the previous time period. Therefore, you have to use arrange(year).

How to get the lagged values of a variable based on groups with pandas?

Use Series.shift to get the previous value, mask it wherever it equals the current value, and then fill the resulting missing values by forward and back filling:

# Previous row's value at every position.
s = toy_df['var'].shift()
# Keep it only where the value changed, then fill the gaps forward and backward.
toy_df['new'] = s.where(toy_df['var'].ne(s)).ffill().bfill()
print(toy_df)
var new
0 1 1.0
1 1 1.0
2 1 1.0
3 2 1.0
4 2 1.0
5 3 2.0
6 1 3.0
7 1 3.0
8 2 1.0
9 4 2.0
10 4 2.0
11 4 2.0

If you want to convert the values to integers:

# Previous row's value at every position.
s = toy_df['var'].shift()
# Keep it only where the value changed, fill the gaps in both directions,
# and cast the filled column to integers.
toy_df['new'] = s.where(toy_df['var'].ne(s)).ffill().bfill().astype(int)
print(toy_df)
var new
0 1 1
1 1 1
2 1 1
3 2 1
4 2 1
5 3 2
6 1 3
7 1 3
8 2 1
9 4 2
10 4 2
11 4 2

R Faster way to create lag by entire group using data.table

Here is another option:

# Give every row the value that precedes its run of equal `val`s:
#   rowid(rleid(val)) > 1L  flags rows that are not the first of their run,
#   replace(...)            sets those row numbers to NA,
#   nafill(..., "locf")     carries each run's first row number forward,
#   shift(val)[...]         reads the lagged value at that row number.
# NOTE(review): there is no `by =` here, so shift() runs over the whole table,
# whereas the `rle` alternative in the timing code groups by ID. Confirm that
# carrying a value across ID boundaries is intended.
dt[, val_lag := shift(val)[nafill(replace(seq.int(.N), rowid(rleid(val)) > 1L, NA_integer_), "locf")]]

timing code:

library(data.table)
# Benchmark data: nr rows with IDs sampled from 1..ng, sorted by ID then val.
set.seed(0L)
nr <- 1e6
ng <- 1e5
dt <- data.table(
  ID = sample(ng, nr, replace = TRUE),
  val = as.character(sample(nr, nr, replace = TRUE))
)
setorder(dt, ID, val)

# Run each approach three times.
microbenchmark::microbenchmark(
  opt = dt[, val_lag := shift(val)[nafill(replace(seq.int(.N), rowid(rleid(val)) > 1L, NA_integer_), "locf")]],
  rle = dt[, val_lag := with(rle(val), rep(c(NA, head(values, -1)), lengths)), by = ID],
  times = 3L
)

timings:

Unit: milliseconds
expr min lq mean median uq max neval
opt 133.8857 159.8922 265.2029 185.8987 330.8614 475.8242 3
rle 3097.6005 3123.5422 3193.2654 3149.4839 3241.0978 3332.7117 3

edit: added an example of what is happening:

index         |    1    2    3    4    5    6    7    8    9   10
value | a a a b b c c c d d

shifted (s) | NA a a a b b c c c d
rowid+rleid | 1 2 3 1 2 1 2 3 1 2
replace | 1 NA NA 4 NA 6 NA NA 9 NA <In ?nafill, Only double and integer data types are currently supported. Hence, nafill the indices before accessing>
nafill | 1 1 1 4 4 6 6 6 9 9
using s above | s[1] s[1] s[1] s[4] s[4] s[6] s[6] s[6] s[9] s[9]


Related Topics



Leave a reply



Submit