How to Create a Lag Variable Within Each Group

How to create a lag variable within each group?

You can do this with data.table:

 library(data.table)
 # Lag `value` within each group: NA for the group's first row, followed by
 # every value of the group except its last one.
 data[, lag.value := c(NA, head(value, -1L)), by = groups]
  data
 #   time groups       value   lag.value
 #1:    1      a  0.02779005          NA
 #2:    2      a  0.88029938  0.02779005
 #3:    3      a -1.69514201  0.88029938
 #4:    1      b -1.27560288          NA
 #5:    2      b -0.65976434 -1.27560288
 #6:    3      b -1.37804943 -0.65976434
 #7:    4      b  0.12041778 -1.37804943

For multiple columns:

# nm1: every column whose name starts with "value"; nm2: the matching
# "lag.<column>" names for the lagged copies.
nm1 <- grep("^value", names(data), value = TRUE)
nm2 <- paste0("lag.", nm1)
# Per group and per column: NA first, then the column without its last element.
data[, (nm2) := lapply(.SD, function(col) c(NA, head(col, -1L))),
     by = groups, .SDcols = nm1]
 data
#    time groups      value     value1      value2  lag.value lag.value1
#1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
#2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
#3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
#4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
#5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
#6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
#7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
#    lag.value2
#1:          NA
#2:  1.12493092
#3: -0.04493361
#4:          NA
#5:  0.94383621
#6:  0.82122120
#7:  0.59390132

Update

As of data.table v1.9.5, we can use shift() with type = "lag" or type = "lead". The default type is "lag".

# One-step lag of every nm1 column within each group; n and type are the
# shift() defaults, written out explicitly.
data[, (nm2) := shift(.SD, n = 1L, type = "lag"), by = groups, .SDcols = nm1]
#   time groups      value     value1      value2  lag.value lag.value1
#1:    1      b -0.6264538  0.7383247  1.12493092         NA         NA
#2:    2      b  0.1836433  0.5757814 -0.04493361 -0.6264538  0.7383247
#3:    3      b -0.8356286 -0.3053884 -0.01619026  0.1836433  0.5757814
#4:    1      a  1.5952808  1.5117812  0.94383621         NA         NA
#5:    2      a  0.3295078  0.3898432  0.82122120  1.5952808  1.5117812
#6:    3      a -0.8204684 -0.6212406  0.59390132  0.3295078  0.3898432
#7:    4      a  0.4874291 -2.2146999  0.91897737 -0.8204684 -0.6212406
#    lag.value2
#1:          NA
#2:  1.12493092
#3: -0.04493361
#4:          NA
#5:  0.94383621
#6:  0.82122120
#7:  0.59390132

If you need the reverse (the next value instead of the previous one), use type = "lead".

# Names for the lead columns: "lead.<column>" for every column in nm1.
nm3 <- paste0("lead.", nm1)

Using the original dataset

  # One-step lead of every nm1 column within each group: each row receives the
  # following row's value, and the last row of a group gets NA.
  data[, (nm3) := shift(.SD, n = 1L, type = "lead"), by = groups, .SDcols = nm1]
  #  time groups      value     value1      value2 lead.value lead.value1
  #1:    1      b -0.6264538  0.7383247  1.12493092  0.1836433   0.5757814
  #2:    2      b  0.1836433  0.5757814 -0.04493361 -0.8356286  -0.3053884
  #3:    3      b -0.8356286 -0.3053884 -0.01619026         NA          NA
  #4:    1      a  1.5952808  1.5117812  0.94383621  0.3295078   0.3898432
  #5:    2      a  0.3295078  0.3898432  0.82122120 -0.8204684  -0.6212406
  #6:    3      a -0.8204684 -0.6212406  0.59390132  0.4874291  -2.2146999
  #7:    4      a  0.4874291 -2.2146999  0.91897737         NA          NA
 #   lead.value2
 #1: -0.04493361
 #2: -0.01619026
 #3:          NA
 #4:  0.82122120
 #5:  0.59390132
 #6:  0.91897737
 #7:          NA

data

 # Reproducible example data: group "b" has 3 rows and group "a" has 4, with
 # `time` counting from 1 inside each group.
 set.seed(1)
 data <- data.table(
   time   = c(seq_len(3), seq_len(4)),
   groups = rep(c("b", "a"), times = c(3, 4)),
   value  = rnorm(7),
   value1 = rnorm(7),
   value2 = rnorm(7)
 )

R: Group-by lag variable generating different within-group lag values

We can create a subset of data frame based on unique qy and id, create the lag columns value_t1 and value_t2, and then merge back to the original data frame.

library(dplyr)
library(data.table)
library(lubridate)

# Build the example data frame: three quarters with five rows each.
set.seed(123)

v2 <- sample.int(100L, 15L)
df <- data.frame(
  qy = rep(c("2016-01-01", "2016-04-01", "2016-10-01"), each = 5),
  id = rep(c("a", "a", "b", "b", "c"), times = 3),
  value_t = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  value2_t = v2
)
# Parse the quarter strings into dates, then order the rows by id and quarter.
df <- df %>%
  mutate(qy = ymd(qy)) %>%
  arrange(id, qy)

# Compute the lags on one row per (id, qy), so duplicated quarters do not
# count as separate periods, then attach them to every original row.
df2 <- left_join(
  df,
  df %>%
    distinct(id, qy, .keep_all = TRUE) %>%
    group_by(id) %>%
    mutate(
      value_t1 = dplyr::lag(value_t, n = 1L),
      value_t2 = dplyr::lag(value_t, n = 2L)
    ) %>%
    ungroup() %>%
    # Keep only the join keys and the new lag columns.
    select(-value_t, -value2_t),
  by = c("qy", "id")
)

df2
#            qy id value_t value2_t value_t1 value_t2
# 1  2016-01-01  a       0       29       NA       NA
# 2  2016-01-01  a       0       79       NA       NA
# 3  2016-04-01  a       1        5        0       NA
# 4  2016-04-01  a       1       50        0       NA
# 5  2016-10-01  a       0       87        1        0
# 6  2016-10-01  a       0       98        1        0
# 7  2016-01-01  b       1       41       NA       NA
# 8  2016-01-01  b       1       86       NA       NA
# 9  2016-04-01  b       0       83        1       NA
# 10 2016-04-01  b       0       51        1       NA
# 11 2016-10-01  b       1       60        0        1
# 12 2016-10-01  b       1       94        0        1
# 13 2016-01-01  c       0       91       NA       NA
# 14 2016-04-01  c       0       42        0       NA
# 15 2016-10-01  c       1        9        0        0

Create lag variable with conditions and group by id

Is there a way to return, for each fiscal year, the last monthly stock price of the previous fiscal year?

# One row per distinct (id, fyear, fyear_start, fyear_end) combination in dt.
out = unique(dt[, .(id, fyear, fyear_start, fyear_end)])

# For every row of `out`, look up `dt` on (id, prc_month_end), using
# fyear_start - 1L as the lookup value for prc_month_end. roll=TRUE rolls that
# lookup value to the nearest earlier prc_month_end in `dt`; x.prc is the prc
# column of `dt` for the matched row, which is assigned to out$prc_end.
# NOTE(review): fyear_start - 1L is "one day earlier" only if fyear_start is a
# Date (the printed output suggests it is) — confirm.
out[, prc_end := {
  dt[.(id = .SD$id, prc_month_end = .SD$fyear_start - 1L), on=.(id, prc_month_end), roll=TRUE, x.prc]
}]

      id fyear fyear_start  fyear_end prc_end
1: 59328  2001  2001-01-01 2001-12-31      NA
2: 59328  2002  2002-01-01 2002-12-31   31.45
3: 59328  2003  2003-01-01 2003-12-31   15.57
4: 61241  2001  2000-07-01 2001-06-30      NA
5: 61241  2002  2001-07-01 2002-06-30   15.86
6: 61241  2003  2002-07-01 2003-06-30    6.46

This is a rolling update join: For rows of table out

Construct the lookup vectors .(id, fyear_start - 1) using .SD = out, the subset of data
Lookup rows of dt, "rolling" the last vector, fyear_start - 1, to the nearest earlier date
Take matched values of x.prc, the prc column from dt

The notation x.* comes from the x[i] join/lookup syntax. For more details, see ?data.table.

lag multiple variables grouped by columns

Use groupby(...).shift for each lag, then join the results to the original frame with pd.concat:

# Lags to generate; each one adds columns named 'lag<k><column>'.
lag = [1, 2]
# Select the columns with a list (double brackets): indexing a GroupBy with a
# bare tuple of names was deprecated in pandas 1.0 and raises in current
# versions. Each shifted frame is lagged within 'grp' and placed next to the
# original columns.
df = pd.concat(
    [df] + [
        df.groupby('grp')[['var1', 'var2', 'var3']].shift(x).add_prefix('lag' + str(x))
        for x in lag
    ],
    axis=1,
)

R - Computations with lag variable by group

There are a couple of problems in your code. First, create your data.frame like this:

# Combine the vectors column-wise; each column is named after its vector.
data.df <- data.frame(
  origin = origin,
  dest = dest,
  year = year,
  type = type,
  a = a,
  b = b
)

This will retain the class of all the vectors. Note that if you don't want origin and dest to be factors, just use the argument stringsAsFactors = FALSE in the data.frame() function.

Next, create your new variable as follows:

# Sort by year so that, inside each origin/dest/type group, the lagged `a`
# comes from the preceding row in year order; new_var is that change in `a`
# multiplied by `b`.
data.df2 <- data.df %>%
  group_by(origin, dest, type) %>%
  arrange(year) %>%
  mutate(new_var = (a - dplyr::lag(a, n = 1L)) * b) %>%
  ungroup()

Here, new_var is the variable that you want. You are right in that dplyr doesn't know that the lagged value is from the previous time period. Therefore, you have to use arrange(year).

How to get the lagged values of a variable based on groups with pandas?

Use Series.shift to get the previous row's value, mask the positions where it equals the current value, and then fill the resulting missing values by forward and back filling:

# s holds the previous row's value. Keep it only where it differs from the
# current value, then fill the gaps forward (and backward for the leading rows).
s = toy_df['var'].shift(1)
toy_df['new'] = s.where(toy_df['var'].ne(s)).ffill().bfill()
print(toy_df)
    var  new
0     1  1.0
1     1  1.0
2     1  1.0
3     2  1.0
4     2  1.0
5     3  2.0
6     1  3.0
7     1  3.0
8     2  1.0
9     4  2.0
10    4  2.0
11    4  2.0

If you want to convert the values to integers:

# Same as above, with the filled result cast to int once no gaps remain.
s = toy_df['var'].shift(1)
toy_df['new'] = s.where(toy_df['var'].ne(s)).ffill().bfill().astype(int)
print(toy_df)
 var  new
0     1    1
1     1    1
2     1    1
3     2    1
4     2    1
5     3    2
6     1    3
7     1    3
8     2    1
9     4    2
10    4    2
11    4    2

R Faster way to create lag by entire group using data.table

Here is another option:

# Within each run of equal `val`, keep only the index of the run's first row
# (the other positions become NA), carry that index forward with
# nafill(..., "locf"), and use the filled indices to pick from shift(val), so
# every row of a run receives the value that precedes the run.
# NOTE(review): there is no `by` here, so the lag runs over the whole table,
# whereas the `rle` alternative in the timing code uses by = ID — confirm the
# two are meant to agree at group boundaries.
dt[, val_lag := shift(val)[nafill(replace(seq.int(.N), rowid(rleid(val)) > 1L, NA_integer_), "locf")]]

timing code:

library(data.table)
set.seed(0L)
nr <- 1e6
ng <- 1e5
# nr rows with IDs drawn from 1..ng; `val` is stored as character.
dt <- data.table(
  ID = sample.int(ng, nr, replace = TRUE),
  val = as.character(sample.int(nr, nr, replace = TRUE))
)
setorder(dt, ID, val)

# Compare the index-filling approach (opt) with the per-ID rle approach (rle).
microbenchmark::microbenchmark(
  opt = dt[, val_lag := shift(val)[nafill(replace(seq.int(.N), rowid(rleid(val)) > 1L, NA_integer_), "locf")]],
  rle = dt[, val_lag := with(rle(val), rep(c(NA, head(values, -1)), lengths)), by = ID],
  times = 3L
)

timings:

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
  opt  133.8857  159.8922  265.2029  185.8987  330.8614  475.8242     3
  rle 3097.6005 3123.5422 3193.2654 3149.4839 3241.0978 3332.7117     3

edit: added an example of what is happening:

index         |    1    2    3    4    5    6    7    8    9   10
value         |    a    a    a    b    b    c    c    c    d    d

shifted (s)   |   NA    a    a    a    b    b    c    c    c    d
rowid+rleid   |    1    2    3    1    2    1    2    3    1    2
replace       |    1   NA   NA    4   NA    6   NA   NA    9   NA <In ?nafill, Only double and integer data types are currently supported. Hence, nafill the indices before accessing>
nafill        |    1    1    1    4    4    6    6    6    9    9
using s above | s[1] s[1] s[1] s[4] s[4] s[6] s[6] s[6] s[9] s[9]

How to Create a Lag Variable Within Each Group