Rolling Regression by Group in the Tidyverse

rolling regression by group in the tidyverse?

Define a function Coef whose argument is formed from cbind(y, x) and which regresses y on x with an intercept, returning the coefficients. Then apply rollapplyr using the current and prior rows over each group. If by last you meant the 2 prior rows to the current row, i.e. exclude the current row, then replace 2 with list(-seq(2)) as an argument to rollapplyr.

Coef <- . %>% as.data.frame %>% lm %>% coef

mydata %>% 
  group_by(group) %>% 
  do(cbind(reg_col = select(., y, x) %>% rollapplyr(2, Coef, by.column = FALSE, fill = NA),
           date_col = select(., date))) %>%
  ungroup

giving:

# A tibble: 8 x 4
  group `reg_col.(Intercept)` reg_col.x date      
  <chr>                 <dbl>     <dbl> <date>    
1 a      NA                      NA     2016-06-01
2 a       0                       0.500 2016-06-02
3 a       0                       0.500 2016-06-03
4 a       0                       0.500 2016-06-04
5 b      NA                      NA     2016-06-03
6 b       0.00000000000000126     0.333 2016-06-04
7 b     - 0.00000000000000251     0.333 2016-06-05
8 b       0                       0.333 2016-06-06

Variation

A variation of the above would be:

mydata %>% 
       group_by(group) %>% 
       do(select(., date, y, x) %>% 
          read.zoo %>% 
          rollapplyr(2, Coef, by.column = FALSE, fill = NA) %>%
          fortify.zoo(names = "date")
       ) %>% 
       ungroup

Slope Only

If only the slope is needed there are further simplifications possible. We use the fact that the slope equals cov(x, y) / var(x).

slope <- . %>% { cov(.[, 2], .[, 1]) / var(.[, 2])}
mydata %>%
       group_by(group) %>%
       mutate(slope = rollapplyr(cbind(y, x), 2, slope, by.column = FALSE, fill = NA)) %>%
       ungroup

Multiple linear regression by group in a rolling window in R

Use group_modify and use rollapplyr with the by.column = FALSE argument so that rsd is applied to all columns at once rather than one at a time.

Note that if you use width 3 with two predictors and an intercept the residuals will necessarily be all zero so we changed the width to 5.

library(dplyr, exclude = c("lag", "filter"))
library(zoo)

width <- 5

df %>% 
  group_by(Group) %>%
  group_modify(~ {
      cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
          by.column = FALSE))
  }) %>%
  ungroup

Rolling Window Regression by group in R (with dates)

You can create multiple linear models for a given interval of dates like this:

library(tidyverse)

# example data
set.seed(1337)
n_dates <- 10
data <- tibble(
  date = runif(100, min = 1, max = n_dates) %>% floor(),
  x1 = runif(100)**2,
  x2 = runif(100) * 2,
  x3 = runif(100) + 2,
  y = x1 + 2 * x2 + runif(100)
) %>%
  arrange(date)
data
#> # A tibble: 100 × 5
#>     date         x1    x2    x3     y
#>    <dbl>      <dbl> <dbl> <dbl> <dbl>
#>  1     1 0.754      0.700  2.21 2.79 
#>  2     1 0.0230     1.97   2.70 4.89 
#>  3     1 0.388      0.500  2.21 1.54 
#>  4     1 0.225      0.135  2.87 0.849
#>  5     1 0.00000810 0.139  2.22 1.12 
#>  6     1 0.255      0.893  2.21 2.25 
#>  7     1 0.402      1.37   2.06 3.51 
#>  8     1 0.00275    0.363  2.68 0.984
#>  9     2 0.238      1.68   2.53 3.98 
#> 10     2 0.0309     1.47   2.05 3.69 
#> # … with 90 more rows

# number of rows per day
data %>% count(date)
#> # A tibble: 9 × 2
#>    date     n
#>   <dbl> <int>
#> 1     1     8
#> 2     2    10
#> 3     3    15
#> 4     4     8
#> 5     5    10
#> 6     6    10
#> 7     7    12
#> 8     8     7
#> 9     9    20

# size of rolling window in days
window_size <- 3

models <- tibble(
  from = seq(n_dates),
  to = from + window_size - 1
) %>%
  mutate(
    data = from %>% map2(to, ~ data %>% filter(date >= .x & date <= .y)),
    model = data %>% map(possibly(~ lm(y ~ x1 + x2 + x3, data = .x), NA))
  )
models
#> # A tibble: 10 × 4
#>     from    to data              model    
#>    <int> <dbl> <list>            <list>   
#>  1     1     3 <tibble [33 × 5]> <lm>     
#>  2     2     4 <tibble [33 × 5]> <lm>     
#>  3     3     5 <tibble [33 × 5]> <lm>     
#>  4     4     6 <tibble [28 × 5]> <lm>     
#>  5     5     7 <tibble [32 × 5]> <lm>     
#>  6     6     8 <tibble [29 × 5]> <lm>     
#>  7     7     9 <tibble [39 × 5]> <lm>     
#>  8     8    10 <tibble [27 × 5]> <lm>     
#>  9     9    11 <tibble [20 × 5]> <lm>     
#> 10    10    12 <tibble [0 × 5]>  <lgl [1]>

models %>%
  filter(!is.na(model)) %>%
  transmute(
    from, to,
    coeff = model %>% map(coefficients),
    r2 = model %>% map_dbl(~ .x %>% summary() %>% pluck("r.squared"))
  ) %>%
  unnest_wider(coeff)

# A tibble: 9 x 7
#   from    to `(Intercept)`    x1    x2      x3    r2
#  <int> <dbl>         <dbl> <dbl> <dbl>   <dbl> <dbl>
#1     1     3         0.601 0.883  2.07 -0.0788 0.970
#2     2     4         0.766 0.965  2.01 -0.141  0.965
#3     3     5         0.879 0.954  1.94 -0.165  0.953

Another way of subseting groups is to use nest:

# get all observations from day 3 to 5
data %>% arrange(date) %>% nest(-date) %>% slice(3:5) %>% unnest()

rolling regression with confidence interval (tidyverse)

Try this:

library(dplyr)
library(zoo)

# use better example
set.seed(123)
mydata2 <- mydata %>% mutate(y = jitter(y))

stats <- function(x) {
  fm <- lm(as.data.frame(x))
  slope <- coef(fm)[[2]]
  ci <- confint(fm)[2, ]
  c(slope = slope, conf.lower = ci[[1]], conf.upper = ci[[2]])
}

roll <- function(x) rollapplyr(x, 3, stats, by.column = FALSE, fill = NA)
  
mydata2 %>%
  group_by(group) %>%
  do(cbind(., select(., y, x) %>% roll)) %>%
  ungroup

giving:

# A tibble: 8 x 7
  group     y     x date        slope conf.lower conf.upper
  <chr> <dbl> <dbl> <date>      <dbl>      <dbl>      <dbl>
1 a     0.915     2 2016-06-01 NA         NA         NA    
2 a     2.12      4 2016-06-02 NA         NA         NA    
3 a     2.96      6 2016-06-03  0.512     -0.133      1.16 
4 a     4.15      8 2016-06-04  0.509     -0.117      1.14 
5 b     2.18      6 2016-06-03 NA         NA         NA    
6 b     2.82      9 2016-06-04 NA         NA         NA    
7 b     4.01     12 2016-06-05  0.306     -0.368      0.980
8 b     5.16     15 2016-06-06  0.390      0.332      0.448

Update

Since this question first appeared dplyr got group_modify which can be used to replace the do. The ?group_modify help page says: group_modify() is an evolution of do(), if you have used that before.

mydata2 %>%
  group_by(group) %>%
  group_modify(~ cbind(., select(., y, x) %>% roll)) %>%
  ungroup

Rolling time-series regressions by group

If you want to run OLS regressions over groups, you can try to use the dplyr package. Using the do() function will run your models and store them in a variable called model.

library(dplyr)
library(magrittr)    
## Fit models
fitted_model <- Final %>%
    group_by(Ticker) %>%
    do(model = lm(Returns ~ Market_ret, data = .))

To extract the coefficients check into the broom package in R. Running tidy(model) should do the trick.

Do a rolling regression only on specific dates

Since runner version 0.3.5 you can specify x = df and do running regression on windows from data.frame. Because you mutate df2 you have to subset relevant part of df by x = df[df$Company.name == Company.name2,] - you have to do the same with idx.

running_regression_intercept <- function(x) {
  coef(lm(stock_return ~ market_return, data = x))[1]
}

running_regression_slope <- function(x) {
  coef(lm(stock_return ~ market_return, data = x))[2]
}

library(dplyr)
library(runner)
df2 %>%
  group_by(Company.name2) %>%
  mutate(
    intercept = runner(
      x = df[df$Company.name ==  Company.name2[1], ],
      k = "180 days",
      lag = "5 days",
      idx = df$Date[df$Company.name == Company.name2[1]],
      at = Event_date,
      f = running_regression_intercept,
    ),
    slope = runner(
      x = df[df$Company.name == Company.name2[1], ],
      k = "180 days",
      lag = "5 days",
      idx = df$Date[df$Company.name == Company.name2[1]],
      at = Event_date,
      f = running_regression_slope
    )
  )

# Company.name2 Event_date    alpha     beta
#     <fct>         <date>        <dbl>    <dbl>
#   1 AAPL          2017-01-12  0.0114   0.00488
#   2 AAPL          2017-07-31 -0.0654   0.00574
#   3 AAPL          2019-02-27 -0.0861   0.0310 
#   4 AAPL          2018-09-06  0.0405  -0.0630 
#   5 AAPL          2015-09-03 -0.121   -0.0246 
#   6 AAPL          2018-11-20 -0.0283  -0.0254 
#   7 AAPL          2015-07-03 -0.116   -0.0186 
#   8 AAPL          2015-02-03  0.102    0.0409 
#   9 AAPL          2017-03-16 -0.0157   0.0124 
#   10 AAPL          2019-06-08 -0.00302  0.0532

I needed to modify your data a bit because format changed Event.Date from Date to character.

Date <- seq(from = as.Date("2014-01-01"), 
            to = as.Date("2019-12-31"), 
            by = 'day')
market_return <- c(rnorm(2191))

AAPL <- data.frame(
  Company.name = "AAPL", 
  Date =  Date,
  market_return = market_return
)

MSFT <- data.frame(
  Company.name = "MSFT", 
  Date = Date,
  market_return = market_return
)

df <- rbind(AAPL, MSFT)
df$stock_return <- c(rnorm(4382))
df <- df[order(df$Date),]

df2 <- data.frame(
  Company.name2 = c(replicate(450, "AAPL"), replicate(450, "MSFT")), 
  Event_date = sample(
    seq(as.Date('2015/01/01'), 
        as.Date('2019/12/31'), 
        by="day"),
    size =  900)
  )

Rolling Regression by Group in the Tidyverse