rolling regression by group in the tidyverse?
Define a function Coef
whose argument is formed from cbind(y, x)
and which regresses y on x with an intercept, returning the coefficients. Then apply rollapplyr
using the current and prior rows over each group. If by last you meant the 2 prior rows to the current row, i.e. exclude the current row, then replace 2 with list(-seq(2))
as an argument to rollapplyr
.
Coef <- . %>% as.data.frame %>% lm %>% coef
mydata %>%
group_by(group) %>%
do(cbind(reg_col = select(., y, x) %>% rollapplyr(2, Coef, by.column = FALSE, fill = NA),
date_col = select(., date))) %>%
ungroup
giving:
# A tibble: 8 x 4
group `reg_col.(Intercept)` reg_col.x date
<chr> <dbl> <dbl> <date>
1 a NA NA 2016-06-01
2 a 0 0.500 2016-06-02
3 a 0 0.500 2016-06-03
4 a 0 0.500 2016-06-04
5 b NA NA 2016-06-03
6 b 0.00000000000000126 0.333 2016-06-04
7 b - 0.00000000000000251 0.333 2016-06-05
8 b 0 0.333 2016-06-06
Variation
A variation of the above would be:
mydata %>%
group_by(group) %>%
do(select(., date, y, x) %>%
read.zoo %>%
rollapplyr(2, Coef, by.column = FALSE, fill = NA) %>%
fortify.zoo(names = "date")
) %>%
ungroup
Slope Only
If only the slope is needed there are further simplifications possible. We use the fact that the slope equals cov(x, y) / var(x)
.
slope <- . %>% { cov(.[, 2], .[, 1]) / var(.[, 2])}
mydata %>%
group_by(group) %>%
mutate(slope = rollapplyr(cbind(y, x), 2, slope, by.column = FALSE, fill = NA)) %>%
ungroup
Multiple linear regression by group in a rolling window in R
Use group_modify and use rollapplyr with the by.column = FALSE argument so that rsd is applied to all columns at once rather than one at a time.
Note that if you use width 3 with two predictors and an intercept the residuals will necessarily be all zero so we changed the width to 5.
library(dplyr, exclude = c("lag", "filter"))
library(zoo)
width <- 5
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup
Rolling Window Regression by group in R (with dates)
You can create multiple linear models for a given interval of dates like this:
library(tidyverse)
# example data
set.seed(1337)
n_dates <- 10
data <- tibble(
date = runif(100, min = 1, max = n_dates) %>% floor(),
x1 = runif(100)**2,
x2 = runif(100) * 2,
x3 = runif(100) + 2,
y = x1 + 2 * x2 + runif(100)
) %>%
arrange(date)
data
#> # A tibble: 100 × 5
#> date x1 x2 x3 y
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.754 0.700 2.21 2.79
#> 2 1 0.0230 1.97 2.70 4.89
#> 3 1 0.388 0.500 2.21 1.54
#> 4 1 0.225 0.135 2.87 0.849
#> 5 1 0.00000810 0.139 2.22 1.12
#> 6 1 0.255 0.893 2.21 2.25
#> 7 1 0.402 1.37 2.06 3.51
#> 8 1 0.00275 0.363 2.68 0.984
#> 9 2 0.238 1.68 2.53 3.98
#> 10 2 0.0309 1.47 2.05 3.69
#> # … with 90 more rows
# number of rows per day
data %>% count(date)
#> # A tibble: 9 × 2
#> date n
#> <dbl> <int>
#> 1 1 8
#> 2 2 10
#> 3 3 15
#> 4 4 8
#> 5 5 10
#> 6 6 10
#> 7 7 12
#> 8 8 7
#> 9 9 20
# size of rolling window in days
window_size <- 3
models <- tibble(
from = seq(n_dates),
to = from + window_size - 1
) %>%
mutate(
data = from %>% map2(to, ~ data %>% filter(date >= .x & date <= .y)),
model = data %>% map(possibly(~ lm(y ~ x1 + x2 + x3, data = .x), NA))
)
models
#> # A tibble: 10 × 4
#> from to data model
#> <int> <dbl> <list> <list>
#> 1 1 3 <tibble [33 × 5]> <lm>
#> 2 2 4 <tibble [33 × 5]> <lm>
#> 3 3 5 <tibble [33 × 5]> <lm>
#> 4 4 6 <tibble [28 × 5]> <lm>
#> 5 5 7 <tibble [32 × 5]> <lm>
#> 6 6 8 <tibble [29 × 5]> <lm>
#> 7 7 9 <tibble [39 × 5]> <lm>
#> 8 8 10 <tibble [27 × 5]> <lm>
#> 9 9 11 <tibble [20 × 5]> <lm>
#> 10 10 12 <tibble [0 × 5]> <lgl [1]>
models %>%
filter(!is.na(model)) %>%
transmute(
from, to,
coeff = model %>% map(coefficients),
r2 = model %>% map_dbl(~ .x %>% summary() %>% pluck("r.squared"))
) %>%
unnest_wider(coeff)
# A tibble: 9 x 7
# from to `(Intercept)` x1 x2 x3 r2
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 3 0.601 0.883 2.07 -0.0788 0.970
#2 2 4 0.766 0.965 2.01 -0.141 0.965
#3 3 5 0.879 0.954 1.94 -0.165 0.953
Another way of subseting groups is to use nest
:
# get all observations from day 3 to 5
data %>% arrange(date) %>% nest(-date) %>% slice(3:5) %>% unnest()
rolling regression with confidence interval (tidyverse)
Try this:
library(dplyr)
library(zoo)
# use better example
set.seed(123)
mydata2 <- mydata %>% mutate(y = jitter(y))
stats <- function(x) {
fm <- lm(as.data.frame(x))
slope <- coef(fm)[[2]]
ci <- confint(fm)[2, ]
c(slope = slope, conf.lower = ci[[1]], conf.upper = ci[[2]])
}
roll <- function(x) rollapplyr(x, 3, stats, by.column = FALSE, fill = NA)
mydata2 %>%
group_by(group) %>%
do(cbind(., select(., y, x) %>% roll)) %>%
ungroup
giving:
# A tibble: 8 x 7
group y x date slope conf.lower conf.upper
<chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl>
1 a 0.915 2 2016-06-01 NA NA NA
2 a 2.12 4 2016-06-02 NA NA NA
3 a 2.96 6 2016-06-03 0.512 -0.133 1.16
4 a 4.15 8 2016-06-04 0.509 -0.117 1.14
5 b 2.18 6 2016-06-03 NA NA NA
6 b 2.82 9 2016-06-04 NA NA NA
7 b 4.01 12 2016-06-05 0.306 -0.368 0.980
8 b 5.16 15 2016-06-06 0.390 0.332 0.448
Update
Since this question first appeared dplyr got group_modify
which can be used to replace the do
. The ?group_modify
help page says: group_modify() is an evolution of do(), if you have used that before.
mydata2 %>%
group_by(group) %>%
group_modify(~ cbind(., select(., y, x) %>% roll)) %>%
ungroup
Rolling time-series regressions by group
If you want to run OLS regressions over groups, you can try to use the dplyr package. Using the do() function will run your models and store them in a variable called model.
library(dplyr)
library(magrittr)
## Fit models
fitted_model <- Final %>%
group_by(Ticker) %>%
do(model = lm(Returns ~ Market_ret, data = .))
To extract the coefficients check into the broom
package in R. Running tidy(model)
should do the trick.
Do a rolling regression only on specific dates
Since runner version 0.3.5
you can specify x = df
and do running regression on windows from data.frame
. Because you mutate df2
you have to subset relevant part of df
by x = df[df$Company.name == Company.name2,]
- you have to do the same with idx
.
running_regression_intercept <- function(x) {
coef(lm(stock_return ~ market_return, data = x))[1]
}
running_regression_slope <- function(x) {
coef(lm(stock_return ~ market_return, data = x))[2]
}
library(dplyr)
library(runner)
df2 %>%
group_by(Company.name2) %>%
mutate(
intercept = runner(
x = df[df$Company.name == Company.name2[1], ],
k = "180 days",
lag = "5 days",
idx = df$Date[df$Company.name == Company.name2[1]],
at = Event_date,
f = running_regression_intercept,
),
slope = runner(
x = df[df$Company.name == Company.name2[1], ],
k = "180 days",
lag = "5 days",
idx = df$Date[df$Company.name == Company.name2[1]],
at = Event_date,
f = running_regression_slope
)
)
# Company.name2 Event_date alpha beta
# <fct> <date> <dbl> <dbl>
# 1 AAPL 2017-01-12 0.0114 0.00488
# 2 AAPL 2017-07-31 -0.0654 0.00574
# 3 AAPL 2019-02-27 -0.0861 0.0310
# 4 AAPL 2018-09-06 0.0405 -0.0630
# 5 AAPL 2015-09-03 -0.121 -0.0246
# 6 AAPL 2018-11-20 -0.0283 -0.0254
# 7 AAPL 2015-07-03 -0.116 -0.0186
# 8 AAPL 2015-02-03 0.102 0.0409
# 9 AAPL 2017-03-16 -0.0157 0.0124
# 10 AAPL 2019-06-08 -0.00302 0.0532
I needed to modify your data a bit because format changed Event.Date from Date
to character
.
Date <- seq(from = as.Date("2014-01-01"),
to = as.Date("2019-12-31"),
by = 'day')
market_return <- c(rnorm(2191))
AAPL <- data.frame(
Company.name = "AAPL",
Date = Date,
market_return = market_return
)
MSFT <- data.frame(
Company.name = "MSFT",
Date = Date,
market_return = market_return
)
df <- rbind(AAPL, MSFT)
df$stock_return <- c(rnorm(4382))
df <- df[order(df$Date),]
df2 <- data.frame(
Company.name2 = c(replicate(450, "AAPL"), replicate(450, "MSFT")),
Event_date = sample(
seq(as.Date('2015/01/01'),
as.Date('2019/12/31'),
by="day"),
size = 900)
)
Related Topics
Delete Rows Based on Multiple Conditions with Dplyr
How to Ddply() Without Sorting
R How to Extract First Row of Each Matrix Within a List
S3 Method Consistency Warning When Building R Package with Roxygen
Plotting Functions on Top of Datapoints in R
Shade (Fill or Color) Area Under Density Curve by Quantile
Add Na Value to Ggplot Legend for Continuous Data Map
R Markdown - Format Text in Code Chunk with New Lines
Generating a Heatmap That Depicts the Clusters in a Dataset Using Hierarchical Clustering in R
Shiny R - Download the Result of a Table
Find *All* Duplicated Records in Data.Table (Not All-But-One)
Saving a Data Frame as a Binary File
Data.Table VS Plyr Regression Output
R: Creating a Map of Selected Canadian Provinces and U.S. States
References Truncated in Beamer Presentation Prepared in Knitr/Rmarkdown