Rolling Mean (Moving Average) by Group/Id With Dplyr

Rolling mean (moving average) by group/id with dplyr

If you are not committed to to dplyr this should work:

get.mav <- function(bp,n=2){
  require(zoo)
  if(is.na(bp[1])) bp[1] <- mean(bp,na.rm=TRUE)
  bp <- na.locf(bp,na.rm=FALSE)
  if(length(bp)<n) return(bp)
  c(bp[1:(n-1)],rollapply(bp,width=n,mean,align="right"))  
}
test <- with(test,test[order(ID,YEAR_VISIT),])

test$BLOOD_PRESSURE_UPDATED <- 
  unlist(aggregate(BLOOD_PRESSURE~ID,test,get.mav,na.action=NULL,n=2)$BLOOD_PRESSURE)
test
#    ID AGE YEAR_VISIT BLOOD_PRESSURE TREATMENT BLOOD_PRESSURE_UPDATED
# 1   1  20       2000             NA         3               134.6667
# 2   1  21       2001            129         2               131.8333
# 3   1  22       2002            145         3               137.0000
# 4   1  22       2002            130         2               137.5000
# 5   2  23       2003             NA        NA               130.0000
# 6   2  30       2010            150         2               140.0000
# 7   2  31       2011            110         3               130.0000
# ...

This works for moving averages > 2 as well.

And here's a data.table solution, which is likely to be much faster if your dataset is large.

library(data.table)
setDT(test)     # converts test to a data.table in place
setkey(test,ID,YEAR_VISIT)
test[,BLOOD_PRESSURE_UPDATED:=as.numeric(get.mav(BLOOD_PRESSURE,2)),by=ID]
test
#    ID AGE YEAR_VISIT BLOOD_PRESSURE TREATMENT BLOOD_PRESSURE_UPDATED
#  1:  1  20       2000             NA         3               134.6667
#  2:  1  21       2001            129         2               131.8333
#  3:  1  22       2002            145         3               137.0000
#  4:  1  22       2002            130         2               137.5000
#  5:  2  23       2003             NA        NA               130.0000
#  6:  2  30       2010            150         2               140.0000
#  7:  2  31       2011            110         3               130.0000
# ...

Getting rolling average of multiple column by multiple condition, with dplyr and apply family

Do you want this? (mean_run from library(runner) used).

You can automate this process for as many variables you want. Just use their names in .cols argument of mutate(across...
To change rolling window size just change k in mean_run as per choice.

df %>% pivot_longer(!gmID, names_to = c("H_T", ".value"),
                    names_pattern = "(.+)\\.(.+)") %>%
  group_by(Team) %>%
  mutate(across(.cols = c(PTS, AST), 
                ~ runner::mean_run(x = ., k = 3, lag = 1), 
                .names = '{.col}_av')) %>%
  pivot_wider(id_cols = gmID, 
              names_from = H_T, 
              names_glue = "{H_T}_{.value}", 
              values_from = -c(gmID, H_T))

# A tibble: 20 x 11
    gmID H_Team A_Team H_PTS A_PTS H_AST A_AST H_PTS_av A_PTS_av H_AST_av A_AST_av
   <int> <chr>  <chr>  <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
 1     1 CLE    WAS       94    84    22    26     NA       NA       NA       NA  
 2     2 MIA    BOS      120   107    25    24     NA       NA       NA       NA  
 3     3 LAL    DAL       91    99    24    22     NA       NA       NA       NA  
 4     4 PHI    DEN       84    75    18    19     NA       NA       NA       NA  
 5     5 CLE    IND       88    90    18    22     94       NA       22       NA  
 6     6 DET    HOU       96   105    21    28     NA       NA       NA       NA  
 7     7 CHI    SAC       93    87    21    14     NA       NA       NA       NA  
 8     8 DAL    WAS       95    99    26    22     99       84       22       26  
 9     9 UTA    DAL      113    94    24    20     NA       97       NA       24  
10    10 PHO    CLE       85    87    16    19     NA       91       NA       20  
11    11 POR    LAL      116   106    19    21     NA       91       NA       24  
12    12 WAS    OKC       86    84    27    18     91.5     NA       24       NA  
13    13 ORL    DEN      102    89    24    22     NA       75       NA       19  
14    14 CHA    IND       90    89    18    19     NA       90       NA       22  
15    15 BOS    MIL       88    99    22    26    107       NA       24       NA  
16    16 CHI    CLE       86   115    23    34     93       89.7     21       19.7
17    17 ATL    HOU      102   109    23    22     NA      105       NA       28  
18    18 DAL    MIA      104    84    27    18     96      120       22.7     25  
19    19 CLE    UTA       88    86    23    19     96.7    113       23.7     24  
20    20 WAS    DEN      111    88    25    16     89.7     82       25       20.5

Rolling average indexed on multiple variables

I think zoo::rollmean works well here, and dplyr::group_by can handle as many index variables as you need:

library(dplyr)
mtcars %>%
  group_by(cyl, am, vs) %>%
  mutate(across(c(mpg,disp), list(rm = ~ zoo::rollmeanr(., 2, fill = NA))))
# # A tibble: 32 x 13
# # Groups:   cyl, am, vs [7]
#      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb mpg_rm disp_rm
#    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>
#  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4   NA       NA 
#  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4   21      160 
#  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1   NA       NA 
#  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1   NA       NA 
#  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2   NA       NA 
#  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1   19.8    242.
#  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4   16.5    360 
#  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2   NA       NA 
#  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2   23.6    144.
# 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4   18.6    196.
# # ... with 22 more rows

The fill=NA argument means that the first in each series has no history to average on, so it is NA. If you prefer the first in a series to be an average of itself, you can instead use partial=TRUE (using rollapplyr instead):

mtcars %>%
  group_by(cyl, am, vs) %>%
  mutate(across(c(mpg,disp), list(rm = ~ zoo::rollapplyr(., 2, FUN = mean, partial = TRUE))))
# # A tibble: 32 x 13
# # Groups:   cyl, am, vs [7]
#      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb mpg_rm disp_rm
#    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>
#  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4   21      160 
#  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4   21      160 
#  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1   22.8    108 
#  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1   21.4    258 
#  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2   18.7    360 
#  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1   19.8    242.
#  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4   16.5    360 
#  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2   24.4    147.
#  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2   23.6    144.
# 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4   18.6    196.
# # ... with 22 more rows

I've used the align="right" variants of zoo's functions, assuming that your moving average is historical and that time increases in subsequent rows. If these assumptions are not true, make sure you intentionally choose between the align-variants.

I used dplyr::across here to handle an arbitrary number of columns in one step: Since I used a named list of "tilde-functions", it took the name of each function and appended it to the name of each of the column names. You can break it out into individual mutate assignments if you prefer, for readability, maintainability, or if you need different sets of arguments for each column.

7 Day Moving Average per group - R

You didn't supply enough data to create a weekly rolling mean within the groups, but in principle it could work like this:

library(tidyverse)
library(zoo)

my_data <- my_data %>% 
  group_by(ID) %>% 
  mutate(roll_mean = rollmean(Count, 2, na.pad = T))

Using dplyr you group_by your ID variable, and then create a single new column with the rolling mean. You can plot this then with standard ggplot2-syntax:

ggplot(my_data, aes(Date, Count, group = 1)) +
  geom_line(colour = "blue") +
  geom_point(colour = "blue") +
  geom_point(aes(y = roll_mean), colour = "red") +
  facet_wrap(~ID)
#> Warning: Removed 3 rows containing missing values (geom_point).

Sample Image

Data

zzz <- "YYYYMM    Date         ID    Count
201401    01/01/2014   A     151
201401    01/01/2014   B     68
201401    01/01/2014   C     487
201401    02/01/2014   A     198
201401    02/01/2014   B     97
201401    02/01/2014   C     403"

my_data <- read_table(zzz)

Mutate function in dplyr not working with Rolling Means/ Moving Averages

You get this error because the length of rolling means/stds does not match the legth of Dispersion. Simply add k - 1 NAs at the beginnig of your means/stds vectors.

Below is a working example. You can modify this based on your needs.

my_function <- function(df, k) {
  df %>%
    mutate(
      rolling_mean = c(rep(NA, k - 1), rollmean(Dispersion, k)),
      rolling_std = c(rep(NA, k - 1), rollapply(Dispersion, width = k, FUN = sd))
    )
}

For example, you may want to add group_by to compute these values for each Identifier:

my_function <- function(df, k) {
  df %>%
    group_by(Identifier) %>%
    mutate(
      rolling_mean = c(rep(NA, k - 1), rollmean(Dispersion, k)),
      rolling_std = c(rep(NA, k - 1), rollapply(Dispersion, width = k, FUN = sd))
    )
}

Update following up @G. Grothendieck's comment:

It turns out the package zoo already has comprehensive features for NA handling, refactoring the above-given code as:

my_function <- function(df, k) {
  df %>%
    mutate(
      rolling_mean = rollmeanr(Dispersion, k, fill = NA),
      rolling_std = rollapplyr(Dispersion, width = k, FUN = sd, fill = NA)
    )
}

How to find rolling mean using means previously generated using R?

Because your desired fill value depends on any previously created fill values, I think the only reasonable approach is a trusty for loop:

df$out <- NA

for (i in 1:nrow(df)) {
  if (!is.na(df$receivables[i])) {
    df$out[i] <- df$receivables[i]
  } else {
    df$out[i] <- mean(df$out[(i-3):(i-1)], na.rm = T)
  }
}

   gvkey fyear receivables desired_output      out
1  10443  2005     543.000         543.00 543.0000
2  10443  2006     595.000         595.00 595.0000
3  10443  2007     757.000         757.00 757.0000
4  10443  2008          NA         631.67 631.6667
5  10443  2009          NA         661.22 661.2222
6  10443  2010          NA         683.30 683.2963
7  10443  2011          NA         658.73 658.7284
8  29206  2017     147.469         147.47 147.4690
9  29206  2018     161.422         161.42 161.4220
10 29206  2019     154.019         154.02 154.0190
11 29206  2020          NA         154.30 154.3033
12 29206  2021          NA         156.58 156.5814

How to calculate rolling mean for multiple columns at once with a groupby and select in dplyr, while ignoring the groupby columns

Data, as defined in the question, has no numeric columns. It is all factors. We fix the definition below. Then we use mutate_at to just apply rollapplyr to the non-grouping columns. So that we can use Data, we roll the sum over the prior 3 values rather than the prior 21. An alternative to the mutate_at line would be mutate_if(is.numeric, ~ rollapplyr(...same...)) .

library(dplyr)
library(zoo)

Data <- data.frame(v1, v2, v3, v4) # v1, v2, v3, v4 are from question  

Data %>%
  group_by(v1, v2) %>%
  mutate_at(vars(-group_cols()), 
    ~ rollapplyr(.x, list(-seq(3)), sum, na.rm = FALSE, partial = TRUE, fill = NA)) %>%
  ungroup

giving:

# A tibble: 15 x 4
   v1    v2       v3    v4
   <fct> <fct> <dbl> <dbl>
 1 a     2010     NA    NA
 2 a     2010      1     6
 3 a     2010      3    19
 4 a     2010      6    24
 5 a     2010      9    24
 6 a     2010      8    24
 7 a     2010      9    23
 8 a     2010     10    82
 9 b     2020     NA    NA
10 b     2020     13     1
11 b     2020     18     3
12 b     2020     24     6
13 b     2020     24     9
14 b     2020     23     8
15 b     2020     82     9

conditional rolling average in R

You want to process the PRIOR 7 points rather than the 7 points that end at the current point. To do that use a width of list(-(1:7)). That says to use offsets -1 through -7 when processing the data. See ?rollapply for more information on specifying the width argument.

This (1) more directly specifies the intention making it easier to comprehend than approaches which require ignoring the required offsets and then fixing it up later and (2) uses only the packages you are already using (3) expresses the solution compactly and (4) preserves your solution changing only one argument.

  dat[, mean.val:= if (.N > 6) 
        rollapply(value, list(-(1:7)), function(x) mean(tail(sort(x), 5)), fill = NA)  
        else mean(value)]

Rolling Mean By Group Dplyr/data.table

you could use map from the purrr package and apply it on 1:n():

df = df %>% 
  na.omit() %>% 
  group_by(ticker) %>% 
  mutate(avg10 = map_dbl(1:n(), ~mean(lag_close[(max(.x-9, 1)):.x], na.rm =T))

Of course you have to decide what should happen with the first 9 rows where there are fewer than 10 observations. In my solution the rows 1 to 9 contain the mean of the last 1 to 9 observations.

Rolling Mean (Moving Average) by Group/Id With Dplyr