R - Faster Way to Calculate Rolling Statistics Over a Variable Interval

Rcpp is a good approach if speed is your primary concern. I'll use the rolling mean statistic to explain by example.

Benchmarks: Rcpp versus R

x = sort(runif(25000,0,4*pi))
y = sin(x) + rnorm(length(x),0.5,0.5)
system.time( rollmean_r(x,y,xout=x,width=1.1) )   # ~60 seconds
system.time( rollmean_cpp(x,y,xout=x,width=1.1) ) # ~0.0007 seconds

Code for Rcpp and R function

cppFunction('
  NumericVector rollmean_cpp( NumericVector x, NumericVector y, 
                              NumericVector xout, double width) {
    double total=0;
    unsigned int n=x.size(), nout=xout.size(), i, ledge=0, redge=0;
    NumericVector out(nout);

    for( i=0; i<nout; i++ ) {
      while( x[ redge ] - xout[i] <= width && redge<n ) 
        total += y[redge++];
      while( xout[i] - x[ ledge ] > width && ledge<n ) 
        total -= y[ledge++];
      if( ledge==redge ) { out[i]=NAN; total=0; continue; }
      out[i] = total / (redge-ledge);
    }
    return out;
  }')

rollmean_r = function(x,y,xout,width) {
  out = numeric(length(xout))
  for( i in seq_along(xout) ) {
    window = x >= (xout[i]-width) & x <= (xout[i]+width)
    out[i] = .Internal(mean( y[window] ))
  }
  return(out)
}

Now for an explantion of rollmean_cpp. x and y are the data. xout is a vector of points at which the rolling statistic is requested. width is the width*2 of the rolling window. Note that the indeces for the ends of sliding window are stored in ledge and redge. These are essentially pointers to the respective elements in x and y. These indeces could be very beneficial for calling other C++ functions (e.g., median and the like) that take a vector and starting and ending indeces as input.

For those who want a "verbose" version of rollmean_cpp for debugging (lengthy):

cppFunction('
  NumericVector rollmean_cpp( NumericVector x, NumericVector y, 
                              NumericVector xout, double width) {

    double total=0, oldtotal=0;
    unsigned int n=x.size(), nout=xout.size(), i, ledge=0, redge=0;
    NumericVector out(nout);

    for( i=0; i<nout; i++ ) {
      Rcout << "Finding window "<< i << " for x=" << xout[i] << "..." << std::endl;
      total = 0;

      // numbers to push into window
      while( x[ redge ] - xout[i] <= width && redge<n ) {
        Rcout << "Adding (x,y) = (" << x[redge] << "," << y[redge] << ")" ;
        Rcout << "; edges=[" << ledge << "," << redge << "]" << std::endl;
        total += y[redge++];
      }

      // numbers to pop off window
      while( xout[i] - x[ ledge ] > width && ledge<n ) {
        Rcout << "Removing (x,y) = (" << x[ledge] << "," << y[ledge] << ")";
        Rcout << "; edges=[" << ledge+1 << "," << redge-1 << "]" << std::endl;
        total -= y[ledge++];
      }
      if(ledge==n) Rcout << " OVER ";
      if( ledge==redge ) {
       Rcout<<" NO DATA IN INTERVAL " << std::endl << std::endl;
       oldtotal=total=0; out[i]=NAN; continue;}

      Rcout << "For interval [" << xout[i]-width << "," <<
               xout[i]+width << "], all points in interval [" << x[ledge] <<
               ", " << x[redge-1] << "]" << std::endl ;
      Rcout << std::endl;

      out[i] = ( oldtotal + total ) / (redge-ledge);
      oldtotal=total+oldtotal;
    }
    return out;
  }')

x = c(1,2,3,6,90,91)
y = c(9,8,7,5.2,2,1)
xout = c(1,2,2,3,6,6.1,13,90,100)
a = rollmean_cpp(x,y,xout=xout,2)
# Finding window 0 for x=1...
# Adding (x,y) = (1,9); edges=[0,0]
# Adding (x,y) = (2,8); edges=[0,1]
# Adding (x,y) = (3,7); edges=[0,2]
# For interval [-1,3], all points in interval [1, 3]
# 
# Finding window 1 for x=2...
# For interval [0,4], all points in interval [1, 3]
# 
# Finding window 2 for x=2...
# For interval [0,4], all points in interval [1, 3]
# 
# Finding window 3 for x=3...
# For interval [1,5], all points in interval [1, 3]
# 
# Finding window 4 for x=6...
# Adding (x,y) = (6,5.2); edges=[0,3]
# Removing (x,y) = (1,9); edges=[1,3]
# Removing (x,y) = (2,8); edges=[2,3]
# Removing (x,y) = (3,7); edges=[3,3]
# For interval [4,8], all points in interval [6, 6]
# 
# Finding window 5 for x=6.1...
# For interval [4.1,8.1], all points in interval [6, 6]
# 
# Finding window 6 for x=13...
# Removing (x,y) = (6,5.2); edges=[4,3]
# NO DATA IN INTERVAL 
# 
# Finding window 7 for x=90...
# Adding (x,y) = (90,2); edges=[4,4]
# Adding (x,y) = (91,1); edges=[4,5]
# For interval [88,92], all points in interval [90, 91]
# 
# Finding window 8 for x=100...
# Removing (x,y) = (90,2); edges=[5,5]
# Removing (x,y) = (91,1); edges=[6,5]
# OVER  NO DATA IN INTERVAL 

print(a)
# [1] 8.0 8.0 8.0 8.0 5.2 5.2 NaN 1.5 NaN

R - Fast way to calculate rolling mean with varying width

First create a grouping variable g and then compute the rolling means. Note that rollsum is substantially faster than rollapply but does not support partial necessitating the workaround shown:

library(zoo) # rollsum

g <- with(df, cumsum(ave(time, id, FUN = function(x) c(1, diff(x) != 1))))
roll4 <- function(x) rollsum(c(0, 0, 0, x), 4) / pmin(4, seq_along(x)) 
transform(df, avg = ave(assets, g, FUN = roll4))

giving:

   time   id   name assets   avg
1    51 1234 BANK A   5000  5000
2    52 1234 BANK A   6000  5500
3    53 1234 BANK A   4000  5000
4    55 1234 BANK A   7000  7000
5    56 1234 BANK A   8000  7500
6    51 2345 BANK B  10000 10000
7    52 2345 BANK B  12000 11000
8    51 3456 BANK C  30000 30000
9    52 3456 BANK C  35000 32500
10   53 3456 BANK C  40000 35000

Using Rolling Average to Calculate over Window of Values

An option using non-equi join in data.table which also handles an ID:

library(data.table)
setDT(df)[, avgHR := 
    df[.(ID=ID, start=MS-15000, end=MS), on=.(ID, MS>=start, MS<=end),
        by=.EACHI, mean(HR)]$V1
]

output:

    ID    MS HR    avgHR
 1:  1 36148 84 84.00000
 2:  1 36753 84 84.00000
 3:  1 37364 84 84.00000
 4:  1 38062 84 84.00000
 5:  1 38737 84 84.00000
 6:  1 39580 96 86.00000
 7:  1 40029 84 85.71429
 8:  1 40387 84 85.50000
 9:  1 41208 96 86.66667
10:  1 42006 84 86.40000
11:  1 42796 84 86.18182
12:  1 43533 96 87.00000
13:  1 44274 84 86.76923
14:  1 44988 84 86.57143
15:  1 45696 96 87.20000
16:  1 46398 84 87.00000
17:  1 47079 84 86.82353
18:  1 47742 84 86.66667
19:  1 48429 84 86.52632
20:  1 49135 84 86.40000
21:  1 49861 84 86.28571
22:  1 50591 84 86.18182
23:  1 51324 84 86.18182
24:  1 52059 84 86.18182
    ID    MS HR    avgHR

data:

MS <- c(36148, 36753,37364,38062,38737,39580,40029,40387,41208,42006,42796, 43533,44274,44988,45696,46398,47079,47742,48429,49135,49861,50591,51324,52059)
HR <- c(84,84,84,84,84,96,84,84,96,84,84,96,84,84,96,84,84,84,84,84,84,84,84,84)

df <- data.frame(ID=1, MS, HR)

What is the fastest way in R to calculate rolling max with a variable rolling window size?

Managed to vectorize parts of it:

Original -

system.time({
  out <- vector(mode='numeric', length=NROW(a))
  for(i in seq(a)) {
    if (i-b[i]>=0) out[i] <- max(a[(i-b[i]+1):i])
    else out[i] <- NA
  }
})
## user  system elapsed 
## 0.64    0.00    0.64

Slightly vectorized -

system.time({
  nr <- NROW(a)
  out <- rep(NA,nr)
  m <- 1:nr - b + 1
  n <- (1:nr)[m>0]

  for(i in n)
    out[i] <- max(a[m[i]:i])
})
## user  system elapsed 
## 0.39    0.00    0.39

Vectorize loops when calculating rolling means with variable amounts of data

I used tidyverse and runner and have done it like this in a single piped syntax. Syntax explanation-

I first collected seven days (as per logic provided) DQL and MAX values into a list using runner.
Before doing that, I have converted DQL into an ordered factored variable, which will be used in last syntax.
Secondly, i used purrr::map to modify each list according to given conditions,
- Not less than six are to be counted
- If there is exactly one E in 7 values, that has not to be counted.
Finally I unnested the list using unnest_wider

library(runner)
daily_data %>% mutate(dyDQL = factor(dyDQL, levels = c("A", "B", "E"), ordered = T),
                      d = runner(x = data.frame(a = dyMax, b= dyDQL),
                                   k = "7 days",
                                   lag = 0,
                                   idx = date,
                                   f = function(x) list(x))) %>%
  mutate(d = map(d, ~ .x %>% group_by(b) %>%
                     mutate(c = n()) %>%
                     ungroup() %>%
                     filter(!n() < 6) %>%
                     filter(!(b == 'E' & c == 1 & n() == 7)) %>%
                     summarise(ma.max7 = ifelse(n() == 0, NA, mean(a)), ma.max7.DQL = max(b))
                   )
         ) %>%
  unnest_wider(d)

# A tibble: 15 x 7
   Monitoring.Location.ID date       dyMax dyMin dyDQL ma.max7 ma.max7.DQL
   <chr>                  <date>     <dbl> <dbl> <ord>   <dbl> <ord>      
 1 River 1                2018-07-01  24.2  22.5 A        NA   NA         
 2 River 1                2018-07-02  24.6  20.4 A        NA   NA         
 3 River 1                2018-07-03  24.8  20.1 A        NA   NA         
 4 River 1                2018-07-04  25.3  20.7 A        NA   NA         
 5 River 1                2018-07-05  25.5  20.9 A        NA   NA         
 6 River 1                2018-07-06  25.0  21.0 A        24.9 A          
 7 River 1                2018-07-07  24.8  20.7 A        24.9 A          
 8 River 1                2018-07-08  23.4  20.8 B        24.8 B          
 9 River 1                2018-07-09  22.7  18.9 E        24.8 B          
10 River 1                2018-07-10  22.3  18.2 A        24.4 B          
11 River 1                2018-07-12  22.9  19.0 A        23.5 E          
12 River 1                2018-07-13  24.0  19.5 A        23.4 E          
13 River 1                2018-07-14  24.5  19.9 A        23.3 E          
14 River 1                2018-07-15  25.1  20.6 A        23.6 E          
15 River 1                2018-07-19  24.9  20.7 A        NA   NA

R - Faster Way to Calculate Rolling Statistics Over a Variable Interval