Adaptive Moving Average - Top Performance in R

Adaptive moving average - top performance in R

December 2018 update

Efficient implementation of adaptive rolling functions has been made in
data.table recently - more info in ?froll manual. Additionally an efficient alternative solution using base R has been identified (fastama below). Unfortunately Kevin Ushey's answer does not address the question thus it is not included in benchmark.
Scale of benchmark has been increased as it pointless to compare microseconds.

set.seed(108)
x = rnorm(1e6)
width = rep(seq(from = 100, to = 500, by = 5), length.out=length(x))
microbenchmark(
zoo=rollapplyr(x, width = width, FUN=mean, fill=NA),
mapply=base_mapply(x, width=width, FUN=mean, na.rm=T),
wmapply=wmapply(x, width=width, FUN=mean, na.rm=T),
ama=ama(x, width, na.rm=T),
fastama=fastama(x, width),
frollmean=frollmean(x, width, na.rm=T, adaptive=TRUE),
frollmean_exact=frollmean(x, width, na.rm=T, adaptive=TRUE, algo="exact"),
times=1L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zoo 32371.938248 32371.938248 32371.938248 32371.938248 32371.938248 32371.938248 1
# mapply 13351.726032 13351.726032 13351.726032 13351.726032 13351.726032 13351.726032 1
# wmapply 15114.774972 15114.774972 15114.774972 15114.774972 15114.774972 15114.774972 1
# ama 9780.239091 9780.239091 9780.239091 9780.239091 9780.239091 9780.239091 1
# fastama 351.618042 351.618042 351.618042 351.618042 351.618042 351.618042 1
# frollmean 7.708054 7.708054 7.708054 7.708054 7.708054 7.708054 1
# frollmean_exact 194.115012 194.115012 194.115012 194.115012 194.115012 194.115012 1
ama = function(x, n, na.rm=FALSE, fill=NA, nf.rm=FALSE) {
# more or less the same as previous forloopply
stopifnot((nx<-length(x))==length(n))
if (nf.rm) x[!is.finite(x)] = NA_real_
ans = rep(NA_real_, nx)
for (i in seq_along(x)) {
ans[i] = if (i >= n[i])
mean(x[(i-n[i]+1):i], na.rm=na.rm)
else as.double(fill)
}
ans
}
fastama = function(x, n, na.rm, fill=NA) {
if (!missing(na.rm)) stop("fast adaptive moving average implemented in R does not handle NAs, input having NAs will result in incorrect answer so not even try to compare to it")
# fast implementation of adaptive moving average in R, in case of NAs incorrect answer
stopifnot((nx<-length(x))==length(n))
cs = cumsum(x)
ans = rep(NA_real_, nx)
for (i in seq_along(cs)) {
ans[i] = if (i == n[i])
cs[i]/n[i]
else if (i > n[i])
(cs[i]-cs[i-n[i]])/n[i]
else as.double(fill)
}
ans
}

Old answer:

I chose 4 available solutions which doesn't need to do to C++, quite easy to find or google.

# 1. rollapply
library(zoo)
?rollapplyr
# 2. mapply
base_mapply <- function(x, width, FUN, ...){
FUN <- match.fun(FUN)
f <- function(i, width, data){
if(i < width) return(NA_real_)
return(FUN(data[(i-(width-1)):i], ...))
}
mapply(FUN = f,
seq_along(x), width,
MoreArgs = list(data = x))
}
# 3. wmapply - modified version of wapply found: https://rmazing.wordpress.com/2013/04/23/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
wmapply <- function(x, width, FUN = NULL, ...){
FUN <- match.fun(FUN)
SEQ1 <- 1:length(x)
SEQ1[SEQ1 < width] <- NA_integer_
SEQ2 <- lapply(SEQ1, function(i) if(!is.na(i)) (i - (width[i]-1)):i)
OUT <- lapply(SEQ2, function(i) if(!is.null(i)) FUN(x[i], ...) else NA_real_)
return(base:::simplify2array(OUT, higher = TRUE))
}
# 4. forloopply - simple loop solution
forloopply <- function(x, width, FUN = NULL, ...){
FUN <- match.fun(FUN)
OUT <- numeric()
for(i in 1:length(x)) {
if(i < width[i]) next
OUT[i] <- FUN(x[(i-(width[i]-1)):i], ...)
}
return(OUT)
}

Below are the timings for prod function. mean function might be already optimized inside rollapplyr. All results equal.

library(microbenchmark)
# 1a. length(x) = 1000, window = 5-20
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 59.690217 60.694364 61.979876 68.55698 153.60445 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 14.372537 14.694266 14.953234 16.00777 99.82199 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 9.384938 9.755893 9.872079 10.09932 84.82886 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 14.730428 15.062188 15.305059 15.76560 342.44173 100

# 1b. length(x) = 1000, window = 50-200
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 71.99894 74.19434 75.44112 86.44893 281.6237 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 15.67158 16.10320 16.39249 17.20346 103.6211 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 10.88882 11.54721 11.75229 12.19790 106.1170 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 15.70704 16.06983 16.40393 17.14210 108.5005 100

# 2a. length(x) = 10000, window = 5-20
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 753.87882 781.8789 809.7680 872.8405 1116.7021 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 148.54919 159.9986 231.5387 239.9183 339.7270 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 98.42682 105.2641 117.4923 183.4472 245.4577 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 533.95641 602.0652 646.7420 672.7483 922.3317 100

# 2b. length(x) = 10000, window = 50-200
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 912.5829 946.2971 1024.7245 1071.5599 1431.5289 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 171.3189 180.6014 260.8817 269.5672 344.4500 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 123.1964 131.1663 204.6064 221.1004 484.3636 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 561.2993 696.5583 800.9197 959.6298 1273.5350 100

Moving average with varying window size

zoo::rollapply can be useful here. Try:

zoo::rollapply(toSmoothed, dynamicRange, FUN = mean, fill = NA, align = "left")
[1] 1.0 2.5 3.0 1.5 1.0 2.5 3.0 NA

Constructing moving average over a categorical variable in R

You could try zoo::rollmean

df <- structure(list(Group = c("exceeded", "exceeded", "exceeded", 
"exceeded", "exceeded", "exceeded", "exceeded", "exceeded", "exceeded",
"exceeded", "exceeded", "exceeded", "othergroup", "othergroup",
"othergroup", "othergroup", "othergroup", "othergroup", "othergroup",
"othergroup", "othergroup", "othergroup", "othergroup", "othergroup"
), Days = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), DV = c(2859L,
2948L, 4412L, 5074L, 5098L, 5147L, 4459L, 4730L, 4643L, 4698L,
4818L, 4521L, 2859L, 2948L, 4412L, 5074L, 5098L, 5147L, 4459L,
4730L, 4643L, 4698L, 4818L, 4521L), X5DayMA = c(NA, NA, NA, NA,
4078L, 4536L, 4838L, 4902L, 4815L, 4735L, 4670L, 4682L, NA, NA,
NA, NA, 4078L, 4536L, 4838L, 4902L, 4815L, 4735L, 4670L, 4682L
)), .Names = c("Group", "Days", "DV", "X5DayMA"), class = "data.frame", row.names = c(NA,
-24L))

head(df)
Group Days DV X5DayMA
1 exceeded 0 2859 NA
2 exceeded 1 2948 NA
3 exceeded 2 4412 NA
4 exceeded 3 5074 NA
5 exceeded 4 5098 4078
6 exceeded 5 5147 4536

library(plyr)
library(zoo)
ddply(
df, "Group",
transform,
5daymean = rollmean(DV, 5, align="right", na.pad=TRUE ))

Group Days DV X5DayMA 5daymean
1 exceeded 0 2859 NA NA
2 exceeded 1 2948 NA NA
3 exceeded 2 4412 NA NA
4 exceeded 3 5074 NA NA
5 exceeded 4 5098 4078 4078.2
6 exceeded 5 5147 4536 4535.8
7 exceeded 6 4459 4838 4838.0
8 exceeded 7 4730 4902 4901.6
9 exceeded 8 4643 4815 4815.4
10 exceeded 9 4698 4735 4735.4
11 exceeded 10 4818 4670 4669.6
12 exceeded 11 4521 4682 4682.0
13 othergroup 0 2859 NA NA
14 othergroup 1 2948 NA NA
15 othergroup 2 4412 NA NA
16 othergroup 3 5074 NA NA
17 othergroup 4 5098 4078 4078.2
18 othergroup 5 5147 4536 4535.8
19 othergroup 6 4459 4838 4838.0
20 othergroup 7 4730 4902 4901.6
21 othergroup 8 4643 4815 4815.4
22 othergroup 9 4698 4735 4735.4
23 othergroup 10 4818 4670 4669.6
24 othergroup 11 4521 4682 4682.0

or even faster with dplyr

library(dplyr)
df %.%
dplyr:::group_by(Group) %.%
dplyr:::mutate('5daymean' = rollmean(DV, 5, align="right", na.pad=TRUE ))

OR the super fast data.table

library(data.table)
dft <- data.table(df)
dft[ , `:=` ('5daymean' = rollmean(DV, 5, align="right", na.pad=TRUE )) , by=Group ]

Moving Average using R code

There are loads of ways of calculating moving averages.

in base r, this can work

filter(x, rep(1/2,2)) #this calculates moving average of 2 numbers in a sequence
filter(x, rep(1/3,3)) #this calculates moving average of 3 numbers in a sequence

for k consecutive observations

filter(x, rep(1/k,k))

e.g.

x <- c(3,5,7,3,4,2,6,4,7,2,1,9, 1, 10, 1,12)
filter(x, rep(1/2,2))
# [1] 4.0 6.0 5.0 3.5 3.0 4.0 5.0 5.5 4.5 1.5 5.0 5.0 5.5 5.5 6.5 NA

You should also look up the following packages: zoo and TTR packages for more options

Just as a quick example, the function runMean in TTR is super easy

runMean(x,2) #gives rolling mean of every 2 consecutive observations
runMean(x,k) #gives rolling mean of every k consecutive observations

How to calculate the average slope within a moving window in R

The comment seems to have been deleted but it was pointed out that the function in rollapply in the code in the question was not using the argument passed to it. After fixing that and making some other minor improvements, this returns the intercept and the slope in columns 1 and 2 respectively.

library(zoo)

Coef <- function(Z) coef(lm(y ~ t, as.data.frame(Z)))
rollapplyr(zoo(DataExample), 5, Coef, by.column = FALSE)

R data.table rolling max

Taking only the first element of the last instance of end_window seems to take care of the immediate problem, but the suggestion to use .EACHI is spot on. For a large data set, a join will be faster than a simple group operation.

ex[, end_window := pmin(end_window, .N)] # don't search beyond the extent of ex

# join solution
f1 <- function(ex) ex[ex, .(rollmax = value, row, end_window), on = .(row >= row, row <= end_window)][, .(rollmax = max(rollmax), end_window = end_window[1]), row]
# grouping operation
f2 <- function(ex) ex[, .(rollmax = max(ex$value[row:end_window]), end_window), row]
# join by .EACHI from jangorecki's suggestion
f3 <- function(ex) setnames(setcolorder(ex[ex, .(max(value)), on = .(row >= row, row <= end_window), .EACHI], c(1,3,2)), c("row", "rollmax", "end_window"))

res1 <- f1(ex)
res2 <- f2(ex)
res3 <- f3(ex)
f1(ex)[596]
#> row rollmax end_window
#> 1: 596 13328.67 622
f2(ex)[596]
#> row rollmax end_window
#> 1: 596 13328.67 622
f3(ex)[596]
#> row rollmax end_window
#> 1: 596 13328.67 622
max(ex$value[596:622])
#> [1] 13328.67

microbenchmark::microbenchmark(f1 = f1(ex),
f2 = f2(ex),
f3 = f3(ex),
check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 7.5080 8.66995 9.886949 9.18510 9.73970 21.3667 100
#> f2 3.2476 3.70190 4.641003 4.19860 5.07995 14.8099 100
#> f3 3.9210 4.73330 5.323673 5.14675 5.73335 13.9769 100

Trying f2 and f3 on a larger data set. Surprisingly, the join is now faster.

ex <- data.table(value = cumsum(rnorm(1e6, 0.1)), end_window = 1:1e6 + sample(50:500, 1e6, TRUE), row = 1:1e6)[, end_window := pmin(end_window, .N)]
microbenchmark::microbenchmark(f2 = f2(ex),
f3 = f3(ex),
times = 10,
check = "identical")
#> Unit: seconds
#> expr min lq mean median uq max neval
#> f2 5.730260 5.802243 7.053432 7.080656 8.146056 8.395844 10
#> f3 2.607298 2.998779 3.302622 3.396466 3.695056 3.745173 10


Related Topics



Leave a reply



Submit