Sliding Window Function in R

Sliding window of mean across dataframe, based on alternate column in R

Here are several alternatives. (1) has no package dependencies, (2) uses rollapply and so is the most similar to the code in the question and (3) uses SQL and is the shortest code-wise.

1) Base R If the problem is that dist does not contain every number between 1 and its maximum value then we can iterate over the intervals like this:

Fun <- function(st, width, df, fun) {
fun(subset(df, dist >= st & dist <= st + width - 1)$value)
}

width <- 50
step <- 25

starts <- seq(1, max(df$dist), step)

data.frame(starts,
ends = starts + width - 1,
mean = sapply(starts, Fun, width, df, mean),
N = sapply(starts, Fun, width, df, length))

giving:

  starts ends     mean  N
1 1 50 5.200910 50
2 26 75 4.710030 50
3 51 100 4.770270 50
4 76 125 4.880030 38
5 101 150 5.318415 25
6 126 175 5.575938 25
7 151 200 4.989383 25
8 176 225 3.918574 12

2) rollapply Another approach is to expand the input data frame in which case we can use rollapply.

library(zoo)

roll <- function(x, width, fun, step) {
fun2 <- function(x) fun(na.omit(x))
rollapply(x, width, by = step, fun2, partial = TRUE, align = "left")
}

width <- 50
step <- 25

m <- merge(df, data.frame(dist = 1:max(df$dist)), all = TRUE)
data.frame(starts,
ends = starts + width - 1,
mean = roll(m$value, width, mean, step),
N = roll(m$value, width, length, step)
)

giving:

  starts ends     mean  N
1 1 50 5.200910 50
2 26 75 4.710030 50
3 51 100 4.770270 50
4 76 125 4.880030 38
5 101 150 5.318415 25
6 126 175 5.575938 25
7 151 200 4.989383 25
8 176 225 3.918574 12

3) sqldf This can be formulated compactly using SQL with the indicated left join.

library(sqldf)

width <- 50
step <- 25

starts <- data.frame(starts = seq(1, max(df$dist), step))
fn$sqldf("select starts, starts+$width-1 ends, avg(value) mean, count(value) N
from starts
left join df on dist between starts and ends
group by starts.rowid")

giving:

  starts ends     mean  N
1 1 50 5.200910 50
2 26 75 4.710030 50
3 51 100 4.770270 50
4 76 125 4.880030 38
5 101 150 5.318415 25
6 126 175 5.575938 25
7 151 200 4.989383 25
8 176 225 3.918574 12

Note

For the input to be reproducible we must set the seed before using any random numbers so in the above we used this:

set.seed(123)
dist <- c(seq(1, 100, by = 1), seq(101, 200, by = 2))
value <- runif(150, min = 0, max = 10)
df <- data.frame(dist, value)

Sliding Windows for Rows in R

The following code implements a R function that receives as input a vector and an integer that represents a sliding window of arbitrary size. The function outputs a vector with the mean values of the elements inside every shunk of the window (you can adapt the code to do any other type of computation).

# Function that computes an sliding window based on the mean values
compute_mean_feat <- function(vector, window) {
mean <- c()
for (i in 1:length(vector)) {
if (i <= window) {
sum <- 0
for (k in 1:i) {
sum <- vector[k] + sum
}
mean <- c(mean, as.numeric(sum / i))
} else{
sum <- 0
for (j in (i - window + 1):i)
sum <- vector[j] + sum
}
mean <- c(mean, as.numeric(sum / i))
}
}
return(mean)
}

You can see the result of applying the compute_mean_feat function with a sliding window of 10 to a dummy vector.

# Manual example for testing
window <- 10
vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
compute_mean_feat(vector, window)

For a data frame, you just need to apply this function to the desired column and it will give you the sliding window vector for its rows.

R: Rolling window function with adjustable window and step-size for irregularly spaced observations

Here is an attempt with Rcpp. The function assumes that data is sorted according to time. More testing would be advisable and adjustments could be made.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector rollAverage(const NumericVector & times,
NumericVector & vals,
double start,
const double winlen,
const double winshift) {
int n = ceil((max(times) - start) / winshift);
NumericVector winvals;
NumericVector means(n);
int ind1(0), ind2(0);
for(int i=0; i < n; i++) {
if (times[0] < (start+winlen)) {
while((times[ind1] <= start) &
(times[ind1+1] <= (start+winlen)) &
(ind1 < (times.size() - 1))) {
ind1++;
}

while((times[ind2+1] <= (start+winlen)) & (ind2 < (times.size() - 1))) {
ind2++;
}

if (times[ind1] >= start) {
winvals = vals[seq(ind1, ind2)];
means[i] = mean(winvals);
} else {
means[i] = NA_REAL;
}
} else {
means[i] = NA_REAL;
}

start += winshift;
}

return means;
}

Testing it:

set.seed(42)
dat <- data.frame(time = seq(1:20)+runif(20,0,1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:19,2)] <- NA_real_

rollAverage(dat$time, dat$measure, -2.5, 5.0, 2.5)
#[1] 1.0222694 NA NA 1.0126639 0.9965048 0.9514456 1.0518228 NA NA NA

With your list of data.frames (using data.table):

set.seed(42)
dat <- data.frame(time = seq(1:50000)+runif(50000, 0.025, 1))
dat <- data.frame(dat, measure=c(diff(dat$time),NA_real_))
dat$measure[sample(1:50000,1000)] <- NA_real_
dat$measure[c(350:450,3000:3300, 20000:28100)] <- NA_real_
dat <- dat[-c(1000:2000, 30000:35000),]

# a list with a realistic number of observations:
dat <- lapply(1:300,function(x) dat)

library(data.table)
dat <- lapply(dat, setDT)
for (ind in seq_along(dat)) dat[[ind]][, i := ind]
#possibly there is a way to avoid these copies?

dat <- rbindlist(dat)

system.time(res <- dat[, rollAverage(time, measure, -2.5, 5.0, 2.5), by=i])
#user system elapsed
#1.51 0.02 1.54
print(res)
# i V1
# 1: 1 1.0217126
# 2: 1 0.9334415
# 3: 1 0.9609050
# 4: 1 1.0123473
# 5: 1 0.9965922
# ---
#6000596: 300 1.1121296
#6000597: 300 0.9984581
#6000598: 300 1.0093060
#6000599: 300 NA
#6000600: 300 NA

Sliding window in a data frame r

This code will slide p1 from 0 to 6990 in steps of 10 while p2 slides from 10 to 7000 in steps of 10:

output = apply(data.frame(seq(0,6990,10), seq(10,7000,10)), MARGIN=1,
function(x,y,z,a) roh_island(M1, 1, x[1], x[2]))
plot(output, col="blue")
grid(5, 5)

Sample Image

Generating sliding window to subset data for prediction task

The error/warning is from using == when the rhs is of length > 1. Use %in%

pred <- vector('list', 8)
names(pred) <- 2000:2007
for(i in 2000:2007){
df_sub1 <- subset(df, year %in% c(i, i+1, i+2))
mod <- glm(y~var1+var2, data=df_sub1, family=binomial())
df_sub2 <- subset(df, year == (i+3))
pred[[as.character(i)]] <- tryCatch(predict(mod,
newdata=df_sub2, type = "response"), error = function(e) NA_real_)
}

-output

> pred
$`2000`
4
1

$`2001`
5
1

$`2002`
6
1

$`2003`
7
2.220446e-16

$`2004`
8
0.1467543

$`2005`
9
0.001408577

$`2006`
10
2.220446e-16

$`2007`
[1] NA

Coding a simple loop for a sliding window

Use rollapplyr with the indicated function.

library(zoo)
pv <- function(xx) runs.test(xx, threshold = mean(xx))$p.value
out <- rollapplyr(x, 256, pv, fill = NA)

Note

library(randtests)
set.seed(123)
x <- ts(rnorm(2659, mean = 0.0001, sd = 0.0001))


Related Topics



Leave a reply



Submit