dplyr/R Cumulative Sum with Reset

R cumulative sum by condition with reset

Here's one way with ave:

ave(df$a, cumsum(c(F, diff(df$a) < 0)), FUN=seq_along) - 1
[1] 0 1 2 3 0 1 2 3 4 5 6 0 1 0

We get a running count grouped by diff(df$a) < 0, which flags the positions in the vector that are smaller than their predecessors. We prepend FALSE with c(F, ..) to account for the first position, which has no predecessor. The cumulative sum of that logical vector creates an index for grouping. The function ave applies a function within each level of that index; we use seq_along for a running count. Since seq_along starts at 1, we subtract one, ave(...) - 1, to start from zero.
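
The answer never shows df; here is a sample column a that reproduces the printed output (the exact values are an assumption), together with the intermediate steps:

df <- data.frame(a = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 1, 2, 1))

diff(df$a) < 0                        # TRUE wherever a value drops below its predecessor
cumsum(c(F, diff(df$a) < 0))          # group index: 0 0 0 0 1 1 1 1 1 1 1 2 2 3
ave(df$a, cumsum(c(F, diff(df$a) < 0)), FUN = seq_along) - 1
# [1] 0 1 2 3 0 1 2 3 4 5 6 0 1 0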


A similar approach using dplyr:

library(dplyr)
df %>%
  group_by(grp = cumsum(c(FALSE, diff(a) < 0))) %>%
  mutate(counter = row_number() - 1)

dplyr / R cumulative sum with reset

I think you can use purrr's accumulate() here. I've also made a wrapper function so it can be reused with different thresholds:

library(dplyr)  # for if_else()
library(purrr)  # for accumulate()

sum_reset_at <- function(thresh) {
  function(x) {
    accumulate(x, ~ if_else(.x >= thresh, .y, .x + .y))
  }
}
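
tib is not shown in the answer; this definition, inferred from the printed output below, reproduces it:

tib <- tibble(t = 1:6, a = c(2, 3, 1, 2, 2, 3))  # tibble() is re-exported by dplyr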

tib %>% mutate(c = sum_reset_at(5)(a))
#       t     a     c
#   <dbl> <dbl> <dbl>
# 1     1     2     2
# 2     2     3     5
# 3     3     1     1
# 4     4     2     3
# 5     5     2     5
# 6     6     3     3
tib %>% mutate(c = sum_reset_at(4)(a))
#       t     a     c
#   <dbl> <dbl> <dbl>
# 1     1     2     2
# 2     2     3     5
# 3     3     1     1
# 4     4     2     3
# 5     5     2     5
# 6     6     3     3
tib %>% mutate(c = sum_reset_at(6)(a))
#       t     a     c
#   <dbl> <dbl> <dbl>
# 1     1     2     2
# 2     2     3     5
# 3     3     1     6
# 4     4     2     2
# 5     5     2     4
# 6     6     3     7

cumsum by participant and reset on 0 R

Does this work?

library(dplyr)
library(data.table)

df %>%
  mutate(grp = rleid(Correct)) %>%
  group_by(Participant, grp) %>%
  mutate(Count = cumsum(Correct)) %>%
  select(-grp)
# A tibble: 10 x 4
# Groups:   Participant, grp [6]
     grp Participant Correct Count
   <int> <chr>         <dbl> <dbl>
 1     1 A                 1     1
 2     1 A                 1     2
 3     1 A                 1     3
 4     2 A                 0     0
 5     3 A                 1     1
 6     3 B                 1     1
 7     3 B                 1     2
 8     4 B                 0     0
 9     5 B                 1     1
10     5 B                 1     2

(grp still appears in the output because it is a grouping variable, which select() keeps; add ungroup() before select(-grp) if you want it dropped.)

Toy data:

df <- data.frame(
  Participant = c(rep("A", 5), rep("B", 5)),
  Correct = c(1, 1, 1, 0, 1, 1, 1, 0, 1, 1)
)
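
For reference, the run-length index that drives the grouping looks like this on the toy data, matching the grp column above:

data.table::rleid(df$Correct)
# [1] 1 1 1 2 3 3 3 4 5 5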

R cumulative sum using dplyr with reset


library(dplyr)

data_right %>%
  group_by(state, p) %>%
  mutate(grp = cumsum(c(TRUE, diff(as.integer(Year)) > 1))) %>%
  group_by(state, p, grp) %>%
  mutate(cy = row_number()) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 12 x 5
#    state p     Year  Consecutive_Yrs    cy
#    <chr> <chr> <chr>           <dbl> <int>
#  1 NY    n     1973                1     1
#  2 NY    n     1974                2     2
#  3 NY    n     1977                1     1
#  4 NY    n     1978                2     2
#  5 NY    p     1988                1     1
#  6 NY    p     1989                2     2
#  7 PA    n     1991                1     1
#  8 PA    n     1992                2     2
#  9 PA    n     1993                3     3
# 10 PA    p     1920                1     1
# 11 PA    p     1929                1     1
# 12 PA    p     1931                1     1

Assumes the data is already ordered by Year.
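
To see how the grouping index is built, here are the intermediate steps for the NY/n rows:

yrs <- c(1973, 1974, 1977, 1978)
diff(yrs) > 1                   # FALSE TRUE FALSE: a gap before 1977
cumsum(c(TRUE, diff(yrs) > 1))  # 1 1 2 2: two runs of consecutive years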


Data:

library(data.table)

data_right <- data.table(
  state = c("NY", "NY", "NY", "NY", "NY", "NY", "PA", "PA", "PA", "PA", "PA", "PA"),
  p = c("n", "n", "n", "n", "p", "p", "n", "n", "n", "p", "p", "p"),
  Year = c("1973", "1974", "1977", "1978", "1988", "1989", "1991", "1992", "1993",
           "1920", "1929", "1931"),
  Consecutive_Yrs = c(1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1)
)

dplyr solution for cumsum with a factor reset

You can create a new group every time OilChanged == 'Yes' and take the cumsum of the Diff values within each group.
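
df is not shown in the answer; this reconstruction, read off the printed output below, reproduces it:

df <- data.frame(
  OilChanged = c("No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No", "Yes", "No"),
  Odometer   = c(300, 350, 410, 420, 430, 450, 500, 600, 600, 600, 650, 660, 700),
  Diff       = c(0, 50, 60, 10, 10, 20, 50, 100, 0, 0, 50, 10, 40),
  CumSum     = c(0, 50, 110, 10, 20, 40, 90, 190, 190, 190, 240, 250, 40)
)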

library(dplyr)

df %>%
  group_by(grp = lag(cumsum(OilChanged == 'Yes'), default = 0)) %>%
  mutate(newcumsum = cumsum(Diff)) %>%
  ungroup %>%
  select(-grp)


#    OilChanged Odometer  Diff CumSum newcumsum
#    <chr>         <dbl> <dbl>  <dbl>     <dbl>
#  1 No              300     0      0         0
#  2 No              350    50     50        50
#  3 Yes             410    60    110       110
#  4 No              420    10     10        10
#  5 No              430    10     20        20
#  6 No              450    20     40        40
#  7 No              500    50     90        90
#  8 No              600   100    190       190
#  9 No              600     0    190       190
# 10 No              600     0    190       190
# 11 No              650    50    240       240
# 12 Yes             660    10    250       250
# 13 No              700    40     40        40

cumsum with a condition to restart in R

You may use cumsum to create groups as well.

library(dplyr)

df <- df %>%
  group_by(group = cumsum(dplyr::lag(port == 0, default = 0))) %>%
  mutate(cumsum_G = cumsum(G)) %>%
  ungroup

df

#   inv   ass    port     G group cumsum_G
#   <chr> <chr> <int> <int> <dbl>    <int>
# 1 i     x         2     1     0        1
# 2 i     x         2     0     0        1
# 3 i     x         0     1     0        2
# 4 i     x         3     0     1        0
# 5 i     x         3     1     1        1
You may remove the group column from output using %>% select(-group).
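
For reference, the grouping index is built like this; the lag shifts the reset flag down one row, so a new group starts on the row after port == 0:

port <- c(2, 2, 0, 3, 3)
dplyr::lag(port == 0, default = 0)          # FALSE FALSE FALSE  TRUE FALSE
cumsum(dplyr::lag(port == 0, default = 0))  # 0 0 0 1 1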

data

df <- structure(list(inv = c("i", "i", "i", "i", "i"), ass = c("x", 
"x", "x", "x", "x"), port = c(2L, 2L, 0L, 3L, 3L), G = c(1L,
0L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L))

R Cumulative Sum with a condition and a reset

This can be achieved by:

library(tidyverse)
library(data.table)

z %>%
  group_by(rleid(Signal)) %>%                # advance value every time Signal changes and group by that
  mutate(cum = Signal * cumsum(Volume)) %>%  # cumsum in each group
  ungroup() %>%                              # ungroup so you could remove the grouping column
  select(-4)                                 # remove grouping column

or without data.table by using rle:

z %>%
  mutate(rl = rep(1:length(rle(Signal)$lengths), times = rle(Signal)$lengths)) %>%
  group_by(rl) %>%
  mutate(cum = Signal * cumsum(Volume)) %>%
  ungroup() %>%
  select(-4)
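
Both variants build the same run index; on this data it looks like this (the leading NA forms its own run):

data.table::rleid(z$Signal)
# [1] 1 2 2 2 2 2 2 2 2 3 3 4 5 5 5 6 6 7 7 7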

# output
   date       Signal    Volume        cum

   <fct>       <int>     <int>      <int>
 1 2016-01-04     NA  37912403         NA
 2 2016-01-05     -1  23258238  -23258238
 3 2016-01-06     -1  25096183  -48354421
 4 2016-01-07     -1  45172906  -93527327
 5 2016-01-08     -1  35402298 -128929625
 6 2016-01-11     -1  29932385 -158862010
 7 2016-01-12     -1  28395390 -187257400
 8 2016-01-13     -1  33410553 -220667953
 9 2016-01-14     -1  48658623 -269326576
10 2016-01-15      1  46132781   46132781
11 2016-01-19      1  30998256   77131037
12 2016-01-20     -1  59051429  -59051429
13 2016-01-21      1  30518939   30518939
14 2016-01-22      1  30495387   61014326
15 2016-01-25      1  32482015   93496341
16 2016-01-26     -1  26877080  -26877080
17 2016-01-27     -1  58699359  -85576439
18 2016-01-28      1 107475327  107475327
19 2016-01-29      1  62739548  170214875
20 2016-02-01      1  46132726  216347601

data:

z <- read.table(text = "date Signal Volume
2016-01-04 NA 37912403
2016-01-05 -1 23258238
2016-01-06 -1 25096183
2016-01-07 -1 45172906
2016-01-08 -1 35402298
2016-01-11 -1 29932385
2016-01-12 -1 28395390
2016-01-13 -1 33410553
2016-01-14 -1 48658623
2016-01-15 1 46132781
2016-01-19 1 30998256
2016-01-20 -1 59051429
2016-01-21 1 30518939
2016-01-22 1 30495387
2016-01-25 1 32482015
2016-01-26 -1 26877080
2016-01-27 -1 58699359
2016-01-28 1 107475327
2016-01-29 1 62739548
2016-02-01 1 46132726", header = T)

Cumulative sum with reset option if multiple conditions are met

Contributing with a base-R solution:

df$amount_cumsum <- 0
df$count_cumsum <- 0
df$condition_met <- 0
reset <- FALSE
for (i in 1:nrow(df)) {
  if (i == 1 | reset) {
    # first row overall, or the row right after the condition was met
    df$amount_cumsum[i] <- df$amount[i]
    df$count_cumsum[i] <- df$count[i]
    reset <- FALSE
  } else if (df$id[i] != df$id[i-1]) {
    # new id: restart both running sums
    df$amount_cumsum[i] <- df$amount[i]
    df$count_cumsum[i] <- df$count[i]
    reset <- FALSE
  } else {
    df$amount_cumsum[i] <- df$amount_cumsum[i-1] + df$amount[i]
    df$count_cumsum[i] <- df$count_cumsum[i-1] + df$count[i]
  }

  if (df$amount_cumsum[i] >= 10 & df$count_cumsum[i] >= 3) {
    df$condition_met[i] <- 1
    reset <- TRUE
  }
}

I've expanded your dataset and benchmarked this code against your solution. The benchmark shows the base-R solution is about 21 times faster than the tidyverse one!

library(tidyverse)

dates <- seq(as.Date("2019-01-01"), as.Date("2020-03-04"), by = "days")

df <- data.frame(
  date = c(sample(dates, 300), sample(dates, 400), sample(dates, 350)),
  id = c(rep("A", 300), rep("B", 400), rep("C", 350)),
  amount = floor(runif(1050, 0, 15)),
  count = floor(runif(1050, 0, 5)),
  stringsAsFactors = FALSE
)

rbenchmark::benchmark(
  "Tidy Solution" = {
    df_tidy <- df %>%
      group_by(id) %>%
      nest(data = c(amount, count)) %>%
      mutate(
        data_accumulate = purrr::accumulate(
          .x = data,
          .f = function(.x, .y) if (max(.x[1]) < 10 | max(.x[2]) < 3) .x + .y else .y
        )
      ) %>%
      unnest(cols = c(data_accumulate)) %>%
      rename(amount_cumsum = amount, count_cumsum = count) %>%
      unnest(cols = c(data)) %>%
      mutate(condition_met = case_when(
        amount_cumsum >= 10 & count_cumsum >= 3 ~ 1,
        TRUE ~ 0
      ))
  },
  "Base-R Solution" = {
    df_base <- df
    df_base$amount_cumsum <- 0
    df_base$count_cumsum <- 0
    df_base$condition_met <- 0
    reset <- FALSE  # to reset the counters
    for (i in 1:nrow(df_base)) {
      if (i == 1 | reset) {
        df_base$amount_cumsum[i] <- df_base$amount[i]
        df_base$count_cumsum[i] <- df_base$count[i]
        reset <- FALSE
      } else if (df_base$id[i] != df_base$id[i-1]) {
        df_base$amount_cumsum[i] <- df_base$amount[i]
        df_base$count_cumsum[i] <- df_base$count[i]
        reset <- FALSE
      } else {
        df_base$amount_cumsum[i] <- df_base$amount_cumsum[i-1] + df_base$amount[i]
        df_base$count_cumsum[i] <- df_base$count_cumsum[i-1] + df_base$count[i]
      }
      if (df_base$amount_cumsum[i] >= 10 & df_base$count_cumsum[i] >= 3) {
        df_base$condition_met[i] <- 1
        reset <- TRUE
      }
    }
  },
  replications = 100
)

gc()
           test replications elapsed relative user.self sys.self user.child sys.child
Base-R Solution          100    3.89    1.000      3.69      0.0         NA        NA
  Tidy Solution          100   84.00   21.594     78.65      0.2         NA        NA

Conditional running count (cumulative sum) with reset in R (dplyr)

We can use case_when to assign the value we need based on our conditions. We then add an additional group_by condition using cumsum, starting a new group whenever the temp column is 0. In the final mutate step we temporarily replace NA values in temp with 0, take the cumsum over it, and then put the NA values back in their places to get the final output.
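
mydata is not shown in the answer; this reconstruction, read off the printed output below, reproduces it:

mydata <- data.frame(
  id        = rep(1, 11),
  age       = c(rep(1, 10), 2),
  block     = c(1, 1, rep(2, 9)),
  trial     = c(1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 1),
  condition = c(1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1),
  accuracy  = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)
)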

library(dplyr)

mydata %>%
  group_by(id, age) %>%
  mutate(temp = case_when(accuracy == 0 & block == 2 & condition == 1 ~ 1,
                          accuracy == 1 & block == 2 & condition == 1 ~ 0,
                          TRUE ~ NA_real_)) %>%
  ungroup() %>%
  group_by(id, age, group = cumsum(replace(temp == 0, is.na(temp), 0))) %>%
  mutate(cumulative = replace(cumsum(replace(temp, is.na(temp), 0)),
                              is.na(temp), NA)) %>%
  select(-temp, -group)


#    group    id   age block trial condition accuracy cumulative
#    <dbl> <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>      <dbl>
#  1     0     1     1     1     1         1        0         NA
#  2     0     1     1     1     2         1        0         NA
#  3     0     1     1     2     1         1        0          1
#  4     0     1     1     2     2         1        0          2
#  5     0     1     1     2     3         1        0          3
#  6     0     1     1     2     4         2        0         NA
#  7     0     1     1     2     5         1        0          4
#  8     1     1     1     2     6         1        1          0
#  9     1     1     1     2     7         1        0          1
# 10     1     1     1     2     8         1        0          2
# 11     1     1     2     2     1         1        0          1

