R cumulative sum by condition with reset
Here's one way with ave
:
# Running count that restarts at 0 wherever a value of df$a is lower than the
# one before it. FALSE is spelled out because `F` is an ordinary variable that
# can be reassigned.
ave(df$a, cumsum(c(FALSE, diff(df$a) < 0)), FUN = seq_along) - 1
[1] 0 1 2 3 0 1 2 3 4 5 6 0 1 0
The running count is grouped using diff(df$a) < 0
, which flags the positions in the vector that are less than their predecessors. We add c(F, ..)
to account for the first position, which has no predecessor. The cumulative sum of that vector creates an index for grouping. The function ave
applies a function within each group of that index; we use seq_along
for a running count. Since seq_along starts at 1, we subtract one with ave(...) - 1
to start from zero.
A similar approach using dplyr
:
library(dplyr)
# Group the rows into runs that begin wherever `a` drops below the previous
# value, then number the rows of each run starting from zero.
mutate(
  group_by(df, cumsum(c(FALSE, diff(a) < 0))),
  row_number() - 1
)
dplyr / R cumulative sum with reset
I think you can use accumulate()
here to help. I've also made a wrapper function so it can be used with different thresholds:
# Build a cumulative-sum function whose running total restarts once it has
# reached `thresh`. Base-R equivalent of purrr::accumulate() with a reset rule,
# so it needs no extra packages.
#
# thresh: numeric scalar. When the running total is >= thresh, the next element
#   starts a new total instead of being added to the old one.
# Returns: a function taking a vector `x` and returning the running totals,
#   one per element of `x`.
sum_reset_at <- function(thresh) {
  force(thresh)  # evaluate now so the closure does not hold a lazy promise
  function(x) {
    if (length(x) == 0L) {
      return(x)  # Reduce() would return NULL for empty input
    }
    step <- function(total, value) {
      # isTRUE() lets an NA total keep propagating as NA instead of making
      # `if` fail on a missing condition.
      if (isTRUE(total >= thresh)) value else total + value
    }
    Reduce(step, x, accumulate = TRUE)
  }
}
# Threshold 5: the total restarts after it reaches 5.
mutate(tib, c = sum_reset_at(5)(a))
# t a c
# <dbl> <dbl> <dbl>
# 1 1 2 2
# 2 2 3 5
# 3 3 1 1
# 4 4 2 3
# 5 5 2 5
# 6 6 3 3
# Threshold 4: same result here, because the totals jump from below 4 to 5.
mutate(tib, c = sum_reset_at(4)(a))
# t a c
# <dbl> <dbl> <dbl>
# 1 1 2 2
# 2 2 3 5
# 3 3 1 1
# 4 4 2 3
# 5 5 2 5
# 6 6 3 3
# Threshold 6: the total keeps growing until it reaches 6.
mutate(tib, c = sum_reset_at(6)(a))
# t a c
# <dbl> <dbl> <dbl>
# 1 1 2 2
# 2 2 3 5
# 3 3 1 6
# 4 4 2 2
# 5 5 2 4
# 6 6 3 7
cumsum by participant and reset on 0 R
Does this work?
library(dplyr)
library(data.table)
# Count consecutive correct answers per participant, restarting after a 0.
df %>%
  # Run id: increases every time the value of Correct changes.
  mutate(grp = rleid(Correct)) %>%
  group_by(Participant, grp) %>%
  mutate(Count = cumsum(Correct)) %>%
  # A grouping column cannot be dropped while the data is still grouped
  # (select() silently adds it back), so ungroup before removing grp.
  ungroup() %>%
  select(-grp)
# # A tibble: 10 x 3
#    Participant Correct Count
#    <chr>         <dbl> <dbl>
#  1 A                 1     1
#  2 A                 1     2
#  3 A                 1     3
#  4 A                 0     0
#  5 A                 1     1
#  6 B                 1     1
#  7 B                 1     2
#  8 B                 0     0
#  9 B                 1     1
# 10 B                 1     2
Toy data:
# Toy data: two participants (A and B) with five trials each.
df <- data.frame(
  Participant = rep(c("A", "B"), each = 5),
  Correct = c(1, 1, 1, 0, 1, 1, 1, 0, 1, 1)
)
R cumulative sum using dplyr with reset
library(dplyr)
# Number consecutive years within each state/p combination.
data_right %>%
  group_by(state, p) %>%
  # A gap of more than one year between neighbouring rows starts a new run.
  mutate(grp = cumsum(c(TRUE, diff(as.integer(Year)) > 1))) %>%
  group_by(grp, .add = TRUE) %>%
  mutate(cy = seq_len(n())) %>%
  ungroup() %>%
  select(-grp)
# # A tibble: 12 x 5
# state p Year Consecutive_Yrs cy
# <chr> <chr> <chr> <dbl> <int>
# 1 NY n 1973 1 1
# 2 NY n 1974 2 2
# 3 NY n 1977 1 1
# 4 NY n 1978 2 2
# 5 NY p 1988 1 1
# 6 NY p 1989 2 2
# 7 PA n 1991 1 1
# 8 PA n 1992 2 2
# 9 PA n 1993 3 3
# 10 PA p 1920 1 1
# 11 PA p 1929 1 1
# 12 PA p 1931 1 1
Assumes the data is already ordered by Year
.
Data:
# Example input. Built with base data.frame() so the snippet needs no package
# beyond dplyr; stringsAsFactors = FALSE keeps Year as character on R < 4.0,
# where as.integer() on a factor would return level codes.
data_right <- data.frame(
  state = c("NY", "NY", "NY", "NY", "NY", "NY",
            "PA", "PA", "PA", "PA", "PA", "PA"),
  p = c("n", "n", "n", "n", "p", "p", "n", "n", "n", "p", "p", "p"),
  Year = c("1973", "1974", "1977", "1978", "1988", "1989",
           "1991", "1992", "1993", "1920", "1929", "1931"),
  Consecutive_Yrs = c(1, 2, 1, 2, 1, 2, 1, 2, 3, 1, 1, 1),
  stringsAsFactors = FALSE
)
Dplyr solution for cumsum with a factor reset
You can start a new group after every row where OilChanged == 'Yes'
and take the cumsum
of the Diff
values in each group.
library(dplyr)
# The row after each oil change opens a new group; Diff is summed per group.
df %>%
  group_by(grp = cumsum(lag(OilChanged == 'Yes', default = FALSE))) %>%
  mutate(newcumsum = cumsum(Diff)) %>%
  ungroup() %>%
  select(-grp)
# OilChanged Odometer Diff CumSum newcumsum
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 No 300 0 0 0
# 2 No 350 50 50 50
# 3 Yes 410 60 110 110
# 4 No 420 10 10 10
# 5 No 430 10 20 20
# 6 No 450 20 40 40
# 7 No 500 50 90 90
# 8 No 600 100 190 190
# 9 No 600 0 190 190
#10 No 600 0 190 190
#11 No 650 50 240 240
#12 Yes 660 10 250 250
#13 No 700 40 40 40
cumsum with a condition to restart in R
You may use cumsum
to create groups as well.
library(dplyr)
# The row after each port == 0 opens a new group; G is summed within groups.
df <- ungroup(
  mutate(
    group_by(df, group = cumsum(dplyr::lag(port == 0, default = 0))),
    cumsum_G = cumsum(G)
  )
)
df
# inv ass port G group cumsum_G
# <chr> <chr> <int> <int> <dbl> <int>
#1 i x 2 1 0 1
#2 i x 2 0 0 1
#3 i x 0 1 0 2
#4 i x 3 0 1 0
#5 i x 3 1 1 1
You may remove the group
column from output using %>% select(-group)
.
data
# Example input: five rows for one investor/asset pair.
df <- data.frame(
  inv = rep("i", 5),
  ass = rep("x", 5),
  port = c(2L, 2L, 0L, 3L, 3L),
  G = c(1L, 0L, 1L, 0L, 1L),
  stringsAsFactors = FALSE
)
R Cumulative Sum with a condition and a reset
This can be achieved by:
library(tidyverse)
library(data.table)
# Signed cumulative Volume that restarts every time Signal changes.
z %>%
  # rleid() gives a run id that advances whenever Signal changes.
  group_by(run = rleid(Signal)) %>%
  mutate(cum = Signal * cumsum(Volume)) %>%  # cumsum within each run
  ungroup() %>%                              # ungroup so the helper can go
  # Drop the helper by name: a positional select(-4) would remove the wrong
  # column if z ever had a different number of columns.
  select(-run)
or without data.table
by using rle
:
# Same result without data.table: build the run ids from base rle().
# rle() names its component `lengths`; `$length` only worked through partial
# matching. The run lengths are computed once and numbered with seq_along().
run_lengths <- rle(z$Signal)$lengths
z %>%
  mutate(rl = rep(seq_along(run_lengths), times = run_lengths)) %>%
  group_by(rl) %>%
  mutate(cum = Signal * cumsum(Volume)) %>%
  ungroup() %>%
  select(-rl)  # drop the helper by name rather than by position
#output
date Signal Volume cum
<fct> <int> <int> <int>
1 2016-01-04 NA 37912403 NA
2 2016-01-05 - 1 23258238 - 23258238
3 2016-01-06 - 1 25096183 - 48354421
4 2016-01-07 - 1 45172906 - 93527327
5 2016-01-08 - 1 35402298 -128929625
6 2016-01-11 - 1 29932385 -158862010
7 2016-01-12 - 1 28395390 -187257400
8 2016-01-13 - 1 33410553 -220667953
9 2016-01-14 - 1 48658623 -269326576
10 2016-01-15 1 46132781 46132781
11 2016-01-19 1 30998256 77131037
12 2016-01-20 - 1 59051429 - 59051429
13 2016-01-21 1 30518939 30518939
14 2016-01-22 1 30495387 61014326
15 2016-01-25 1 32482015 93496341
16 2016-01-26 - 1 26877080 - 26877080
17 2016-01-27 - 1 58699359 - 85576439
18 2016-01-28 1 107475327 107475327
19 2016-01-29 1 62739548 170214875
20 2016-02-01 1 46132726 216347601
data:
# Example input: 20 trading days with a signal (-1, 1 or NA) and a volume.
# header = TRUE is spelled out because `T` can be reassigned.
z <- read.table(text = "date Signal Volume
2016-01-04 NA 37912403
2016-01-05 -1 23258238
2016-01-06 -1 25096183
2016-01-07 -1 45172906
2016-01-08 -1 35402298
2016-01-11 -1 29932385
2016-01-12 -1 28395390
2016-01-13 -1 33410553
2016-01-14 -1 48658623
2016-01-15 1 46132781
2016-01-19 1 30998256
2016-01-20 -1 59051429
2016-01-21 1 30518939
2016-01-22 1 30495387
2016-01-25 1 32482015
2016-01-26 -1 26877080
2016-01-27 -1 58699359
2016-01-28 1 107475327
2016-01-29 1 62739548
2016-02-01 1 46132726", header = TRUE)
Cumulative sum with reset option if multiple conditions are met
Contributing with a base-R solution:
# Running totals of `amount` and `count` within each `id`. Once a row has
# amount_cumsum >= 10 and count_cumsum >= 3 it is flagged in condition_met and
# both totals restart on the following row.
#
# The totals are built in plain vectors and attached to df once at the end,
# which avoids modifying the data frame on every iteration and also works for
# a data frame with zero rows.
n_rows <- nrow(df)
amount_cumsum <- numeric(n_rows)
count_cumsum <- numeric(n_rows)
condition_met <- numeric(n_rows)
reset <- FALSE  # TRUE when the previous row met the condition
for (i in seq_len(n_rows)) {
  # Start afresh on the first row, after a reset, or when the id changes.
  # `||` short-circuits, so df$id[i - 1] is never evaluated for i == 1.
  if (i == 1L || reset || df$id[i] != df$id[i - 1L]) {
    amount_cumsum[i] <- df$amount[i]
    count_cumsum[i] <- df$count[i]
    reset <- FALSE
  } else {
    amount_cumsum[i] <- amount_cumsum[i - 1L] + df$amount[i]
    count_cumsum[i] <- count_cumsum[i - 1L] + df$count[i]
  }
  if (amount_cumsum[i] >= 10 && count_cumsum[i] >= 3) {
    condition_met[i] <- 1
    reset <- TRUE
  }
}
df$amount_cumsum <- amount_cumsum
df$count_cumsum <- count_cumsum
df$condition_met <- condition_met
I've expanded your dataset and benchmarked this code against your solution. The benchmark shows the base-R solution running about 21 times faster than the tidyverse one!
library(tidyverse)
# Synthetic benchmark data: 1050 rows for three ids (300 A, 400 B, 350 C) with
# randomly drawn dates, amounts in 0..14 and counts in 0..4. No seed is set, so
# the values differ from run to run.
dates <- seq(as.Date("2019-01-01"), as.Date("2020-03-04"), by = "days")
df <- data.frame(
  date = c(sample(dates, 300), sample(dates, 400), sample(dates, 350)),
  id = c(rep("A", 300), rep("B", 400), rep("C", 350)),
  amount = floor(runif(1050, 0, 15)),
  count = floor(runif(1050, 0, 5)),
  stringsAsFactors = FALSE
)
# Time the two implementations on the same df; each named expression is
# evaluated `replications = 100` times.
rbenchmark::benchmark(
# Tidyverse variant: nest amount/count into the list-column `data`, then
# accumulate over that list-column within each id.
"Tidy Solution" = {
df_tidy <- df %>%
group_by(id) %>%
nest(data = c(amount, count)) %>%
mutate(
# Keep adding the next element while either accumulated column is still below
# its threshold (10 for the first column, 3 for the second); otherwise restart
# from the next element.
data_accumulate = purrr::accumulate(.x = data, .f = function(.x, .y) if (max(.x[1]) < 10 | max(.x[2]) < 3) .x + .y else .y)
) %>%
# Unnest the accumulated values first and rename them, then unnest `data` to
# bring back the original amount and count columns.
unnest(cols = c(data_accumulate)) %>%
rename(amount_cumsum = amount, count_cumsum = count) %>%
unnest(cols = c(data)) %>%
# Flag rows where both running totals have reached their thresholds.
mutate(condition_met = case_when(
amount_cumsum >= 10 & count_cumsum >= 3 ~ 1,
TRUE ~ 0)
)
},
# Base-R variant: a single pass over the rows of a copy of df.
"Base-R Solution" = {
df_base <- df
df_base$amount_cumsum <- 0
df_base$count_cumsum <- 0
df_base$condition_met <- 0
reset = F # to reset the counters
for (i in 1:nrow(df_base)) {
# First row, or the previous row met the condition: start new totals.
if (i == 1 | reset) {
df_base$amount_cumsum[i] = df_base$amount[i]
df_base$count_cumsum[i] = df_base$count[i]
reset = F
# The id differs from the previous row: start new totals as well.
} else if (df_base$id[i] != df_base$id[i-1]) {
df_base$amount_cumsum[i] = df_base$amount[i]
df_base$count_cumsum[i] = df_base$count[i]
reset = F
# Otherwise add this row to the totals of the previous row.
} else {
df_base$amount_cumsum[i] = df_base$amount_cumsum[i-1] + df_base$amount[i]
df_base$count_cumsum[i] = df_base$count_cumsum[i-1] + df_base$count[i]
}
# Both thresholds reached: flag the row and restart on the next one.
if (df_base$amount_cumsum[i] >= 10 & df_base$count_cumsum[i] >= 3) {
df_base$condition_met[i] = 1
reset = T
}
}
},
replications = 100)
# Trigger a garbage collection once the benchmark has finished.
gc()
test replications elapsed relative user.self sys.self user.child sys.child
Base-R Solution 100 3.89 1.000 3.69 0.0 NA NA
Tidy Solution 100 84.00 21.594 78.65 0.2 NA NA
Conditional running count (cumulative sum) with reset in R (dplyr)
We can use case_when
to assign the value we need based on our conditions. We then add an additional group_by
condition using cumsum
to start a new group whenever the temp
column is 0. In the final mutate
step we temporarily replace
NA
values in temp
with 0, then take cumsum
over it and put the NA
values back in their place to get the final output.
library(dplyr)
# Running count of errors (accuracy == 0) in block 2 / condition 1 that
# restarts after a correct response, computed separately per id and age.
mydata %>%
  group_by(id, age) %>%
  # temp: 1 = error to count, 0 = correct response (reset), NA = row ignored.
  mutate(temp = case_when(accuracy == 0 & block == 2 & condition == 1 ~ 1,
                          accuracy == 1 & block == 2 & condition == 1 ~ 0,
                          TRUE ~ NA_real_)) %>%
  ungroup() %>%
  # A new group starts at every row where temp is 0; NA rows never start one.
  group_by(id, age, group = cumsum(replace(temp == 0, is.na(temp), 0))) %>%
  # Count with NA treated as 0, then restore NA on the ignored rows.
  mutate(cumulative = replace(cumsum(replace(temp, is.na(temp), 0)),
                              is.na(temp), NA)) %>%
  # `group` is a grouping column, so it can only be dropped after ungrouping.
  ungroup() %>%
  select(-temp, -group)
#       id   age block trial condition accuracy cumulative
#    <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>      <dbl>
#  1     1     1     1     1         1        0         NA
#  2     1     1     1     2         1        0         NA
#  3     1     1     2     1         1        0          1
#  4     1     1     2     2         1        0          2
#  5     1     1     2     3         1        0          3
#  6     1     1     2     4         2        0         NA
#  7     1     1     2     5         1        0          4
#  8     1     1     2     6         1        1          0
#  9     1     1     2     7         1        0          1
# 10     1     1     2     8         1        0          2
# 11     1     2     2     1         1        0          1
Related Topics
Equivalent to Unix "Less" Command Within R Console
What Leads the First Element of a Printed List to Be Enclosed with Backticks in R V3.5.1
How to Plot a Stacked and Grouped Bar Chart in Ggplot
Should I Use a Data.Frame or a Matrix
How to Convert Data.Frame Column from Factor to Numeric
How to Multiply Data Frame by Vector
Plot One Numeric Variable Against N Numeric Variables in N Plots
Plotting a 3D Surface Plot with Contour Map Overlay, Using R
How to Get Coefficients and Their Confidence Intervals in Mixed Effects Models
Dplyr::Group_By_ with Character String Input of Several Variable Names
Adding New Columns to a Data.Table By-Reference Within a Function Not Always Working
How to Append Rows to an R Data Frame
How to Directly Select the Same Column from All Nested Lists Within a List
Convert Four Digit Year Values to Class Date
Display Weighted Mean by Group in the Data.Frame
Difference Between Parent.Frame() and Parent.Env() in R; How Do They Differ in Call by Reference