Cumulative sum until maximum reached, then repeat from zero in the next row
I think this is best done with a for loop, can't think of a function that could do so out of the box. The following should do what you want (if I understand you correctly).
current.sum <- 0
for (c in 1:nrow(caribou.sub)) {
current.sum <- current.sum + caribou.sub[c, "difference"]
carribou.sub[c, "difference_sum"] <- current.sum
if (current.sum >= 1470) {
caribou.sub[c, "keep"] <- 1
current.sum <- 0
}
}
Feel free to comment if it does not exactly what you want. But as pointed out by alexwhan, your description is not completely clear.
R: Sum until 0 is reached and then restart
Here is a data.table
option. Your grouping variable should not be Run
but rleid(Run)
.
library(data.table)
dt <- fread(text)
dt[, value := cumsum(Appliance), by = rleid(Run)]
dt
# V1 Home Date Time Appliance Run value
# 1: 679 2 1/21/2017 1:30:00 0 1 0
# 2: 680 2 1/21/2017 1:45:00 0 1 0
# 3: 681 2 1/21/2017 2:00:00 0 1 0
# 4: 682 2 1/21/2017 2:15:00 0 1 0
# 5: 683 2 1/21/2017 2:30:00 804 0 804
# 6: 684 2 1/21/2017 2:45:00 556 0 1360
# 7: 685 2 1/21/2017 3:00:00 844 0 2204
# 8: 686 2 1/21/2017 3:15:00 396 0 2600
# 9: 687 2 1/21/2017 3:30:00 392 0 2992
#10: 688 2 1/21/2017 3:45:00 1220 0 4212
#11: 689 2 1/21/2017 4:00:00 0 1 0
#12: 690 2 1/21/2017 4:15:00 0 1 0
#13: 691 2 1/21/2017 4:30:00 0 1 0
#14: 692 2 1/21/2017 4:45:00 0 1 0
#15: 783 2 1/22/2017 3:30:00 0 1 0
#16: 784 2 1/22/2017 3:45:00 244 0 244
#17: 785 2 1/22/2017 4:00:00 1068 0 1312
#18: 786 2 1/22/2017 4:15:00 44 0 1356
#19: 787 2 1/22/2017 4:30:00 1240 0 2596
#20: 788 2 1/22/2017 4:45:00 40 0 2636
#21: 789 2 1/22/2017 5:00:00 1608 0 4244
#22: 790 2 1/22/2017 5:15:00 0 1 0
#23: 791 2 1/22/2017 5:30:00 0 1 0
#24: 792 2 1/22/2017 5:45:00 0 1 0
#25: 793 2 1/22/2017 6:00:00 0 1 0
#26: 794 2 1/22/2017 6:15:00 0 1 0
#27: 795 2 1/22/2017 6:30:00 0 1 0
#28: 796 2 1/22/2017 6:45:00 0 1 0
#29: 797 2 1/22/2017 7:00:00 0 1 0
#30: 798 2 1/22/2017 7:15:00 0 1 0
In base R
we can do
df1 <- read.table(text = text, stringsAsFactors = FALSE, header = TRUE)
rle_Run <- rle(df1$Run)
df1$value <- with(df1, ave(Appliance, rep(seq_along(rle_Run$lengths), rle_Run$lengths), FUN = cumsum))
data
text <- " Home Date Time Appliance Run value
679 2 1/21/2017 1:30:00 0 1 0
680 2 1/21/2017 1:45:00 0 1 0
681 2 1/21/2017 2:00:00 0 1 0
682 2 1/21/2017 2:15:00 0 1 0
683 2 1/21/2017 2:30:00 804 0 1
684 2 1/21/2017 2:45:00 556 0 804
685 2 1/21/2017 3:00:00 844 0 1360
686 2 1/21/2017 3:15:00 396 0 2204
687 2 1/21/2017 3:30:00 392 0 2600
688 2 1/21/2017 3:45:00 1220 0 2992
689 2 1/21/2017 4:00:00 0 1 0
690 2 1/21/2017 4:15:00 0 1 0
691 2 1/21/2017 4:30:00 0 1 0
692 2 1/21/2017 4:45:00 0 1 0
783 2 1/22/2017 3:30:00 0 1 0
784 2 1/22/2017 3:45:00 244 0 4212
785 2 1/22/2017 4:00:00 1068 0 4456
786 2 1/22/2017 4:15:00 44 0 5524
787 2 1/22/2017 4:30:00 1240 0 5568
788 2 1/22/2017 4:45:00 40 0 6808
789 2 1/22/2017 5:00:00 1608 0 6848
790 2 1/22/2017 5:15:00 0 1 0
791 2 1/22/2017 5:30:00 0 1 0
792 2 1/22/2017 5:45:00 0 1 0
793 2 1/22/2017 6:00:00 0 1 0
794 2 1/22/2017 6:15:00 0 1 0
795 2 1/22/2017 6:30:00 0 1 0
796 2 1/22/2017 6:45:00 0 1 0
797 2 1/22/2017 7:00:00 0 1 0
798 2 1/22/2017 7:15:00 0 1 0"
Rolling Cummulative Sum of a Column's Values Until A Condition is Met
You could try using this for loop:
lastvalue = 0
newcum = []
for i in df['a']:
if lastvalue >= 5:
lastvalue = i
else:
lastvalue += i
newcum.append(lastvalue)
df['a_cum_sum'] = newcum
print(df)
Output:
a a_cum_sum
0 2 2
1 3 5
2 0 0
3 5 5
4 1 1
5 3 4
6 1 5
7 2 2
8 2 4
9 1 5
The above for loop iterates through the a
column, and when the cumulative sum is 5 or more, it resets it to 0
then adds the a
column's value i
, but if the cumulative sum is lower than 5, it just adds the a
column's value i
(the iterator).
R how to cumulative sums up until condition, including the row where the condition changes
We can create a grouping column based on the logical column by taking the cumulative sum and getting the lag
of that output, then do the cumsum
on the column 'b'
library(dplyr)
df1 %>%
group_by(grp = lag(cumsum(a), default = 0)) %>%
mutate(c = row_number(), d = cumsum(b)) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 7 x 4
# a b c d
# <lgl> <dbl> <int> <dbl>
#1 FALSE 30.5 1 30.5
#2 FALSE 27.8 2 58.3
#3 FALSE 26.9 3 85.3
#4 TRUE 41.7 4 127.
#5 FALSE 2.86 1 2.86
#6 FALSE 16.3 2 19.2
#7 TRUE 40.2 3 59.4
Or using data.table
with the same logic, grouped by the shift
of cumulative sum of 'a', create the 'd' column as the cumsum
of 'b',
library(data.table)
setDT(df1)[, c('c', 'd') := .(1:.N, cumsum(b)),
.(grp = shift(cumsum(a), fill = 0))]
data
df1 <- structure(list(a = c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
TRUE), b = c(30.53, 27.8, 26.93, 41.66, 2.86, 16.31, 40.19)),
class = "data.frame", row.names = c(NA,
-7L))
Sum until a given value is reached
You can use cumsum(mydata$Col1 == 4) == 0
to get a logical vector of whether or not 4 has been reached in Col1. Then you can use simple indexing to grab the relevant elements from Col2:
sum(mydata$Col2[cumsum(mydata$Col1 == 4) == 0])
# [1] 40.5
Rolling sum till a certain value is reached, plus calculated duration
A way to solve this efficiently is a procedural solution with two cursors:
One explicit cursor and another implicit cursor of the FOR
loop:
CREATE OR REPLACE FUNCTION foo()
RETURNS TABLE (dt timestamp
, val real
, sum_value real
, time_at_sum timestamp
, duration interval) AS
$func$
DECLARE
_bound real := 1.0; -- your bound here
cur CURSOR FOR SELECT * FROM sample s ORDER BY s.dt; -- in chronological order
s sample; -- cursor row
BEGIN
OPEN cur;
FETCH cur INTO time_at_sum, sum_value; -- fetch first row into target
FOR dt, val IN -- primary pass over table
SELECT x.dt, x.value FROM sample x ORDER BY s.dt
LOOP
WHILE sum_value <= _bound LOOP
FETCH cur INTO s;
IF NOT FOUND THEN -- end of table
sum_value := NULL; time_at_sum := NULL;
EXIT; -- exits inner loop
END IF;
sum_value := sum_value + s.value;
END LOOP;
IF sum_value > _bound THEN -- to catch end-of-table
time_at_sum := s.dt;
END IF;
duration := time_at_sum - dt;
RETURN NEXT;
sum_value := sum_value - val; -- subtract previous row before moving on
END LOOP;
END
$func$ LANGUAGE plpgsql;
Call:
SELECT * FROM foo();
db<>fiddle here
Should perform nicely since it only needs 2 scans over the table.
Note that I implemented > _bound
like your description requires, not >= _bound
like your result indicates. Easy to change either way.
Assumes the value column to be NOT NULL
.
Related:
- Window Functions or Common Table Expressions: count previous rows within range
CumSum until certain value reached and then stop and print 'MAX'
I got something like this now:
def cumsum(x):
q = np.array(range (0,x))
z = q.cumsum()
v = np.clip(z, 0, 1000)
solution = ['MAX VALUE REACHED' if i == 1000 else i for i in v]
if x == 0:
return[0]
else:
return solution
My output is this
[0,
1,
3,
6,
10,
15,
21,
28,
36,
45,
55,
66,
78,
91,
105,
120,
136,
153,
171,
190,
210,
231,
253,
276,
300,
325,
351,
378,
406,
435,
465,
496,
528,
561,
595,
630,
666,
703,
741,
780,
820,
861,
903,
946,
990,
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED',
'MAX VALUE REACHED']
Now I have to stop the calculation after one time 'MAX VALUE REACHED'
Any sugestions?
Proposed alternative
Rather than using clip
create a new list that contains only the cumsum
items that are less than 1,000, and when you encounter an item that is at least 1,000 break out of the loop.
import numpy as np
q = np.array(range(101))
z = q.cumsum()
temp = []
for _ in z:
if _ < 1000:
temp.append(_)
else:
break
solution = np.array(temp)
print (solution)
print('MAX VALUE REACHED')
Output:
[ 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153
171 190 210 231 253 276 300 325 351 378 406 435 465 496 528 561 595 630
666 703 741 780 820 861 903 946 990]
MAX VALUE REACHED
Cumulative sum in pandas starting with a zero and ending with the sum of all but the last entry respecting groups
Another option is to use .groupby()
twice, as follows:
Take the DataFrameGroupBy.shift()
value of B
under A
so that for each group of A
, the first entry of B
will be reset and become NaN
for later .fillna()
to 0
.
Further grouped by A
for GroupBy.cumsum()
within the local sequence of A
to get the desired output:
df['C'] = (df.groupby('A')['B'].shift()
.groupby(df['A']).cumsum()
.fillna(0, downcast='infer')
)
This solution is vectorized as well as supporting non-contiguous groups too!
Result:
print(df)
A B C
0 1 5 0
1 1 6 5
2 2 3 0
3 2 4 3
4 2 5 7
5 3 2 0
5 3 7 2
6 4 3 0
Edit
If you are to groupby more than one column and you got "KeyError", check whether your syntax is correctly entered, for example:
If you groupby 2 columns year
and income
, you can use:
df['C'] = (df.groupby(['year', 'income'])['B'].shift()
.groupby([df['year'], df['income']]).cumsum()
.fillna(0, downcast='infer')
)
Pandas supports both syntax with or without quoting df
passing parameter to .groupby()
. However, for any groupby()
that the entity being grouped is not df
itself, we may not be able to use the abbreviated form to quote just the column labels only e.g. 'year'
, we need to use the full column name e.g. df['year']
instead.
Cumulative sum within group till threshold is reached
You can take the cumulative sum for each ID
and select rows until the value becomes 15.
This can be done in base R :
subset(df, as.logical(ave(No_ind, ID, FUN = function(x)
seq_along(x) <= which.max(cumsum(x) >= 15))))
# month ID No_ind
#1 Jun 1 8
#2 Jul 1 5
#3 Aug 1 2
#4 Oct 2 15
Or in dplyr
library(dplyr)
df %>% group_by(ID) %>% slice(seq_len(which.max(cumsum(No_ind) >= 15)))
and data.table
:
library(data.table)
setDT(df)[, .SD[seq_len(which.max(cumsum(No_ind) >= 15))], ID]
data
Make sure numbers are treated as numbers and not as strings.
df <- type.convert(df, as.is = TRUE)
Related Topics
How to Parametrize Function Calls in Dplyr 0.7
Find Which Interval Row in a Data Frame That Each Element of a Vector Belongs In
How to Parse Year + Week Number in R
R - Emulate the Default Behavior of Hist() with Ggplot2 for Bin Width
What Are the Differences Between Community Detection Algorithms in Igraph
Inserting a Table Under the Legend in a Ggplot2 Histogram
How to Group My Date Variable into Month/Year in R
Changing the Line Type in the Ggplot Legend
Listing Contents of an R Data File Without Loading
Extract a Column from a Data.Table as a Vector, by Position
Find the N Most Common Values in a Vector
R Package Lattice Won't Plot If Run Using Source()
Select Rows of a Matrix That Meet a Condition
Extract Prediction Band from Lme Fit
How to Format Axis Labels with Exponents with Ggplot2 and Scales