Conditional Summing (R)

Creating a function to run a conditional Sum in R

Using tidyverse / collapse

For arbitrary number of lead and lags the collapse package offers a nice function flag, which has further arguments to specify columns (cols), or grouping variables g.

library(dplyr)
f <- function(df, n){
  df %>%
    collapse::flag(-n:n) %>%
    transmute(Ones, Thats, gap = rowSums(., na.rm = T) - 1) %>%
    filter(Ones == 1)
}

x <- data.frame (
  'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0), 
  'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))

# we can now specify how many lags to count:
f(x, 1) 
  Ones Thats gap
1    1     0   5
2    1     4  17
3    1     1   4
f(x, 2)
  Ones Thats gap
1    1     0   8
2    1     4  29
3    1     1  16

Or if you want to specify the number of gaps to compute, we can simplify the function to

f <- function(df, n){
  df %>%
    collapse::flag(-n:n) %>%
    rowSums(na.rm = T) - 1
}

x %>%
  mutate(gap1 = f(., 1),
         gap2 = f(., 2)) %>%
  filter(Ones == 1)
  Ones Thats gap1 gap2
1    1     0    5    8
2    1     4   17   29
3    1     1    4   16

Base R
If you like terse functions:

f <- Vectorize(\(df, n) rowSums(collapse::flag(df, -n:n), na.rm = T) - 1, "n")
x[paste0("gap", 1:2)] <- f(x, 1:2) ; subset(x, Ones == 1)
   Ones Thats gap1 gap2
1     1     0    5    8
6     1     4   17   29
11    1     1    4   16

conditional sum of columns in R

df$solution <- rowSums(df[-1] * (df[,-1]>=df[,1]), na.rm = TRUE)
df
  key   A   B   C solution
1 0.5 0.7 0.7 0.1      1.4
2 0.8 0.6 0.8  NA      0.8
3 0.2  NA 0.9  NA      0.9

R: Conditional summing in R

One option is to extract the row where 'Name' is 'Matt', without the first column create a logical vector ('i1'), use that to subset the columns and get the rowSums

i1 <- df1[df1$Name == "Matt",-1] > 25
df1$Matt <- rowSums(df1[-1][,i1], na.rm = TRUE)

Or using tidyverse

library(dplyr)
df1 %>% 
   mutate(Matt = rowSums(select(cur_data(), 
            where(~ is.numeric(.) && .[Name == 'Matt'] > 25))))

-output

#   Name score score.1 score.2 score.3 score.4 Matt
#1  Alex    31      15      18      22      23   68
#2   Pat    37      18      29      15      28   70
#3  Matt    33      27      18      88       9  148
#4 James    12      36      32      13      21   61

data

df1 <- structure(list(Name = c("Alex", "Pat", "Matt", "James"), score = c(31L, 
37L, 33L, 12L), score.1 = c(15L, 18L, 27L, 36L), score.2 = c(18L, 
29L, 18L, 32L), score.3 = c(22L, 15L, 88L, 13L), score.4 = c(23L, 
28L, 9L, 21L)), class = "data.frame", row.names = c(NA, -4L))

How do I do a conditional sum which only looks between certain date criteria

Here's a dplyr solution which will produce the desired result (14 rows) as specified in the question. Note that it takes care of duplicate date entries, for example, 2013-01-04 for user x.

# define a custom function to be used in the dplyr chain
myfunc <- function(x){
  with(x, sapply(event_number, function(y) 
    sum(items_bought[event_number <= event_number[y] & date[y] - date <= 2])))
}

require(dplyr)                 #install and load into your library

df %>%
  mutate(date = as.Date(as.character(date))) %>%
  group_by(user) %>%
  do(data.frame(., cum_items_bought_3_days = myfunc(.))) %>%
  select(-c(items_bought, event_number))

#         date user cum_items_bought_3_days
#1  2013-01-01    x                       2
#2  2013-01-02    x                       3
#3  2013-01-03    x                       3
#4  2013-01-04    x                       1
#5  2013-01-04    x                       2
#6  2013-01-04    x                       4
#7  2013-01-05    x                       6
#8  2013-01-06    x                       7
#9  2013-01-01    y                       1
#10 2013-01-02    y                       2
#11 2013-01-03    y                       2
#12 2013-01-04    y                       6
#13 2013-01-05    y                      11
#14 2013-01-06    y                      12

In my answer I use a custom function myfunc inside a dplyr chain. This is done using the do operator from dplyr. The custom function is passed the subsetted df by user groups. It then uses sapply to pass each event_number and calculate the sums of items_bought. The last line of the dplyr chain deselects the undesired columns.

Let me know if you'd like a more detailed explanation.

Edit after comment by OP:

If you need more flexibility to also conditionally sum up other columns, you can adjust the code as follows. I assume here, that the other columns should be summed up the same way as items_bought. If that is not correct, please specify how you want to sum up the other columns.

I first create two additional columns with random numbers in the data (I'll post a dput of the data at the bottom of my answer):

set.seed(99)   # for reproducibility only

df$newCol1 <- sample(0:10, 14, replace=T)
df$newCol2 <- runif(14)

df
#         date user items_bought event_number newCol1     newCol2
#1  2013-01-01    x            2            1       6 0.687800094
#2  2013-01-02    x            1            2       1 0.640190769
#3  2013-01-03    x            0            3       7 0.357885360
#4  2013-01-04    x            0            4      10 0.102584999
#5  2013-01-04    x            1            5       5 0.097790922
#6  2013-01-04    x            2            6      10 0.182886256
#7  2013-01-05    x            3            7       7 0.227903474
#8  2013-01-06    x            1            8       3 0.080524150
#9  2013-01-01    y            1            1       3 0.821618422
#10 2013-01-02    y            1            2       1 0.591113977
#11 2013-01-03    y            0            3       6 0.773389019
#12 2013-01-04    y            5            4       5 0.350085977
#13 2013-01-05    y            6            5       2 0.006061323
#14 2013-01-06    y            1            6       7 0.814506223

Next, you can modify myfunc to take 2 arguments, instead of 1. The first argument will remain the subsetted data.frame as before (represented by . inside the dplyr chain and x in the function definition of myfunc), while the second argument to myfunc will specify the column to sum up (colname).

myfunc <- function(x, colname){
  with(x, sapply(event_number, function(y) 
    sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}

Then, you can use myfunc several times if you want to conditionally sum up several columns:

df %>%
  mutate(date = as.Date(as.character(date))) %>%
  group_by(user) %>%
  do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
                   newCol1Sums = myfunc(., "newCol1"),            
                   newCol2Sums = myfunc(., "newCol2"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))

#         date user cum_items_bought_3_days newCol1Sums newCol2Sums
#1  2013-01-01    x                       2           6   0.6878001
#2  2013-01-02    x                       3           7   1.3279909
#3  2013-01-03    x                       3          14   1.6858762
#4  2013-01-04    x                       1          18   1.1006611
#5  2013-01-04    x                       2          23   1.1984520
#6  2013-01-04    x                       4          33   1.3813383
#7  2013-01-05    x                       6          39   0.9690510
#8  2013-01-06    x                       7          35   0.6916898
#9  2013-01-01    y                       1           3   0.8216184
#10 2013-01-02    y                       2           4   1.4127324
#11 2013-01-03    y                       2          10   2.1861214
#12 2013-01-04    y                       6          12   1.7145890
#13 2013-01-05    y                      11          13   1.1295363
#14 2013-01-06    y                      12          14   1.1706535

Now you created conditional sums of the columns items_bought, newCol1 and newCol2. You can also leave out any of the sums in the dplyr chain or add more columns to sum up.

Edit #2 after comment by OP:

To calculate the cumulative sum of distinct (unique) items bought per user, you could define a second custom function myfunc2 and use it inside the dplyr chain. This function is also flexible as myfunc so that you can define the columns to which you want to apply the function.

The code would then be:

myfunc <- function(x, colname){
  with(x, sapply(event_number, function(y) 
    sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}

myfunc2 <- function(x, colname){
  cumsum(sapply(seq_along(x[[colname]]), function(y) 
    ifelse(!y == 1 & x[y, colname] %in% x[1:(y-1), colname], 0, 1)))
}

require(dplyr)                 #install and load into your library

dd %>%
  mutate(date = as.Date(as.character(date))) %>%
  group_by(user) %>%
  do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
                   newCol1Sums = myfunc(., "newCol1"),
                   newCol2Sums = myfunc(., "newCol2"),
                   distinct_items_bought = myfunc2(., "items_bought"))) %>%   
  select(-c(items_bought, event_number, newCol1, newCol2))

Here is the data I used:

dput(df)
structure(list(date = structure(c(1L, 2L, 3L, 4L, 4L, 4L, 5L, 
6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("2013-01-01", "2013-01-02", 
"2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"), class = "factor"), 
user = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"), 
items_bought = c(2L, 1L, 0L, 0L, 1L, 2L, 3L, 1L, 1L, 1L, 
0L, 5L, 6L, 1L), event_number = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L), newCol1 = c(6L, 1L, 7L, 
10L, 5L, 10L, 7L, 3L, 3L, 1L, 6L, 5L, 2L, 7L), newCol2 = c(0.687800094485283, 
0.640190769452602, 0.357885359786451, 0.10258499882184, 0.0977909218054265, 
0.182886255905032, 0.227903473889455, 0.0805241498164833, 
0.821618422167376, 0.591113976901397, 0.773389018839225, 
0.350085976999253, 0.00606132275424898, 0.814506222726777
)), .Names = c("date", "user", "items_bought", "event_number", 
"newCol1", "newCol2"), row.names = c(NA, -14L), class = "data.frame")

Conditional sum data in R

We create a logical vector based on the 'Name' column ('i1'), then use the OR (| condition on the value 25 and -25 with relational operators (> or < respectively) to create a logical index for the columns. Subset the dataset based on the 'i2', and return the rowSums of those columns and assign it to 'sum' column

i1 <-df1$Name == "Matt" 

i2 <- df1[i1,-1] > 25|df1[i1,-1] < -25
df1$sum <- rowSums(df1[-1][,i2], na.rm = TRUE)

Or using dplyr

library(dplyr)
df1 %>% 
   mutate(Matt = rowSums(select(cur_data(), 
            where(~ is.numeric(.) &&
          (.[Name == 'Matt'] > 25| .[Name == 'Matt'] < -25)))))