Creating a function to run a conditional Sum in R
Using tidyverse
/ collapse
For arbitrary number of lead and lags the collapse
package offers a nice function flag
, which has further arguments to specify columns (cols
), or grouping variables g
.
library(dplyr)
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
transmute(Ones, Thats, gap = rowSums(., na.rm = T) - 1) %>%
filter(Ones == 1)
}
x <- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
# we can now specify how many lags to count:
f(x, 1)
Ones Thats gap
1 1 0 5
2 1 4 17
3 1 1 4
f(x, 2)
Ones Thats gap
1 1 0 8
2 1 4 29
3 1 1 16
Or if you want to specify the number of gaps to compute, we can simplify the function to
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
x %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2)) %>%
filter(Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
2 1 4 17 29
3 1 1 4 16
Base R
If you like terse functions:
f <- Vectorize(\(df, n) rowSums(collapse::flag(df, -n:n), na.rm = T) - 1, "n")
x[paste0("gap", 1:2)] <- f(x, 1:2) ; subset(x, Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
6 1 4 17 29
11 1 1 4 16
conditional sum of columns in R
df$solution <- rowSums(df[-1] * (df[,-1]>=df[,1]), na.rm = TRUE)
df
key A B C solution
1 0.5 0.7 0.7 0.1 1.4
2 0.8 0.6 0.8 NA 0.8
3 0.2 NA 0.9 NA 0.9
R: Conditional summing in R
One option is to extract the row where 'Name' is 'Matt', without the first column create a logical vector ('i1'), use that to subset the columns and get the rowSums
i1 <- df1[df1$Name == "Matt",-1] > 25
df1$Matt <- rowSums(df1[-1][,i1], na.rm = TRUE)
Or using tidyverse
library(dplyr)
df1 %>%
mutate(Matt = rowSums(select(cur_data(),
where(~ is.numeric(.) && .[Name == 'Matt'] > 25))))
-output
# Name score score.1 score.2 score.3 score.4 Matt
#1 Alex 31 15 18 22 23 68
#2 Pat 37 18 29 15 28 70
#3 Matt 33 27 18 88 9 148
#4 James 12 36 32 13 21 61
data
df1 <- structure(list(Name = c("Alex", "Pat", "Matt", "James"), score = c(31L,
37L, 33L, 12L), score.1 = c(15L, 18L, 27L, 36L), score.2 = c(18L,
29L, 18L, 32L), score.3 = c(22L, 15L, 88L, 13L), score.4 = c(23L,
28L, 9L, 21L)), class = "data.frame", row.names = c(NA, -4L))
How do I do a conditional sum which only looks between certain date criteria
Here's a dplyr
solution which will produce the desired result (14 rows) as specified in the question. Note that it takes care of duplicate date entries, for example, 2013-01-04 for user x.
# define a custom function to be used in the dplyr chain
myfunc <- function(x){
with(x, sapply(event_number, function(y)
sum(items_bought[event_number <= event_number[y] & date[y] - date <= 2])))
}
require(dplyr) #install and load into your library
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(.))) %>%
select(-c(items_bought, event_number))
# date user cum_items_bought_3_days
#1 2013-01-01 x 2
#2 2013-01-02 x 3
#3 2013-01-03 x 3
#4 2013-01-04 x 1
#5 2013-01-04 x 2
#6 2013-01-04 x 4
#7 2013-01-05 x 6
#8 2013-01-06 x 7
#9 2013-01-01 y 1
#10 2013-01-02 y 2
#11 2013-01-03 y 2
#12 2013-01-04 y 6
#13 2013-01-05 y 11
#14 2013-01-06 y 12
In my answer I use a custom function myfunc
inside a dplyr
chain. This is done using the do
operator from dplyr
. The custom function is passed the subsetted df by user
groups. It then uses sapply
to pass each event_number
and calculate the sums of items_bought
. The last line of the dplyr
chain deselects the undesired columns.
Let me know if you'd like a more detailed explanation.
Edit after comment by OP:
If you need more flexibility to also conditionally sum up other columns, you can adjust the code as follows. I assume here, that the other columns should be summed up the same way as items_bought
. If that is not correct, please specify how you want to sum up the other columns.
I first create two additional columns with random numbers in the data (I'll post a dput
of the data at the bottom of my answer):
set.seed(99) # for reproducibility only
df$newCol1 <- sample(0:10, 14, replace=T)
df$newCol2 <- runif(14)
df
# date user items_bought event_number newCol1 newCol2
#1 2013-01-01 x 2 1 6 0.687800094
#2 2013-01-02 x 1 2 1 0.640190769
#3 2013-01-03 x 0 3 7 0.357885360
#4 2013-01-04 x 0 4 10 0.102584999
#5 2013-01-04 x 1 5 5 0.097790922
#6 2013-01-04 x 2 6 10 0.182886256
#7 2013-01-05 x 3 7 7 0.227903474
#8 2013-01-06 x 1 8 3 0.080524150
#9 2013-01-01 y 1 1 3 0.821618422
#10 2013-01-02 y 1 2 1 0.591113977
#11 2013-01-03 y 0 3 6 0.773389019
#12 2013-01-04 y 5 4 5 0.350085977
#13 2013-01-05 y 6 5 2 0.006061323
#14 2013-01-06 y 1 6 7 0.814506223
Next, you can modify myfunc
to take 2 arguments, instead of 1. The first argument will remain the subsetted data.frame as before (represented by .
inside the dplyr chain and x
in the function definition of myfunc
), while the second argument to myfunc
will specify the column to sum up (colname
).
myfunc <- function(x, colname){
with(x, sapply(event_number, function(y)
sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}
Then, you can use myfunc
several times if you want to conditionally sum up several columns:
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
newCol1Sums = myfunc(., "newCol1"),
newCol2Sums = myfunc(., "newCol2"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))
# date user cum_items_bought_3_days newCol1Sums newCol2Sums
#1 2013-01-01 x 2 6 0.6878001
#2 2013-01-02 x 3 7 1.3279909
#3 2013-01-03 x 3 14 1.6858762
#4 2013-01-04 x 1 18 1.1006611
#5 2013-01-04 x 2 23 1.1984520
#6 2013-01-04 x 4 33 1.3813383
#7 2013-01-05 x 6 39 0.9690510
#8 2013-01-06 x 7 35 0.6916898
#9 2013-01-01 y 1 3 0.8216184
#10 2013-01-02 y 2 4 1.4127324
#11 2013-01-03 y 2 10 2.1861214
#12 2013-01-04 y 6 12 1.7145890
#13 2013-01-05 y 11 13 1.1295363
#14 2013-01-06 y 12 14 1.1706535
Now you created conditional sums of the columns items_bought
, newCol1
and newCol2
. You can also leave out any of the sums in the dplyr chain or add more columns to sum up.
Edit #2 after comment by OP:
To calculate the cumulative sum of distinct (unique) items bought per user, you could define a second custom function myfunc2
and use it inside the dplyr chain. This function is also flexible as myfunc
so that you can define the columns to which you want to apply the function.
The code would then be:
myfunc <- function(x, colname){
with(x, sapply(event_number, function(y)
sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}
myfunc2 <- function(x, colname){
cumsum(sapply(seq_along(x[[colname]]), function(y)
ifelse(!y == 1 & x[y, colname] %in% x[1:(y-1), colname], 0, 1)))
}
require(dplyr) #install and load into your library
dd %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
newCol1Sums = myfunc(., "newCol1"),
newCol2Sums = myfunc(., "newCol2"),
distinct_items_bought = myfunc2(., "items_bought"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))
Here is the data I used:
dput(df)
structure(list(date = structure(c(1L, 2L, 3L, 4L, 4L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("2013-01-01", "2013-01-02",
"2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"), class = "factor"),
user = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"),
items_bought = c(2L, 1L, 0L, 0L, 1L, 2L, 3L, 1L, 1L, 1L,
0L, 5L, 6L, 1L), event_number = c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L), newCol1 = c(6L, 1L, 7L,
10L, 5L, 10L, 7L, 3L, 3L, 1L, 6L, 5L, 2L, 7L), newCol2 = c(0.687800094485283,
0.640190769452602, 0.357885359786451, 0.10258499882184, 0.0977909218054265,
0.182886255905032, 0.227903473889455, 0.0805241498164833,
0.821618422167376, 0.591113976901397, 0.773389018839225,
0.350085976999253, 0.00606132275424898, 0.814506222726777
)), .Names = c("date", "user", "items_bought", "event_number",
"newCol1", "newCol2"), row.names = c(NA, -14L), class = "data.frame")
Conditional sum data in R
We create a logical vector based on the 'Name' column ('i1'), then use the OR (|
condition on the value 25 and -25 with relational operators (>
or <
respectively) to create a logical index for the columns. Subset the dataset based on the 'i2', and return the rowSums
of those columns and assign it to 'sum' column
i1 <-df1$Name == "Matt"
i2 <- df1[i1,-1] > 25|df1[i1,-1] < -25
df1$sum <- rowSums(df1[-1][,i2], na.rm = TRUE)
Or using dplyr
library(dplyr)
df1 %>%
mutate(Matt = rowSums(select(cur_data(),
where(~ is.numeric(.) &&
(.[Name == 'Matt'] > 25| .[Name == 'Matt'] < -25)))))
-output
# Name score score.1 score.2 score.3 score.4 score.5 score.6 Matt
#1 Alex 31 15 18 22 23 23 23 91
#2 Pat 37 18 29 15 28 28 -28 42
#3 Matt 33 27 18 88 9 -19 -29 119
#4 James 12 -36 32 13 21 21 21 10
Conditional sum in R w NA values?
library(data.table)
# dummy data
df <- data.table(id = rep(c(1,2), times=c(4,3))
, assay_date = c('20mar2021', '06jun2021', '24sep2021', '19nov2021', '29apr2021', '23may2021', '15jun2021')
, dose_3_date = rep(c('22feb2021', '02apr2021'), times=c(4,3))
, dose_4_date = c(rep(c('17aug2021', NA), times=c(4,3)))
); df
# set as data.table if yours isn't one already
setDT(df)
# as.Date
x <- c("assay_date", "dose_3_date", "dose_4_date")
df[, (x) := lapply(.SD, \(i) as.Date(i, format="%d%b%Y")), .SDcols=x
][, date_diff := assay_date - dose_3_date # calculate date diff
]
# flag rows which fit criteria
df[date_diff %between% c(14, 45)
& (assay_date <= dose_4_date
| is.na(dose_4_date)
)
, fits_criteria := 1
]
# count per patient
df[, .(assays_in_period = sum(fits_criteria, na.rm=T)), id]
id assays_in_period
1: 1 1
2: 2 1
how to sum conditional functions to grouped rows in R
An option using dplyr
library(dplyr)
df %>%
group_by(customerid) %>%
summarise(
effectivity = sum(
charges[payment_date <= 21 & payment_month == bill_month]) / sum(charges) * 100,
.groups = "drop")
## A tibble: 3 x 2
#customerid effectivity
# <int> <dbl>
#1 1 59.9
#2 2 100
#3 3 37.5
Related Topics
Car::Scatter3D in R - Labeling Axis Better
How to Automatically Load Data in an R Package
R: Compare All the Columns Pairwise in Matrix
R: Adding a "Tool Tip" to Interactive Plot (Plotly)
Add a Dynamic Value into Rmysql Getquery
Tidyr::Pivot_Wider() Reorder Column Names Grouping by 'Name_From'
How to Clear an Na Flag for a Posix Value
R - Unable to Install R Packages - Cannot Open the Connection
Arranging Rows in Custom Order Using Dplyr
Adding Multiple Lag Variables Using Dplyr and for Loops
How to See All Rows of a Data Frame in a Jupyter Notebook with an R Kernel
Expression + Variable Value + Normal Text in Plot Maintitle
Change the Order of Stacked Fill Columns in Ggplot2
Append Multiple CSV Files into One File Using R
Create a Reactive Function Outside the Shiny App
R Windows Os Choose.Dir() File Chooser Won't Open at Working Directory