Calculate Days Since Last Event in R

Calculate days since last event in R

You could try something like this:

# Days since the most recent earlier event, for every row of `df`.
# Assumes `df` has a Date column `date` and a 0/1 column `event`, with rows
# already in chronological order (the lookup below relies on row order).

# Running count of events up to and including each row; the +1 reserves
# slot 1 of the lookup vector below for "no event yet".
last_event_index <- cumsum(df$event) + 1

# Shift right by one so a row only sees events strictly before it.
# head(x, -1) drops the last element. The earlier `x[1:length(x) - 1]`
# parsed as `(1:length(x)) - 1` and only worked because R silently ignores
# a 0 index; this form also yields an empty index for an empty `df`.
last_event_index <- head(c(1, last_event_index), -1)

# Lookup vector: NA (no previous event) followed by the event dates in row
# order, then picked per row with the shifted index.
event_dates <- df$date[which(df$event == 1)]
last_event_date <- c(as.Date(NA), event_dates)[last_event_index]

# Subtract the date of the last event from each row's date.
df$tae <- df$date - last_event_date
df

# date event tae
#1 2000-07-06 0 NA days
#2 2000-09-15 0 NA days
#3 2000-10-15 1 NA days
#4 2001-01-03 0 80 days
#5 2001-03-17 1 153 days
#6 2001-05-23 1 67 days
#7 2001-08-26 0 95 days

R: Days since last event per ID

# Example data: two ids, five dated observations each; `event` is 0/1.
df <- data.table(
  date = as.Date(
    c(
      "06/07/2000", "15/09/2000", "15/10/2000", "03/01/2001", "17/03/2001",
      "06/08/2010", "15/09/2010", "15/10/2010", "03/01/2011", "17/03/2011"
    ),
    format = "%d/%m/%Y"
  ),
  event = c(0, 0, 1, 0, 1, 1, 0, 1, 0, 1),
  id = rep(c(1, 2), each = 5)
)

# Event rows only: gap between consecutive events of the same id.
tempdt <- df[event == 1]
tempdt[, tae := date - shift(date), by = id]

# Bring the event-to-event gaps back onto the full table (non-event rows
# get NA for now).
df <- merge(df, tempdt, by = c("date", "event", "id"), all.x = TRUE)

# Where the previous row of the same id was an event, the gap is simply the
# distance to that previous row; otherwise keep the value from the merge.
df[, tae := {
  prev_was_event <- shift(event) == 1
  gap_to_prev_row <- date - shift(date)
  ifelse(prev_was_event, gap_to_prev_row, tae)
}, by = id]

EDIT

More general solution

# Example data: two ids (1 and 5), six dated observations each.
df <- data.table(
  date = as.Date(
    c(
      "06/07/2000", "15/09/2000", "15/10/2000", "03/01/2001", "17/03/2001",
      "18/03/2001",
      "06/08/2010", "15/09/2010", "15/10/2010", "03/01/2011", "17/03/2011",
      "19/03/2011"
    ),
    format = "%d/%m/%Y"
  ),
  event = c(1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0),
  id = c(rep(1, 6), rep(5, 6))
)

## for event = 1 observations: gap to the previous event of the same id
tempdt <- df[event == 1, ]

tempdt[, tae := date - shift(date), by = id]

df <- merge(df, tempdt, by = c("date", "event", "id"), all.x = TRUE)

## for event = 0 observations: gap to the latest earlier event of the same id
for (cur_id in unique(df$id)) {
  non_event_dates <- df[id == cur_id & event == 0, date]

  # Iterate by position: looping over the dates directly strips the Date
  # class and leaves bare numbers.
  for (k in seq_along(non_event_dates)) {
    d <- non_event_dates[k]

    # Restrict the search to this id. Without the id filter, an event that
    # belongs to a different id could be picked as the "last event".
    earlier_events <- df[id == cur_id & event == 1 & date < d, date]

    # No earlier event for this id: leave tae as NA. max() on an empty
    # vector would warn and return -Inf.
    if (length(earlier_events) > 0) {
      df[
        id == cur_id & event == 0 & date == d,
        tae := d - max(earlier_events)
      ]
    }
  }
}

EDIT 2
There is probably a faster way to do this, but the version below no longer raises a warning when the first observation of an id has event = 0:

# Example data: two ids, five dated observations each; both ids contain
# non-event rows, and id 1 starts with non-event rows.
df <- data.table(
  date = as.Date(
    c(
      "06/07/2000", "15/09/2000", "15/10/2000", "03/01/2001", "17/03/2001",
      "06/08/2010", "15/09/2010", "15/10/2010", "03/01/2011", "17/03/2011"
    ),
    format = "%d/%m/%Y"
  ),
  event = c(0, 0, 1, 0, 1, 1, 0, 0, 0, 1),
  id = c(rep(1, 5), rep(2, 5))
)

# Event rows: gap to the previous event of the same id.
tempdt <- df[event == 1, ]

tempdt[, tae := date - shift(date), by = id]

df <- merge(df, tempdt, by = c("date", "event", "id"), all.x = TRUE)

# Non-event rows: gap to the latest earlier event of the same id.
for (cur_id in unique(df[, id])) {
  event_dates <- df[id == cur_id & event == 1, date]

  # An id without any event has nothing to measure against; skipping it also
  # avoids the warning min()/max() give on an empty vector.
  if (length(event_dates) == 0) {
    next
  }

  # Only non-event rows of this id that come after its first event. The id
  # filter matters: without it, dates and events of other ids leak in.
  later_non_events <- df[
    id == cur_id & event == 0 & date > min(event_dates),
    date
  ]

  # Iterate by position so each date keeps its Date class.
  for (k in seq_along(later_non_events)) {
    d <- later_non_events[k]
    df[
      id == cur_id & date == d & event == 0,
      tae := d - max(event_dates[event_dates < d])
    ]
  }
}

Calculate number of days passed since previous date within each group

Try the following. Small changes to your code make a difference.

suppressPackageStartupMessages(library(dplyr))

# Parse the dates, order rows by group and date, then within each group take
# the gap (in days) between a row and the one before it.
data %>%
  mutate(date = as.Date(date)) %>%
  arrange(group, date) %>%
  group_by(group) %>%
  mutate(
    Difference = difftime(date, lag(date), units = "days")
  )
#> # A tibble: 10 × 4
#> # Groups: group [2]
#> year group date Difference
#> <int> <chr> <date> <drtn>
#> 1 2019 A 2019-07-15 NA days
#> 2 2019 A 2019-07-25 10 days
#> 3 2019 A 2019-08-01 7 days
#> 4 2020 A 2020-08-01 366 days
#> 5 2020 A 2020-08-03 2 days
#> 6 2019 B 2019-07-15 NA days
#> 7 2019 B 2019-07-30 15 days
#> 8 2020 B 2020-08-01 368 days
#> 9 2020 B 2020-08-20 19 days
#> 10 2020 B 2020-08-25 5 days

Created on 2022-06-10 by the reprex package (v2.0.1)

R - Calculate Time Elapsed Since Last Event with Multiple Event Types

The base R version of this is to use split/lapply/rbind to generate the new column.

# Split the frame by event type, add to each piece the gap between
# consecutive dates (NA for the first row of a piece), and stack the pieces.
> do.call(
    rbind,
    lapply(split(df, df$event_type), function(grp) {
      gaps <- diff(grp$date)
      grp$dsle <- c(NA, gaps)
      grp
    })
  )
date event_type dsle
0.1 2000-07-06 0 NA
0.7 2001-05-26 0 324
1.3 2000-10-15 1 NA
1.6 2001-04-23 1 190
2.4 2001-01-03 2 NA
2.8 2001-06-01 2 149
3.9 2001-06-30 3 NA
3.10 2001-07-02 3 2
3.12 2001-12-21 3 172
4.2 2000-09-15 4 NA
4.5 2001-03-17 4 183
4.11 2001-07-15 4 120

Note that this returns the data in a different order than provided; you can re-sort by date or save the original indices if you want to preserve that order.

Above, @akrun has posted the data.table approach; the parallel dplyr approach is just as straightforward:

library(dplyr)
# Within each event type, subtract the previous row's date from the current
# row's date (NA for the first row of each type).
df %>%
  group_by(event_type) %>%
  mutate(days_since_last_event = date - lag(date))

Source: local data frame [12 x 3]
Groups: event_type [5]

         date event_type days_since_last_event
(date) (dbl) (dfft)
1 2000-07-06 0 NA days
2 2000-09-15 4 NA days
3 2000-10-15 1 NA days
4 2001-01-03 2 NA days
5 2001-03-17 4 183 days
6 2001-04-23 1 190 days
7 2001-05-26 0 324 days
8 2001-06-01 2 149 days
9 2001-06-30 3 NA days
10 2001-07-02 3 2 days
11 2001-07-15 4 120 days
12 2001-12-21 3 172 days

R - Calculate Time Elapsed Since Last Events with Multiple Event Types and IDs

The following uses the chron library to calculate the difference between dates.

library(chron)

# Convert the date column to chron dates (input read as year-month-day).
df$date <- chron(as.character(df$date), format = c(date = "y-m-d"))

# For every id, walk its rows in order and record the time elapsed since the
# most recent earlier event of type 1 and of type 2.
# Assumes rows are in chronological order within each id.
for (j in unique(df$id)) {
  # Row of the latest type-1 / type-2 event seen so far for this id;
  # NA until one has occurred.
  last_row_1 <- NA_integer_
  last_row_2 <- NA_integer_

  # Exact match on id. grep() treats j as a pattern, so id 1 would also
  # select the rows of ids 10, 11, 21, ...
  rows_with_id <- which(df$id == j)

  for (i in rows_with_id) {
    # Indexing with NA gives NA, so rows before the first event stay NA.
    # Pointing at the event's row (instead of counting rows back from i)
    # stays correct even when the rows of an id are not contiguous.
    df$days_since_event_1[i] <- df$date[i] - df$date[last_row_1]
    df$days_since_event_2[i] <- df$date[i] - df$date[last_row_2]

    # Update after recording, so an event row measures back to the
    # previous event rather than to itself.
    if (df$event[i] == 1) {
      last_row_1 <- i
    }
    if (df$event[i] == 2) {
      last_row_2 <- i
    }
  }
}

This code gives the following results

> df
date event id days_since_event_1 days_since_event_2
1 00-07-06 2 1 NA NA
2 00-07-07 1 1 NA 1
3 00-07-09 0 1 2 3
4 00-07-10 0 1 3 4
5 00-07-15 2 1 8 9
6 00-07-16 1 1 9 1
7 00-07-20 0 1 4 5
8 00-07-21 1 1 5 6
9 00-07-06 1 2 NA NA
10 00-07-07 2 2 1 NA
11 00-07-15 0 2 9 8
12 00-07-16 0 2 10 9
13 00-07-17 2 2 11 10
14 00-07-18 1 2 12 1

To address your comment, you can do the following in base R to get the number of observations rather than days. No libraries are needed.

# For every id, walk its rows in order and record how many observations have
# passed since the most recent earlier event of type 1 and of type 2.
for (j in unique(df$id)) {
  # Counters stay NA until the first event of the respective type is seen
  # (NA + 1 is still NA).
  obs_since_1 <- NA
  obs_since_2 <- NA

  # Exact match on id. grep() treats j as a pattern, so id 1 would also
  # select the rows of ids 10, 11, 21, ...
  rows_with_id <- which(df$id == j)

  for (i in rows_with_id) {
    # Record first, update afterwards: an event row reports the distance to
    # the previous event, not 0.
    df$Obs_since_event_1[i] <- obs_since_1
    df$Obs_since_event_2[i] <- obs_since_2

    obs_since_1 <- if (df$event[i] == 1) 1 else obs_since_1 + 1
    obs_since_2 <- if (df$event[i] == 2) 1 else obs_since_2 + 1
  }
}

You should get the following output

> df
date event id Obs_since_event_1 Obs_since_event_2
1 2000-07-06 2 1 NA NA
2 2000-07-07 1 1 NA 1
3 2000-07-09 0 1 1 2
4 2000-07-10 0 1 2 3
5 2000-07-15 2 1 3 4
6 2000-07-16 1 1 4 1
7 2000-07-20 0 1 1 2
8 2000-07-21 1 1 2 3
9 2000-07-06 1 2 NA NA
10 2000-07-07 2 2 1 NA
11 2000-07-15 0 2 2 1
12 2000-07-16 0 2 3 2
13 2000-07-17 2 2 4 3
14 2000-07-18 1 2 5 1

Using dplyr::lag to calculate days since first event

The first n values of lag() get a default value, because you don't have 'older' data. The default value is NA. Hence the NA in your results.

Furthermore, using lag will only yield the difference between consecutive events.

How to get time difference in days since last date (lag) in R using datatable?

Specify the units

library(data.table)
# Gap in days to the same person's previous diagnosis, taken in date order.
# shift() is data.table's own lag. A bare lag() resolves to stats::lag()
# unless dplyr is attached, and stats::lag() leaves the values where they are,
# so every difference would come out as 0.
dt[
  order(diag_date),
  diff_prev_event := difftime(diag_date, shift(diag_date), units = "days"),
  by = "person_id"
]

Then, we group by 'person_id' and 'diag_date' and change the values to the max if there is more than one row

# Rows sharing the same person and date all receive the largest gap found in
# that group; groups with a single row are left as they are.
dt[
  ,
  diff_prev_event := {
    if (.N > 1) max(diff_prev_event, na.rm = TRUE) else diff_prev_event
  },
  by = .(person_id, diag_date)
]
> dt
person_id diag_date concept_id event diff_prev_event
<int> <Date> <int> <char> <difftime>
1: 1 2012-01-15 4265600 comorb 0 days
2: 1 2012-01-15 201820 comorb 0 days
3: 1 2012-03-15 4265600 comorb 60 days
4: 2 2012-03-15 201820 comorb NA days
5: 2 2012-06-22 201820 comorb 99 days
6: 2 2012-06-22 4265600 comorb 99 days

-output

data

# Example input: six diagnosis records for two persons.
# Built with data.table() instead of a raw structure() dump: an object that
# is merely tagged with class "data.table" has no internal self-reference,
# so the first `:=` on it warns about an invalid .internal.selfref.
dt <- data.table::data.table(
  person_id = c(1L, 1L, 1L, 2L, 2L, 2L),
  diag_date = as.Date(c(
    "2012-01-15", "2012-01-15", "2012-03-15",
    "2012-03-15", "2012-06-22", "2012-06-22"
  )),
  concept_id = c(4265600L, 201820L, 4265600L, 201820L, 201820L, 4265600L),
  event = rep("comorb", 6)
)

In R, is there a way to calculate the number of days between the end of one event and the beginning of another?

Based on the data you're expecting, it seems like you're needing to make use of the group_by() function. This should get you what you're looking for.

# Example stays, entered column by column; dates are still plain
# month/day/year strings at this point.
hotelData <- tibble::tibble(
  custID = c("AAAAA", "BBBBB", "AAAAA", "AAAAA", "BBBBB", "CCCCC", "AAAAA"),
  stayID = c(11111, 11112, 11113, 11114, 11115, 11116, 11117),
  stayDt = c(
    "01/15/1995", "02/08/1995", "03/01/1995", "06/24/1995",
    "10/02/1995", "01/08/1996", "05/15/1996"
  ),
  checkInDt = c(
    "01/10/1995", "02/02/1995", "03/01/1995", "06/22/1995",
    "10/01/1995", "01/05/1996", "05/10/1996"
  ),
  checkOutDt = c(
    "01/17/1995", "02/25/1995", "03/03/1995", "07/02/1995",
    "10/10/1995", "01/17/1996", "05/28/1996"
  )
)

# Parse the three date columns (month/day/year strings) into Date values,
# then order the rows by customer and, within customer, by stay ID.
hotelData <- arrange(
  mutate(hotelData, across(stayDt:checkOutDt, lubridate::mdy)),
  custID,
  stayID
)

# Per customer: days between this check-in and the previous stay's check-out
# (NA for a customer's first stay).
hotelData %>%
  group_by(custID) %>%
  mutate(
    daysSinceLastStay = as.numeric(checkInDt - lag(checkOutDt))
  )

# A tibble: 7 x 6
# Groups: custID [3]
custID stayID stayDt checkInDt checkOutDt daysSinceLastStay
<chr> <dbl> <date> <date> <date> <dbl>
1 AAAAA 11111 1995-01-15 1995-01-10 1995-01-17 NA
2 AAAAA 11113 1995-03-01 1995-03-01 1995-03-03 43
3 AAAAA 11114 1995-06-24 1995-06-22 1995-07-02 111
4 AAAAA 11117 1996-05-15 1996-05-10 1996-05-28 313
5 BBBBB 11112 1995-02-08 1995-02-02 1995-02-25 NA
6 BBBBB 11115 1995-10-02 1995-10-01 1995-10-10 218
7 CCCCC 11116 1996-01-08 1996-01-05 1996-01-17 NA

Calculate the number of days since the last purchase per user ID in R

We can group by 'USERID' and take the difftime between the current and previous values of the 'date' column after converting it to Datetime class

library(lubridate)
library(dplyr)
# Whole days since the same user's previous purchase (0 for a user's first
# purchase, because the lag defaults to the user's own first date).
df1 %>%
  mutate(date = mdy_hm(date)) %>% # convert to Datetime class
  group_by(USERID) %>% # one group per user
  mutate(
    numberofdays = as.integer(
      # Argument name and value spelled out in full; `unit = 'day'` only
      # worked through partial matching. as.integer() drops the fraction.
      difftime(date, lag(date, default = first(date)), units = "days")
    )
  )
# A tibble: 8 x 5
# Groups: USERID [3]
# ID date USERID SALES numberofdays
# <int> <dttm> <dbl> <dbl> <int>
#1 1 2018-11-19 10:36:00 500 1000 0
#2 2 2018-11-19 10:41:00 520 1450 0
#3 3 2018-11-23 10:59:00 500 1390 4
#4 4 2018-11-23 11:12:00 530 1778 0
#5 5 2018-11-29 11:52:00 530 1966 6
#6 6 2018-12-05 12:23:00 520 1100 16
#7 7 2018-12-19 12:24:00 520 700 14
#8 8 2018-12-25 21:24:00 520 900 6

data

# Example purchases: eight rows, dates kept as "month/day/year hour:minute"
# strings (stringsAsFactors = FALSE keeps them character on every R version).
df1 <- data.frame(
  ID = 1:8,
  date = c(
    "11/19/2018 10:36", "11/19/2018 10:41", "11/23/2018 10:59",
    "11/23/2018 11:12", "11/29/2018 11:52", "12/5/2018 12:23",
    "12/19/2018 12:24", "12/25/2018 21:24"
  ),
  USERID = c(500, 520, 500, 530, 530, 520, 520, 520),
  SALES = c(1000, 1450, 1390, 1778, 1966, 1100, 700, 900),
  stringsAsFactors = FALSE
)


Related Topics



Leave a reply



Submit