Calculate days since last event in R
You could try something like this:
# Index of the most recent event seen so far (1 = "no event yet").
last_event_index <- cumsum(df$event) + 1
# Shift it one position to the right so each row refers to events strictly
# before it. head(x, -1) drops the last element; the original
# `x[1:length(x) - 1]` parses as `x[(1:n) - 1]` and only worked because
# index 0 is silently ignored.
last_event_index <- c(1, head(last_event_index, -1))
# Dates of the events, with a leading NA standing for "no earlier event";
# indexing by last_event_index gives each row the date of its last event.
# Using df$date[...] keeps this a plain Date vector for data.frames and tibbles.
event_dates <- df$date[which(df$event == 1)]
last_event_date <- c(as.Date(NA), event_dates)[last_event_index]
# Subtract the date of the last event from each row's date.
df$tae <- df$date - last_event_date
df
# date event tae
#1 2000-07-06 0 NA days
#2 2000-09-15 0 NA days
#3 2000-10-15 1 NA days
#4 2001-01-03 0 80 days
#5 2001-03-17 1 153 days
#6 2001-05-23 1 67 days
#7 2001-08-26 0 95 days
R: Days since last event per ID
# Example data: ten dated observations, five for id 1 and five for id 2,
# each flagged with event = 0/1.
df <- data.table(date=as.Date(c("06/07/2000","15/09/2000","15/10/2000","03/01/2001","17/03/2001","06/08/2010","15/09/2010","15/10/2010","03/01/2011","17/03/2011"),
"%d/%m/%Y"), event=c(0,0,1,0,1, 1,0,1,0,1),id = c(rep(1,5),rep(2,5)))
# Keep only the event rows and, within each id, take the gap to the previous
# event row (the first event of an id has nothing to subtract and gets NA).
tempdt <- df[event==1,]
tempdt[,tae := date - shift(date), by = id]
# Left join back onto all rows; non-event rows get tae = NA.
df <- merge(df, tempdt, by = c("date", "event", "id"), all.x = TRUE)
# For rows whose immediately preceding row (within id) is an event, replace tae
# with the gap to that preceding row; all other rows keep their current tae.
# NOTE(review): non-event rows that do not directly follow an event stay NA.
df[, tae := ifelse(shift(event)==1, date - shift(date), tae), by = id]
EDIT
More general solution
# Example data: two ids (1 and 5) with six dated observations each.
df <- data.table(
  date = as.Date(
    c(
      "06/07/2000", "15/09/2000", "15/10/2000", "03/01/2001", "17/03/2001",
      "18/03/2001",
      "06/08/2010", "15/09/2010", "15/10/2010", "03/01/2011", "17/03/2011",
      "19/03/2011"
    ),
    "%d/%m/%Y"
  ),
  event = c(1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0),
  id = c(rep(1, 6), rep(5, 6))
)

## event == 1 rows: days since the previous event of the same id
tempdt <- df[event == 1, ]
tempdt[, tae := date - shift(date), by = id]
df <- merge(df, tempdt, by = c("date", "event", "id"), all.x = TRUE)

## event == 0 rows: days since the latest earlier event of the SAME id.
## The lookup is restricted to the current id; without that restriction an
## event belonging to another id with an overlapping date range would be
## picked up as the "last event".
for (cur_id in unique(df$id)) {
  event_dates <- df[id == cur_id & event == 1, date]
  # Looping over a Date vector yields bare day numbers, hence as.Date() below.
  for (d in df[id == cur_id & event == 0, date]) {
    # If the id has no event before d, max() of an empty set warns and
    # returns -Inf, exactly as in the unscoped version.
    df[
      id == cur_id & date == d & event == 0,
      tae := as.Date(d, origin = "1970-01-01") -
        max(event_dates[event_dates < d])
    ]
  }
}
EDIT 2
There must be a faster way to do this, but this version does not raise
a warning when the first observation of an id has event = 0.
# Example data: two ids with five dated observations each; id 1 starts with
# two non-event rows.
df <- data.table(
  date = as.Date(
    c(
      "06/07/2000", "15/09/2000", "15/10/2000", "03/01/2001", "17/03/2001",
      "06/08/2010", "15/09/2010", "15/10/2010", "03/01/2011", "17/03/2011"
    ),
    "%d/%m/%Y"
  ),
  event = c(0, 0, 1, 0, 1, 1, 0, 0, 0, 1),
  id = c(rep(1, 5), rep(2, 5))
)

# Event rows: days since the previous event of the same id (NA for the first).
tempdt <- df[event == 1, ]
tempdt[, tae := date - shift(date), by = id]
df <- merge(df, tempdt, by = c("date", "event", "id"), all.x = TRUE)

# Non-event rows: days since the latest earlier event of the same id.
# Every lookup is restricted to the current id so that ids with overlapping
# date ranges cannot borrow each other's events.
for (cur_id in unique(df$id)) {
  event_dates <- df[id == cur_id & event == 1, date]
  # An id without any event has nothing to measure from; its rows stay NA.
  if (length(event_dates) == 0L) {
    next
  }
  first_event <- min(event_dates)
  # Only rows after the id's first event are visited, so max() below never
  # sees an empty set. Looping over Dates yields bare day numbers, hence
  # the as.Date() conversion.
  for (d in df[id == cur_id & event == 0 & date > first_event, date]) {
    df[
      id == cur_id & date == d & event == 0,
      tae := as.Date(d, origin = "1970-01-01") -
        max(event_dates[event_dates < d])
    ]
  }
}
Calculate number of days passed since previous date within each group
Try the following. Small changes to your code make a difference.
suppressPackageStartupMessages(library(dplyr))
# Per group, days elapsed since the previous (earlier) date; the first row of
# each group has no predecessor and is NA.
data %>%
  mutate(date = as.Date(date)) %>%
  group_by(group) %>%
  # sort by the grouping column first, then by date
  arrange(date, .by_group = TRUE) %>%
  mutate(Difference = difftime(date, lag(date), units = "days"))
#> # A tibble: 10 × 4
#> # Groups: group [2]
#> year group date Difference
#> <int> <chr> <date> <drtn>
#> 1 2019 A 2019-07-15 NA days
#> 2 2019 A 2019-07-25 10 days
#> 3 2019 A 2019-08-01 7 days
#> 4 2020 A 2020-08-01 366 days
#> 5 2020 A 2020-08-03 2 days
#> 6 2019 B 2019-07-15 NA days
#> 7 2019 B 2019-07-30 15 days
#> 8 2020 B 2020-08-01 368 days
#> 9 2020 B 2020-08-20 19 days
#> 10 2020 B 2020-08-25 5 days
Created on 2022-06-10 by the reprex package (v2.0.1)
R - Calculate Time Elapsed Since Last Event with Multiple Event Types
The base R version of this is to use split/lapply/rbind to generate the new column.
# Split df into one data frame per event_type, add a column `dsle` holding the
# gap between consecutive dates within each piece (NA for the first row of a
# piece), then stack the pieces back together with rbind.
# NOTE(review): assumes rows within each event_type are already in date order.
> do.call(rbind,
lapply(
split(df, df$event_type),
function(d) {
# diff() yields one value fewer than there are rows, so NA is prepended
d$dsle <- c(NA, diff(d$date)); d
}
)
)
date event_type dsle
0.1 2000-07-06 0 NA
0.7 2001-05-26 0 324
1.3 2000-10-15 1 NA
1.6 2001-04-23 1 190
2.4 2001-01-03 2 NA
2.8 2001-06-01 2 149
3.9 2001-06-30 3 NA
3.10 2001-07-02 3 2
3.12 2001-12-21 3 172
4.2 2000-09-15 4 NA
4.5 2001-03-17 4 183
4.11 2001-07-15 4 120
Note that this returns the data in a different order than provided; you can re-sort by date or save the original indices if you want to preserve that order.
Above, @akrun has posted the data.table
approach; the parallel dplyr
approach is straightforward as well:
library(dplyr)
# Within each event_type, subtract the previous row's date from the current one.
df %>%
  group_by(event_type) %>%
  mutate(days_since_last_event = date - lag(date, n = 1))
Source: local data frame [12 x 3]
Groups: event_type [5]
date event_type days_since_last_event
(date) (dbl) (dfft)
1 2000-07-06 0 NA days
2 2000-09-15 4 NA days
3 2000-10-15 1 NA days
4 2001-01-03 2 NA days
5 2001-03-17 4 183 days
6 2001-04-23 1 190 days
7 2001-05-26 0 324 days
8 2001-06-01 2 149 days
9 2001-06-30 3 NA days
10 2001-07-02 3 2 days
11 2001-07-15 4 120 days
12 2001-12-21 3 172 days
R - Calculate Time Elapsed Since Last Events with Multiple Event Types and IDs
The following uses the chron
package to calculate the difference between the dates
library(chron)
# Convert the date column to chron dates so that subtraction yields day counts.
df$date <- chron(as.character(df$date), format = c(date = "y-m-d"))

# Start from NA: rows seen before the first event of a type stay NA.
df$days_since_event_1 <- NA_real_
df$days_since_event_2 <- NA_real_

for (j in unique(df$id)) {
  # Row index of the most recent event of each type for this id (NA = none yet).
  last_row_1 <- NA_integer_
  last_row_2 <- NA_integer_
  # Exact comparison: grep(j, df$id) is a regex match, so id 1 would also
  # select the rows of ids 10, 11, 21, ...
  rows_with_id <- which(df$id == j)
  for (i in rows_with_id) {
    # Indexing with NA gives NA, so the result is NA until an event was seen.
    # Remembering the event's row index (rather than a row offset) keeps this
    # correct even when the rows of one id are not contiguous.
    df$days_since_event_1[i] <- as.numeric(df$date[i] - df$date[last_row_1])
    df$days_since_event_2[i] <- as.numeric(df$date[i] - df$date[last_row_2])
    # Update after recording, so an event row reports the gap to the
    # previous event of its own type rather than 0.
    if (df$event[i] == 1) {
      last_row_1 <- i
    }
    if (df$event[i] == 2) {
      last_row_2 <- i
    }
  }
}
This code gives the following results
> df
date event id days_since_event_1 days_since_event_2
1 00-07-06 2 1 NA NA
2 00-07-07 1 1 NA 1
3 00-07-09 0 1 2 3
4 00-07-10 0 1 3 4
5 00-07-15 2 1 8 9
6 00-07-16 1 1 9 1
7 00-07-20 0 1 4 5
8 00-07-21 1 1 5 6
9 00-07-06 1 2 NA NA
10 00-07-07 2 2 1 NA
11 00-07-15 0 2 9 8
12 00-07-16 0 2 10 9
13 00-07-17 2 2 11 10
14 00-07-18 1 2 12 1
To address your comment, you can do the following in base R
to get the number of observations rather than days. No libraries are needed.
# Number of observations (rows) since the last event of type 1 / type 2,
# counted separately within each id. Rows before the first event stay NA.
df$Obs_since_event_1 <- NA_real_
df$Obs_since_event_2 <- NA_real_

for (j in unique(df$id)) {
  obs_since_1 <- NA
  obs_since_2 <- NA
  # Exact comparison: grep(j, df$id) is a regex match, so id 1 would also
  # select the rows of ids 10, 11, 21, ...
  rows_with_id <- which(df$id == j)
  for (i in rows_with_id) {
    # Record the counters before updating them, so an event row reports the
    # distance to the previous event of its own type.
    df$Obs_since_event_1[i] <- obs_since_1
    df$Obs_since_event_2[i] <- obs_since_2
    # An event resets its counter to 1; otherwise the counter grows by one
    # (NA + 1 stays NA until the first event has been seen).
    if (df$event[i] == 1) {
      obs_since_1 <- 1
    } else {
      obs_since_1 <- obs_since_1 + 1
    }
    if (df$event[i] == 2) {
      obs_since_2 <- 1
    } else {
      obs_since_2 <- obs_since_2 + 1
    }
  }
}
You should get the following output
> df
date event id Obs_since_event_1 Obs_since_event_2
1 2000-07-06 2 1 NA NA
2 2000-07-07 1 1 NA 1
3 2000-07-09 0 1 1 2
4 2000-07-10 0 1 2 3
5 2000-07-15 2 1 3 4
6 2000-07-16 1 1 4 1
7 2000-07-20 0 1 1 2
8 2000-07-21 1 1 2 3
9 2000-07-06 1 2 NA NA
10 2000-07-07 2 2 1 NA
11 2000-07-15 0 2 2 1
12 2000-07-16 0 2 3 2
13 2000-07-17 2 2 4 3
14 2000-07-18 1 2 5 1
Using dplyr::lag to calculate days since first event
The first n values of lag()
get a default value, because you don't have 'older' data. The default value is NA
. Hence the NA
in your results.
Furthermore, using lag will only yield the difference between consecutive events.
How to get time difference in days since last date (lag) in R using datatable?
Specify the units
library(data.table)
# Per person, days between each diagnosis date and the previous one, taken in
# date order. shift() is data.table's lag; with only data.table attached,
# lag() resolves to stats::lag(), which does not shift the values.
dt[
  order(diag_date),
  diff_prev_event := difftime(diag_date, shift(diag_date), units = "days"),
  by = "person_id"
]
Then, we group by 'person_id' and 'diag_date' and change the values to the max
if there is more than one row
# Rows sharing the same person_id and diag_date all receive the largest
# diff_prev_event of that group (NAs ignored); groups with a single row are
# left unchanged.
dt[, diff_prev_event := if(.N > 1) max(diff_prev_event,
na.rm = TRUE) else diff_prev_event, .(person_id, diag_date)]
> dt
person_id diag_date concept_id event diff_prev_event
<int> <Date> <int> <char> <difftime>
1: 1 2012-01-15 4265600 comorb 0 days
2: 1 2012-01-15 201820 comorb 0 days
3: 1 2012-03-15 4265600 comorb 60 days
4: 2 2012-03-15 201820 comorb NA days
5: 2 2012-06-22 201820 comorb 99 days
6: 2 2012-06-22 4265600 comorb 99 days
-output
data
# Example input: six rows for two persons with a Date column `diag_date`
# (stored as day numbers), an integer `concept_id` and a character `event`.
dt <- structure(list(person_id = c(1L, 1L, 1L, 2L, 2L, 2L), diag_date = structure(c(15354,
15354, 15414, 15414, 15513, 15513), class = "Date"), concept_id = c(4265600L,
201820L, 4265600L, 201820L, 201820L, 4265600L), event = c("comorb",
"comorb", "comorb", "comorb", "comorb", "comorb")), row.names = c(NA,
-6L), class = c("data.table", "data.frame"))
In R, is there a way to calculate the number of days between the end of one event and the beginning of another?
Based on the data you're expecting, it seems like you're needing to make use of the group_by()
function. This should get you what you're looking for.
# The pipeline below uses %>%, mutate(), across(), arrange(), group_by() and
# lag() unqualified, so dplyr has to be attached for the snippet to run.
library(dplyr)

# t*r*ibble, for creating data by row
hotelData <- tibble::tribble(
  ~custID, ~stayID, ~stayDt, ~checkInDt, ~checkOutDt,
  "AAAAA", 11111, "01/15/1995", "01/10/1995", "01/17/1995",
  "BBBBB", 11112, "02/08/1995", "02/02/1995", "02/25/1995",
  "AAAAA", 11113, "03/01/1995", "03/01/1995", "03/03/1995",
  "AAAAA", 11114, "06/24/1995", "06/22/1995", "07/02/1995",
  "BBBBB", 11115, "10/02/1995", "10/01/1995", "10/10/1995",
  "CCCCC", 11116, "01/08/1996", "01/05/1996", "01/17/1996",
  "AAAAA", 11117, "05/15/1996", "05/10/1996", "05/28/1996"
)

# convert the date columns to the proper data type
# then, sort the data by customer ID and stayID
hotelData <- hotelData %>%
  mutate(across(stayDt:checkOutDt, lubridate::mdy)) %>%
  arrange(custID, stayID)

# within each customer, days between the previous check-out and this check-in;
# a customer's first stay has no previous check-out and gets NA.
# The units are stated explicitly so the number is always in days.
hotelData %>%
  group_by(custID) %>%
  mutate(
    daysSinceLastStay = as.numeric(
      difftime(checkInDt, lag(checkOutDt), units = "days")
    )
  )
# A tibble: 7 x 6
# Groups: custID [3]
custID stayID stayDt checkInDt checkOutDt daysSinceLastStay
<chr> <dbl> <date> <date> <date> <dbl>
1 AAAAA 11111 1995-01-15 1995-01-10 1995-01-17 NA
2 AAAAA 11113 1995-03-01 1995-03-01 1995-03-03 43
3 AAAAA 11114 1995-06-24 1995-06-22 1995-07-02 111
4 AAAAA 11117 1996-05-15 1996-05-10 1996-05-28 313
5 BBBBB 11112 1995-02-08 1995-02-02 1995-02-25 NA
6 BBBBB 11115 1995-10-02 1995-10-01 1995-10-10 218
7 CCCCC 11116 1996-01-08 1996-01-05 1996-01-17 NA
Calculate the number of days since the last purchase per user ID in R
We can group by 'USERID' and get the difftime
of the current and past 'Datetime' converted 'date' column
library(lubridate)
library(dplyr)
# Whole days since the same user's previous purchase (0 for a user's first
# purchase, because the lag defaults to the user's own first date).
df1 %>%
  mutate(date = mdy_hm(date)) %>% # parse "m/d/Y H:M" text into date-times
  group_by(USERID) %>%
  mutate(
    # `units = "days"` is spelled out in full; `unit = 'day'` only worked
    # through partial matching of the argument name and of its value.
    # as.integer() truncates the fractional part of the day difference.
    numberofdays = as.integer(
      difftime(date, lag(date, default = first(date)), units = "days")
    )
  )
# A tibble: 8 x 5
# Groups: USERID [3]
# ID date USERID SALES numberofdays
# <int> <dttm> <dbl> <dbl> <int>
#1 1 2018-11-19 10:36:00 500 1000 0
#2 2 2018-11-19 10:41:00 520 1450 0
#3 3 2018-11-23 10:59:00 500 1390 4
#4 4 2018-11-23 11:12:00 530 1778 0
#5 5 2018-11-29 11:52:00 530 1966 6
#6 6 2018-12-05 12:23:00 520 1100 16
#7 7 2018-12-19 12:24:00 520 700 14
#8 8 2018-12-25 21:24:00 520 900 6
data
# Example input: eight purchases with the timestamp stored as
# "month/day/year hour:minute" text, a numeric USERID and a SALES amount.
df1 <- structure(list(ID = 1:8, date = c("11/19/2018 10:36", "11/19/2018 10:41",
"11/23/2018 10:59", "11/23/2018 11:12", "11/29/2018 11:52", "12/5/2018 12:23",
"12/19/2018 12:24", "12/25/2018 21:24"), USERID = c(500, 520,
500, 530, 530, 520, 520, 520), SALES = c(1000, 1450, 1390, 1778,
1966, 1100, 700, 900)), class = "data.frame", row.names = c(NA,
-8L))
Related Topics
Locator Equivalent in Ggplot2 (For Maps)
Alignment of Numbers on the Individual Bars with Ggplot2
Rcurl: Http Authentication When Site Responds with Http 401 Code Without Www-Authenticate
Drawing Non-Intersecting Circles
Flip Facet Label and X Axis with Ggplot2
Unique.Data.Table Select Last Row in Place of the First
How to Reverse Legend (Labels and Color) So High Value Starts at Bottom
Iteratively Constructed Dataframe in R
Ggplot Geom_Bar: Stack and Center
R: Saving Ggplot2 Plots in a List
How to Output a Stem and Leaf Plot as a Plot
R: Ggplot2 Make Two Geom_Tile Plots Have Equal Height