Using Group_By with Difftime

Using group_by with difftime

I did some minor editing to your code, but basically you need to associate the results of ymd_hms with your mydata:

library(dplyr)

mydata$diffTime <- c(0, difftime(lubridate::ymd_hms(mydata$timestamp[-1]),
                                 lubridate::ymd_hms(mydata$timestamp[-nrow(mydata)]),
                                 units = "hours"))
diffTime <- mydata %>% group_by(id) %>% summarize(mean(diffTime))

Returns:

R> diffTime
# A tibble: 2 x 2
  id    `mean(diffTime)`
  <chr>            <dbl>
1 1             0.008333
2 2             0.033333
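
The question's mydata isn't shown above, so for reference, a minimal sketch of the kind of input this expects (hypothetical ids and timestamps, not the OP's data) would be:

# Hypothetical example input -- not the OP's data
mydata <- data.frame(
  id        = c("1", "1", "1", "2", "2", "2"),
  timestamp = c("2021-01-01 00:00:00", "2021-01-01 00:00:30", "2021-01-01 00:01:00",
                "2021-01-01 00:02:00", "2021-01-01 00:04:00", "2021-01-01 00:06:00"),
  stringsAsFactors = FALSE
)

Note that because diffTime is computed over the whole frame before grouping, the first difference inside each later id spans the boundary with the previous id.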

R groupby function in calculating difference between time

We could use difftime after converting to date-time with as.POSIXct (per the OP's post, strptime could also have been used, but it returns the POSIXlt class, which is not compatible with the tidyverse):

library(dplyr)
data %>%
  mutate(timeN = as.POSIXct(time, format = '%H:%M:%S')) %>%
  group_by(ID) %>%
  summarise(timediff = difftime(first(timeN), last(timeN), units = 'secs'))
# A tibble: 2 x 2
#      ID timediff
#   <dbl> <time>
#1      1 -2
#2      2 -70
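
As an aside, the class difference mentioned above is easy to check directly (a small sketch, not part of the original answer):

# strptime() returns POSIXlt (a list-like date-time), while as.POSIXct()
# returns POSIXct (a numeric timestamp); tibbles and dplyr verbs work best
# with POSIXct columns.
class(strptime("13:44:07", format = "%H:%M:%S"))    # "POSIXlt" "POSIXt"
class(as.POSIXct("13:44:07", format = "%H:%M:%S"))  # "POSIXct" "POSIXt"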

data

data <- structure(list(ID = c(1, 1, 2, 2), time = c("13:44:07", "13:44:09", 
"13:44:34", "13:45:44")), .Names = c("ID", "time"), row.names = c(NA,
-4L), class = "data.frame")

how to calculate time difference between dates by group

If we need to group by 'location':

library(dplyr)
df %>%
  group_by(location) %>%
  mutate(time.diff.mins = difftime(date.time, lag(date.time), units = 'mins'))
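
The df isn't shown in the question; a hypothetical input with matching column names (values made up) could look like the following, and with it the first row of each location gets NA because lag() has nothing to compare against:

# Hypothetical data -- column names follow the code above, values are made up
df <- data.frame(
  location  = c("A", "A", "A", "B", "B"),
  date.time = as.POSIXct(c("2020-01-01 10:00:00", "2020-01-01 10:05:00",
                           "2020-01-01 10:20:00", "2020-01-01 11:00:00",
                           "2020-01-01 11:30:00"))
)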

Group by using 2 columns and if the time difference between the rows of third column is less than 2 seconds python pandas

In the sample data, the difference in the last group is 5 seconds (13:27:59 - 13:27:54 = 5 seconds).

The solution is to add DURATION (in seconds) to DATE_TIME to create a new column add, then per group compute the differences of add with DataFrameGroupBy.diff, compare them against N (a difference greater than N starts a new group), take the cumulative sum to build a group column g, and finally aggregate with first and sum:

N = 5
dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])

dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
f = lambda x: x.diff().dt.total_seconds().gt(N).cumsum()
dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].transform(f)
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add g
0 190 390 2020-04-20 12:44:36 323 2020-04-20 12:49:59 0
1 282 811 2020-04-06 11:12:24 25 2020-04-06 11:12:49 0
2 495 414 2020-04-20 11:22:13 11 2020-04-20 11:22:24 0
3 827 158 2020-04-30 13:27:32 22 2020-04-30 13:27:54 0
4 827 158 2020-04-30 13:27:44 15 2020-04-30 13:27:59 0

dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())

print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 190 390 2020-04-20 12:44:36 323
1 282 811 2020-04-06 11:12:24 25
2 495 414 2020-04-20 11:22:13 11
3 827 158 2020-04-30 13:27:32 37

If instead we need to compare DATE_TIME against the per-group shifted add column, the solution (with the new data) is:

N = 2

dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])

dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
dataframe['diff'] = dataframe['DATE_TIME'].sub(dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].shift()).dt.total_seconds().gt(N)

dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['diff'].cumsum()
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add \
0 190 390 2020-04-20 12:44:36 323 2020-04-20 12:49:59
1 282 811 2020-04-06 11:12:24 25 2020-04-06 11:12:49
2 495 414 2020-04-20 11:22:13 11 2020-04-20 11:22:24
3 827 158 2020-04-30 13:27:22 22 2020-04-30 13:27:44
4 827 158 2020-04-30 13:27:44 15 2020-04-30 13:27:59

diff g
0 False 0
1 False 0
2 False 0
3 False 0
4 False 0


dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())

print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 190 390 2020-04-20 12:44:36 323
1 282 811 2020-04-06 11:12:24 25
2 495 414 2020-04-20 11:22:13 11
3 827 158 2020-04-30 13:27:22 37

Tested with the 3rd sample:

N = 2

dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])

dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
dataframe['diff'] = dataframe['DATE_TIME'].sub(dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].shift()).dt.total_seconds().gt(N)

dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['diff'].cumsum()
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add \
0 441785807190 4299330390 2020-04-20 12:44:36 323 2020-04-20 12:49:59
1 96897940827 3139578158 2020-04-30 13:27:00 33 2020-04-30 13:27:33
2 441785808282 4238900811 2020-04-06 11:12:24 25 2020-04-06 11:12:49
3 14244012495 3104405414 2020-04-20 11:22:13 11 2020-04-20 11:22:24
4 96897940827 3139578158 2020-04-30 13:27:34 16 2020-04-30 13:27:50

diff g
0 False 0
1 False 0
2 False 0
3 False 0
4 False 0


dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())

print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 14244012495 3104405414 2020-04-20 11:22:13 11
1 96897940827 3139578158 2020-04-30 13:27:00 49
2 441785807190 4299330390 2020-04-20 12:44:36 323
3 441785808282 4238900811 2020-04-06 11:12:24 25

Group by id and store time differences (intervals) in a list

Use summarise to store the data in a list.

library(dplyr)

d %>%
  group_by(ID) %>%
  summarise(Time_interval = list(as.numeric(na.omit(round(difftime(Time,
                                 lag(Time), units = 'mins')))))) -> result

result
# A tibble: 2 x 2
# ID Time_interval
# <int> <list>
#1 1 <dbl [3]>
#2 2 <dbl [1]>

result$Time_interval

#[[1]]
#[1] 2 3 80

#[[2]]
#[1] 6

data

d <- structure(list(ID = c(1L, 2L, 1L, 1L, 2L, 1L), Time = structure(c(1581266398, 
1582134325, 1581266545, 1581266734, 1582134665, 1581271525), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -6L), class = "data.frame")

R grouping based on time difference

I think you could create the groups without having to do anything particularly fancy.

First we clean col_1, then get the groups. Note that I create lag_time_diff to help with readability, but you can put it directly inside the cumsum if you want.

df$col_1 <- as.POSIXct(df$col_1, format = "%m/%d/%Y")

lag_time_diff <- difftime(df$col_1, lag(df$col_1, default = df$col_1[1]), units = "days")
df$group <- cumsum(ifelse(lag_time_diff>5,1,0))

df
# col_1 col_2 group
#1 2007-11-13 A 0
#2 2007-11-17 B 0
#3 2007-11-19 C 0
#4 2007-11-25 D 1
#5 2007-11-28 E 1

All this does is check whether the lagged difference in times is > 5; if it is, the group counter is incremented by 1, otherwise it keeps the same value.
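
To make that concrete, the intermediate values for the df above work out like this (a sketch; the numbers follow from the dates shown):

# Intermediate values for the df shown above
lag_time_diff                            # 0 4 2 6 3 (days)
ifelse(lag_time_diff > 5, 1, 0)          # 0 0 0 1 0
cumsum(ifelse(lag_time_diff > 5, 1, 0))  # 0 0 0 1 1  -> the group column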

Use difftime to group by interval

There is an hms::round_hms() function that seems to work:

> test %>% mutate(hms::round_hms(ts, 5))

# A tibble: 10 x 3
ts val `hms::round_hms(ts, 5)`
<time> <dbl> <time>
1 00'00" -0.7 00'00"
2 00'01" -1.69 00'00"
3 00'02" 0.03 00'00"
4 00'03" 0.570 00'05"
5 00'04" -0.15 00'05"
6 00'05" -0.34 00'05"
7 00'06" -0.45 00'05"
8 00'07" 0.77 00'05"
9 00'08" 0.6 00'10"
10 00'09" 0.01 00'10"

If you want to floor it, I think you need a custom function, but the source for round_hms() gives a good template for accomplishing that: https://github.com/tidyverse/hms/blob/master/R/round.R

And, here it is:

floor_hms <- function(x, secs) {
  vctrs::vec_restore(floor(as.numeric(x) / secs) * secs, x)
}

Example:

> test %>% mutate(hms::round_hms(ts, 5), floor_hms(ts, 5))

# A tibble: 10 x 4
ts val `hms::round_hms(ts, 5)` `floor_hms(ts, 5)`
<time> <dbl> <time> <time>
1 00'00" -0.7 00'00" 00'00"
2 00'01" -1.69 00'00" 00'00"
3 00'02" 0.03 00'00" 00'00"
4 00'03" 0.570 00'05" 00'00"
5 00'04" -0.15 00'05" 00'00"
6 00'05" -0.34 00'05" 00'05"
7 00'06" -0.45 00'05" 00'05"
8 00'07" 0.77 00'05" 00'05"
9 00'08" 0.6 00'10" 00'05"
10 00'09" 0.01 00'10" 00'05"
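
The test tibble isn't shown in the question; a minimal stand-in that produces the ts column above (with made-up val) might be:

# Hypothetical stand-in for the OP's `test` tibble:
# ten one-second observations starting at 00:00:00
library(tibble)
test <- tibble(
  ts  = hms::hms(seconds = 0:9),
  val = rnorm(10)
)

Grouping by the rounded (or floored) column then gives the 5-second bins.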

How to group data by time difference in r?

You can use lag to get the previous value, use it in difftime to get the difference in hours, and increment the group value every time the difference is greater than 3 hours.

library(dplyr)
df %>%
  mutate(group = cumsum(difftime(time, lag(time, default = first(time)),
                                 units = "hours") > 3) + 1)

# time type day month gas response group
#1 2018-08-21 14:30:00 old_std 21 8 benzene_area 10260721 1
#2 2018-08-21 15:30:00 old_std 21 8 benzene_area 2591905 1
#3 2018-11-09 20:00:00 old_std 9 11 benzene_area 684166 2
#4 2018-11-09 23:00:00 old_std 9 11 benzene_area 589498 2
#5 2018-11-10 02:00:00 old_std 10 11 benzene_area 460714 2
#6 2018-11-10 05:00:00 old_std 10 11 benzene_area 490663 2
#7 2018-11-10 11:30:00 old_std 10 11 benzene_area 404947 3
#8 2018-11-10 14:30:00 old_std 10 11 benzene_area 402566 3
#9 2018-11-10 16:30:00 old_std 10 11 benzene_area 362376 3
#10 2018-11-11 00:00:00 old_std 11 11 benzene_area 276184 4

In base R, that would be:

df$group <- cumsum(c(TRUE, difftime(df$time[-1], df$time[-nrow(df)],
units = "hours") > 3))
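
As a quick sanity check (assuming the df shown above, with time already POSIXct), both versions produce the same group column:

# Hypothetical check that the dplyr and base R groupings agree
group_dplyr <- cumsum(difftime(df$time, dplyr::lag(df$time, default = dplyr::first(df$time)),
                               units = "hours") > 3) + 1
group_base  <- cumsum(c(TRUE, difftime(df$time[-1], df$time[-nrow(df)],
                                       units = "hours") > 3))
identical(as.integer(group_dplyr), as.integer(group_base))  # TRUE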

Calculate difftime only if a column value matches in R

Don't use $ inside dplyr pipes. Also, it is always safer to be explicit about the units in difftime.

library(dplyr)

dt <- dt %>%
  group_by(serviceID) %>%
  mutate(diff = as.numeric(difftime(datetime, lag(datetime), units = 'secs'))) %>%
  ungroup()

If you want to do this in data.table:

library(data.table)
setDT(dt)

dt[, diff := as.numeric(difftime(datetime, shift(datetime), units = 'secs')), serviceID]
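
Neither dt nor its columns are shown in the question; a hypothetical input (made-up serviceID and datetime values) for trying either version could be:

# Hypothetical input -- made-up serviceID and datetime values
dt <- data.frame(
  serviceID = c("S1", "S1", "S1", "S2", "S2"),
  datetime  = as.POSIXct(c("2021-03-01 08:00:00", "2021-03-01 08:00:45",
                           "2021-03-01 08:02:00", "2021-03-01 09:00:00",
                           "2021-03-01 09:10:00"))
)

With these values, diff comes out as NA, 45, 75, NA, 600 seconds (NA for the first row of each serviceID).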

