Using Group_By with Difftime

Using group_by with difftime

I did some minor editing to your code, but basically you need to associate the results of ymd_hms with your mydata:

library(dplyr)

mydata$diffTime <- c(0, difftime(lubridate::ymd_hms(mydata$timestamp[-1]),
                                 lubridate::ymd_hms(mydata$timestamp[-nrow(mydata)]),
                                 units = "hours"))
diffTime <- mydata %>% group_by(id) %>% summarize(mean(diffTime))

Returns:

R> diffTime
# A tibble: 2 x 2
  id    `mean(diffTime)`
  <chr>            <dbl>
1 1             0.008333
2 2             0.033333
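
The question's mydata isn't shown above, so for reference, a minimal sketch of the kind of input this expects (hypothetical ids and timestamps, not the OP's data) would be:

# Hypothetical example input -- not the OP's data
mydata <- data.frame(
  id        = c("1", "1", "1", "2", "2", "2"),
  timestamp = c("2021-01-01 00:00:00", "2021-01-01 00:00:30", "2021-01-01 00:01:00",
                "2021-01-01 00:02:00", "2021-01-01 00:04:00", "2021-01-01 00:06:00"),
  stringsAsFactors = FALSE
)

Note that because diffTime is computed over the whole frame before grouping, the first difference inside each later id spans the boundary with the previous id.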

R groupby function in calculating difference between time

We could use difftime after converting to date-time with as.POSIXct (per the OP's post, strptime could also have been used, but it returns the POSIXlt class, which is not compatible with the tidyverse):

library(dplyr)
data %>%
  mutate(timeN = as.POSIXct(time, format = '%H:%M:%S')) %>%
  group_by(ID) %>%
  summarise(timediff = difftime(first(timeN), last(timeN), units = 'secs'))
# A tibble: 2 x 2
#      ID timediff
#   <dbl> <time>
#1      1 -2
#2      2 -70
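
As an aside, the class difference mentioned above is easy to check directly (a small sketch, not part of the original answer):

# strptime() returns POSIXlt (a list-like date-time), while as.POSIXct()
# returns POSIXct (a numeric timestamp); tibbles and dplyr verbs work best
# with POSIXct columns.
class(strptime("13:44:07", format = "%H:%M:%S"))    # "POSIXlt" "POSIXt"
class(as.POSIXct("13:44:07", format = "%H:%M:%S"))  # "POSIXct" "POSIXt"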

data

data <- structure(list(ID = c(1, 1, 2, 2), time = c("13:44:07", "13:44:09", 
"13:44:34", "13:45:44")), .Names = c("ID", "time"), row.names = c(NA,
-4L), class = "data.frame")

how to calculate time difference between dates by group

If we need to group by 'location':

library(dplyr)
df %>%
  group_by(location) %>%
  mutate(time.diff.mins = difftime(date.time, lag(date.time), units = 'mins'))
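
The df isn't shown in the question; a hypothetical input with matching column names (values made up) could look like the following, and with it the first row of each location gets NA because lag() has nothing to compare against:

# Hypothetical data -- column names follow the code above, values are made up
df <- data.frame(
  location  = c("A", "A", "A", "B", "B"),
  date.time = as.POSIXct(c("2020-01-01 10:00:00", "2020-01-01 10:05:00",
                           "2020-01-01 10:20:00", "2020-01-01 11:00:00",
                           "2020-01-01 11:30:00"))
)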

Group by using 2 columns and if the time difference between the rows of third column is less than 2 seconds python pandas

In the sample data, the difference in the last group is 5 seconds (13:27:59 - 13:27:54 = 5 seconds).

The solution is to add DURATION (in seconds) to DATE_TIME to create a new column add, then per group compute the differences of add with DataFrameGroupBy.diff, compare them against N (a difference greater than N starts a new group), take the cumulative sum to build a group column g, and finally aggregate with first and sum:

N = 5
dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])

dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
f = lambda x: x.diff().dt.total_seconds().gt(N).cumsum()
dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].transform(f)
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add g
0 190 390 2020-04-20 12:44:36 323 2020-04-20 12:49:59 0
1 282 811 2020-04-06 11:12:24 25 2020-04-06 11:12:49 0
2 495 414 2020-04-20 11:22:13 11 2020-04-20 11:22:24 0
3 827 158 2020-04-30 13:27:32 22 2020-04-30 13:27:54 0
4 827 158 2020-04-30 13:27:44 15 2020-04-30 13:27:59 0

dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())

print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 190 390 2020-04-20 12:44:36 323
1 282 811 2020-04-06 11:12:24 25
2 495 414 2020-04-20 11:22:13 11
3 827 158 2020-04-30 13:27:32 37

If instead we need to compare DATE_TIME against the per-group shifted add column, the solution (with the new data) is:

N = 2

dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])

dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
dataframe['diff'] = dataframe['DATE_TIME'].sub(dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].shift()).dt.total_seconds().gt(N)

dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['diff'].cumsum()
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add \
0 190 390 2020-04-20 12:44:36 323 2020-04-20 12:49:59
1 282 811 2020-04-06 11:12:24 25 2020-04-06 11:12:49
2 495 414 2020-04-20 11:22:13 11 2020-04-20 11:22:24
3 827 158 2020-04-30 13:27:22 22 2020-04-30 13:27:44
4 827 158 2020-04-30 13:27:44 15 2020-04-30 13:27:59

diff g
0 False 0
1 False 0
2 False 0
3 False 0
4 False 0


dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())

print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 190 390 2020-04-20 12:44:36 323
1 282 811 2020-04-06 11:12:24 25
2 495 414 2020-04-20 11:22:13 11
3 827 158 2020-04-30 13:27:22 37

Tested with the 3rd sample:

N = 2

dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])

dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
dataframe['diff'] = dataframe['DATE_TIME'].sub(dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].shift()).dt.total_seconds().gt(N)

dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['diff'].cumsum()
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add \
0 441785807190 4299330390 2020-04-20 12:44:36 323 2020-04-20 12:49:59
1 96897940827 3139578158 2020-04-30 13:27:00 33 2020-04-30 13:27:33
2 441785808282 4238900811 2020-04-06 11:12:24 25 2020-04-06 11:12:49
3 14244012495 3104405414 2020-04-20 11:22:13 11 2020-04-20 11:22:24
4 96897940827 3139578158 2020-04-30 13:27:34 16 2020-04-30 13:27:50

diff g
0 False 0
1 False 0
2 False 0
3 False 0
4 False 0


dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())

print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 14244012495 3104405414 2020-04-20 11:22:13 11
1 96897940827 3139578158 2020-04-30 13:27:00 49
2 441785807190 4299330390 2020-04-20 12:44:36 323
3 441785808282 4238900811 2020-04-06 11:12:24 25

Group by id and store time differences (intervals) in a list

Use summarise to store the data in a list.

library(dplyr)

d %>%
  group_by(ID) %>%
  summarise(Time_interval = list(as.numeric(na.omit(round(difftime(Time,
                                 lag(Time), units = 'mins')))))) -> result

result
# A tibble: 2 x 2
# ID Time_interval
# <int> <list>
#1 1 <dbl [3]>
#2 2 <dbl [1]>

result$Time_interval

#[[1]]
#[1] 2 3 80

#[[2]]
#[1] 6

data

d <- structure(list(ID = c(1L, 2L, 1L, 1L, 2L, 1L), Time = structure(c(1581266398, 
1582134325, 1581266545, 1581266734, 1582134665, 1581271525), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -6L), class = "data.frame")

R grouping based on time difference

I think you could create the groups without having to do anything particularly fancy.

First we clean col_1, then get the groups. Note that I create lag_time_diff to help with readability, but you can put it directly inside the cumsum if you want.

df$col_1 <- as.POSIXct(df$col_1, format = "%m/%d/%Y")

lag_time_diff <- difftime(df$col_1, lag(df$col_1, default = df$col_1[1]), units = "days")
df$group <- cumsum(ifelse(lag_time_diff>5,1,0))

df
# col_1 col_2 group
#1 2007-11-13 A 0
#2 2007-11-17 B 0
#3 2007-11-19 C 0
#4 2007-11-25 D 1
#5 2007-11-28 E 1

All this does is check whether the lagged difference in times is > 5; if it is, the group counter is incremented by 1, otherwise it keeps the same value.
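
To make that concrete, the intermediate values for the df above work out like this (a sketch; the numbers follow from the dates shown):

# Intermediate values for the df shown above
lag_time_diff                            # 0 4 2 6 3 (days)
ifelse(lag_time_diff > 5, 1, 0)          # 0 0 0 1 0
cumsum(ifelse(lag_time_diff > 5, 1, 0))  # 0 0 0 1 1  -> the group column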

Use difftime to group by interval

There is an hms::round_hms() function that seems to work:

> test %>% mutate(hms::round_hms(ts, 5))

# A tibble: 10 x 3
ts val `hms::round_hms(ts, 5)`
<time> <dbl> <time>
1 00'00" -0.7 00'00"
2 00'01" -1.69 00'00"
3 00'02" 0.03 00'00"
4 00'03" 0.570 00'05"
5 00'04" -0.15 00'05"
6 00'05" -0.34 00'05"
7 00'06" -0.45 00'05"
8 00'07" 0.77 00'05"
9 00'08" 0.6 00'10"
10 00'09" 0.01 00'10"

If you want to floor it, I think you need a custom function, but the source for round_hms() gives a good template for accomplishing that: https://github.com/tidyverse/hms/blob/master/R/round.R

And, here it is:

floor_hms <- function(x, secs) {
  vctrs::vec_restore(floor(as.numeric(x) / secs) * secs, x)
}

Example:

> test %>% mutate(hms::round_hms(ts, 5), floor_hms(ts, 5))

# A tibble: 10 x 4
ts val `hms::round_hms(ts, 5)` `floor_hms(ts, 5)`
<time> <dbl> <time> <time>
1 00'00" -0.7 00'00" 00'00"
2 00'01" -1.69 00'00" 00'00"
3 00'02" 0.03 00'00" 00'00"
4 00'03" 0.570 00'05" 00'00"
5 00'04" -0.15 00'05" 00'00"
6 00'05" -0.34 00'05" 00'05"
7 00'06" -0.45 00'05" 00'05"
8 00'07" 0.77 00'05" 00'05"
9 00'08" 0.6 00'10" 00'05"
10 00'09" 0.01 00'10" 00'05"
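
The test tibble isn't shown in the question; a minimal stand-in that produces the ts column above (with made-up val) might be:

# Hypothetical stand-in for the OP's `test` tibble:
# ten one-second observations starting at 00:00:00
library(tibble)
test <- tibble(
  ts  = hms::hms(seconds = 0:9),
  val = rnorm(10)
)

Grouping by the rounded (or floored) column then gives the 5-second bins.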

How to group data by time difference in r?

You can use lag to get the previous value, use it in difftime to get the difference in hours, and increment the group value every time the difference is greater than 3 hours.

library(dplyr)
df %>%
  mutate(group = cumsum(difftime(time, lag(time, default = first(time)),
                                 units = "hours") > 3) + 1)

# time type day month gas response group
#1 2018-08-21 14:30:00 old_std 21 8 benzene_area 10260721 1
#2 2018-08-21 15:30:00 old_std 21 8 benzene_area 2591905 1
#3 2018-11-09 20:00:00 old_std 9 11 benzene_area 684166 2
#4 2018-11-09 23:00:00 old_std 9 11 benzene_area 589498 2
#5 2018-11-10 02:00:00 old_std 10 11 benzene_area 460714 2
#6 2018-11-10 05:00:00 old_std 10 11 benzene_area 490663 2
#7 2018-11-10 11:30:00 old_std 10 11 benzene_area 404947 3
#8 2018-11-10 14:30:00 old_std 10 11 benzene_area 402566 3
#9 2018-11-10 16:30:00 old_std 10 11 benzene_area 362376 3
#10 2018-11-11 00:00:00 old_std 11 11 benzene_area 276184 4

In base R, that would be:

df$group <- cumsum(c(TRUE, difftime(df$time[-1], df$time[-nrow(df)],
units = "hours") > 3))
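
As a quick sanity check (assuming the df shown above, with time already POSIXct), both versions produce the same group column:

# Hypothetical check that the dplyr and base R groupings agree
group_dplyr <- cumsum(difftime(df$time, dplyr::lag(df$time, default = dplyr::first(df$time)),
                               units = "hours") > 3) + 1
group_base  <- cumsum(c(TRUE, difftime(df$time[-1], df$time[-nrow(df)],
                                       units = "hours") > 3))
identical(as.integer(group_dplyr), as.integer(group_base))  # TRUE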

Calculate difftime only if a column value matches in R

Don't use $ inside dplyr pipes. Also, it is always safer to be explicit about the units in difftime.

library(dplyr)

dt <- dt %>%
  group_by(serviceID) %>%
  mutate(diff = as.numeric(difftime(datetime, lag(datetime), units = 'secs'))) %>%
  ungroup()

If you want to do this in data.table:

library(data.table)
setDT(dt)

dt[, diff := as.numeric(difftime(datetime, shift(datetime), units = 'secs')), serviceID]
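
Neither dt nor its columns are shown in the question; a hypothetical input (made-up serviceID and datetime values) for trying either version could be:

# Hypothetical input -- made-up serviceID and datetime values
dt <- data.frame(
  serviceID = c("S1", "S1", "S1", "S2", "S2"),
  datetime  = as.POSIXct(c("2021-03-01 08:00:00", "2021-03-01 08:00:45",
                           "2021-03-01 08:02:00", "2021-03-01 09:00:00",
                           "2021-03-01 09:10:00"))
)

With these values, diff comes out as NA, 45, 75, NA, 600 seconds (NA for the first row of each serviceID).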

