Get average time difference between current and previous row
Assuming you're using MySQL 8 or later, you can use the lag window function to find the previous datetime and calculate the difference. The rest is straightforward:
with cte as (
select cast(ts as date) as dt
, ts
, lag(ts) over(partition by cast(ts as date) order by ts) as prev_ts
from t
)
select dt
, count(*) as count
, avg(timestampdiff(second, prev_ts, ts)) as avg_diff
from cte
group by dt
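For comparison, the same per-day average gap can be sketched in pandas (the DataFrame and timestamp values below are illustrative, not from the question):

```python
import pandas as pd

# Illustrative timestamps: three on the first day, two on the second
t = pd.DataFrame({"ts": pd.to_datetime([
    "2023-05-01 10:00:00", "2023-05-01 10:00:30", "2023-05-01 10:01:30",
    "2023-05-02 09:00:00", "2023-05-02 09:02:00",
])})

t = t.sort_values("ts")
t["dt"] = t["ts"].dt.date
# diff() within each day plays the role of lag(ts) over (partition by date order by ts)
t["gap"] = t.groupby("dt")["ts"].diff().dt.total_seconds()

result = t.groupby("dt").agg(count=("ts", "size"), avg_diff=("gap", "mean"))
print(result)
```

Like the SQL avg, the mean skips the NaN produced by the first row of each day.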
python difference with previous row by group
Shift the value column within each group, then subtract the shifted column from the original value column to create the difference column.
df.sort_values(['group','date'], ascending=[True,True], inplace=True)
df['shift'] = df.groupby('group')['value'].shift()
df['diff'] = df['value'] - df['shift']
df = df[['date','group','value','diff']]
df
date group value diff
0 2020-01-01 A 808 NaN
2 2020-01-02 A 612 -196.0
4 2020-01-03 A 234 -378.0
5 2020-01-04 A 828 594.0
6 2020-01-05 A 1075 247.0
10 2020-01-10 A 1436 361.0
1 2020-01-01 B 331 NaN
3 2020-01-02 B 1391 1060.0
6 2020-01-04 B 820 -571.0
8 2020-01-07 B 572 -248.0
9 2020-01-10 B 736 164.0
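The shift-and-subtract pattern above is equivalent to calling DataFrameGroupBy.diff directly; a minimal sketch of the same idea (values taken from the first rows above):

```python
import pandas as pd

df = pd.DataFrame({
    "group": ["A", "A", "A", "B", "B"],
    "value": [808, 612, 234, 331, 1391],
})

# diff() subtracts the previous value within each group in one step
df["diff"] = df.groupby("group")["value"].diff()
print(df)
```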
Group by using 2 columns and if the time difference between the rows of third column is less than 2 seconds python pandas
In the sample data the difference in the last group is 5 seconds (13:27:59 - 13:27:54 = 5 seconds).
The solution is to add DURATION in seconds to a new column add, get the per-group differences with DataFrameGroupBy.diff, compare them for greater than N, take the cumulative sum of that boolean mask to create a new group column, and last aggregate by first and sum:
N = 5
dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])
dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
f = lambda x: x.diff().dt.total_seconds().gt(N).cumsum()
dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].transform(f)
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add g
0 190 390 2020-04-20 12:44:36 323 2020-04-20 12:49:59 0
1 282 811 2020-04-06 11:12:24 25 2020-04-06 11:12:49 0
2 495 414 2020-04-20 11:22:13 11 2020-04-20 11:22:24 0
3 827 158 2020-04-30 13:27:32 22 2020-04-30 13:27:54 0
4 827 158 2020-04-30 13:27:44 15 2020-04-30 13:27:59 0
dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 190 390 2020-04-20 12:44:36 323
1 282 811 2020-04-06 11:12:24 25
2 495 414 2020-04-20 11:22:13 11
3 827 158 2020-04-30 13:27:32 37
If you need to compare DATE_TIME with the add column shifted per group, the solution (with new data) is:
N = 2
dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])
dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
dataframe['diff'] = dataframe['DATE_TIME'].sub(dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].shift()).dt.total_seconds().gt(N)
dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['diff'].cumsum()
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add \
0 190 390 2020-04-20 12:44:36 323 2020-04-20 12:49:59
1 282 811 2020-04-06 11:12:24 25 2020-04-06 11:12:49
2 495 414 2020-04-20 11:22:13 11 2020-04-20 11:22:24
3 827 158 2020-04-30 13:27:22 22 2020-04-30 13:27:44
4 827 158 2020-04-30 13:27:44 15 2020-04-30 13:27:59
diff g
0 False 0
1 False 0
2 False 0
3 False 0
4 False 0
dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 190 390 2020-04-20 12:44:36 323
1 282 811 2020-04-06 11:12:24 25
2 495 414 2020-04-20 11:22:13 11
3 827 158 2020-04-30 13:27:22 37
Tested with the 3rd sample:
N = 2
dataframe['DATE_TIME'] = pd.to_datetime(dataframe['DATE_TIME'])
dataframe['add'] = dataframe['DATE_TIME'] + pd.to_timedelta(dataframe['DURATION'],unit='s')
dataframe['diff'] = dataframe['DATE_TIME'].sub(dataframe.groupby(["A_PERSON", "B_PERSON"])['add'].shift()).dt.total_seconds().gt(N)
dataframe['g'] = dataframe.groupby(["A_PERSON", "B_PERSON"])['diff'].cumsum()
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION add \
0 441785807190 4299330390 2020-04-20 12:44:36 323 2020-04-20 12:49:59
1 96897940827 3139578158 2020-04-30 13:27:00 33 2020-04-30 13:27:33
2 441785808282 4238900811 2020-04-06 11:12:24 25 2020-04-06 11:12:49
3 14244012495 3104405414 2020-04-20 11:22:13 11 2020-04-20 11:22:24
4 96897940827 3139578158 2020-04-30 13:27:34 16 2020-04-30 13:27:50
diff g
0 False 0
1 False 0
2 False 0
3 False 0
4 False 0
dataframe = (dataframe.groupby(["A_PERSON", "B_PERSON", 'g'])
.agg({'DATE_TIME':'first', 'DURATION':'sum'})
.droplevel(-1)
.reset_index())
print (dataframe)
A_PERSON B_PERSON DATE_TIME DURATION
0 14244012495 3104405414 2020-04-20 11:22:13 11
1 96897940827 3139578158 2020-04-30 13:27:00 49
2 441785807190 4299330390 2020-04-20 12:44:36 323
3 441785808282 4238900811 2020-04-06 11:12:24 25
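For completeness, here is the second solution as one self-contained, runnable sketch; the input rows are reconstructed from the printed output above:

```python
import pandas as pd

dataframe = pd.DataFrame({
    "A_PERSON": [190, 282, 495, 827, 827],
    "B_PERSON": [390, 811, 414, 158, 158],
    "DATE_TIME": ["2020-04-20 12:44:36", "2020-04-06 11:12:24",
                  "2020-04-20 11:22:13", "2020-04-30 13:27:22", "2020-04-30 13:27:44"],
    "DURATION": [323, 25, 11, 22, 15],
})

N = 2
dataframe["DATE_TIME"] = pd.to_datetime(dataframe["DATE_TIME"])
dataframe["add"] = dataframe["DATE_TIME"] + pd.to_timedelta(dataframe["DURATION"], unit="s")
# A new session starts where the gap to the previous call's end exceeds N seconds
dataframe["diff"] = (dataframe["DATE_TIME"]
                     .sub(dataframe.groupby(["A_PERSON", "B_PERSON"])["add"].shift())
                     .dt.total_seconds().gt(N))
dataframe["g"] = dataframe.groupby(["A_PERSON", "B_PERSON"])["diff"].cumsum()

out = (dataframe.groupby(["A_PERSON", "B_PERSON", "g"])
                .agg({"DATE_TIME": "first", "DURATION": "sum"})
                .droplevel(-1)
                .reset_index())
print(out)
```

The two back-to-back calls for the (827, 158) pair merge into one row with summed DURATION.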
How to calculate difference on previous within set of grouped rows in a dataframe
[Note: your data doesn't seem to match your desired output; there are no CONTRACT_REF C rows in the second, and even in your output I don't see why the 5, B row is 1 and not 0. I'm assuming these are mistakes on your part. Since you didn't comment, I'm going to use the data from the output, because it leads to a more interesting column.]
I might do something like
df["SUBMISSION_DATE"] = pd.to_datetime(df["SUBMISSION_DATE"],dayfirst=True)
gs = df.groupby(["USER_ID", "CONTRACT_REF"])["SUBMISSION_DATE"]
df["TIME_DIFF"] = gs.diff().fillna(pd.Timedelta(0)) / pd.Timedelta(hours=1)
which produces
>>> df
# USER_ID CONTRACT_REF SUBMISSION_DATE TIME_DIFF
0 1 1 A 2014-06-20 01:00:00 0.0
1 2 1 A 2014-06-20 02:00:00 1.0
2 3 1 B 2014-06-20 03:00:00 0.0
3 4 4 A 2014-06-20 04:00:00 0.0
4 5 5 A 2014-06-20 05:00:00 0.0
5 6 5 B 2014-06-20 06:00:00 0.0
6 7 7 A 2014-06-20 07:00:00 0.0
7 8 7 A 2014-06-20 08:00:00 1.0
8 9 7 A 2014-06-20 09:30:00 1.5
9 10 7 B 2014-06-20 10:00:00 0.0
[10 rows x 5 columns]
Some explanation: starting from a dataframe like
>>> df
# USER_ID CONTRACT_REF SUBMISSION_DATE
0 1 1 A 20/6 01:00
1 2 1 A 20/6 02:00
2 3 1 B 20/6 03:00
3 4 4 A 20/6 04:00
4 5 5 A 20/6 05:00
5 6 5 B 20/6 06:00
6 7 7 A 20/6 07:00
7 8 7 A 20/6 08:00
8 9 7 A 20/6 09:30
9 10 7 B 20/6 10:00
[10 rows x 4 columns]
We want to turn the SUBMISSION_DATE column from strings to real date objects:
>>> df["SUBMISSION_DATE"] = pd.to_datetime(df["SUBMISSION_DATE"],dayfirst=True)
>>> df
# USER_ID CONTRACT_REF SUBMISSION_DATE
0 1 1 A 2014-06-20 01:00:00
1 2 1 A 2014-06-20 02:00:00
2 3 1 B 2014-06-20 03:00:00
3 4 4 A 2014-06-20 04:00:00
4 5 5 A 2014-06-20 05:00:00
5 6 5 B 2014-06-20 06:00:00
6 7 7 A 2014-06-20 07:00:00
7 8 7 A 2014-06-20 08:00:00
8 9 7 A 2014-06-20 09:30:00
9 10 7 B 2014-06-20 10:00:00
[10 rows x 4 columns]
Then we can group by USER_ID and CONTRACT_REF, and select the SUBMISSION_DATE column:
>>> gs = df.groupby(["USER_ID", "CONTRACT_REF"])["SUBMISSION_DATE"]
>>> gs
<pandas.core.groupby.SeriesGroupBy object at 0xa7af08c>
Then we can take the difference of each group:
>>> gs.diff()
0 NaT
1 01:00:00
2 NaT
3 NaT
4 NaT
5 NaT
6 NaT
7 01:00:00
8 01:30:00
9 NaT
dtype: timedelta64[ns]
NaT, Not-a-Time, is the temporal equivalent of NaN. We can fill these with a zero timedelta:
>>> gs.diff().fillna(pd.Timedelta(0))
0 00:00:00
1 01:00:00
2 00:00:00
3 00:00:00
4 00:00:00
5 00:00:00
6 00:00:00
7 01:00:00
8 01:30:00
9 00:00:00
dtype: timedelta64[ns]
And since you want things to be measured in hours, we can divide by a timedelta of 1 hour:
>>> gs.diff().fillna(pd.Timedelta(0)) / pd.Timedelta(hours=1)
0 0.0
1 1.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 1.0
8 1.5
9 0.0
dtype: float64
Assign this to the frame:
>>> df["TIME_DIFF"] = gs.diff().fillna(pd.Timedelta(0)) / pd.Timedelta(hours=1)
And we're done:
>>> df
# USER_ID CONTRACT_REF SUBMISSION_DATE TIME_DIFF
0 1 1 A 2014-06-20 01:00:00 0.0
1 2 1 A 2014-06-20 02:00:00 1.0
2 3 1 B 2014-06-20 03:00:00 0.0
3 4 4 A 2014-06-20 04:00:00 0.0
4 5 5 A 2014-06-20 05:00:00 0.0
5 6 5 B 2014-06-20 06:00:00 0.0
6 7 7 A 2014-06-20 07:00:00 0.0
7 8 7 A 2014-06-20 08:00:00 1.0
8 9 7 A 2014-06-20 09:30:00 1.5
9 10 7 B 2014-06-20 10:00:00 0.0
[10 rows x 5 columns]
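As a side note, dividing a timedelta Series by pd.Timedelta(hours=1) and converting via dt.total_seconds() give the same hours; a minimal standalone sketch:

```python
import pandas as pd

s = pd.to_datetime(pd.Series(["2014-06-20 01:00", "2014-06-20 02:30"]))
td = s.diff().fillna(pd.Timedelta(0))

hours_a = td / pd.Timedelta(hours=1)    # division by a one-hour timedelta
hours_b = td.dt.total_seconds() / 3600  # explicit seconds-to-hours conversion
print(hours_a.tolist(), hours_b.tolist())
```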
Calculate difference between values in consecutive rows by group
The package data.table can do this fairly quickly, using the shift function.
require(data.table)
df <- data.table(group = rep(c(1, 2), each = 3), value = c(10,20,25,5,10,15))
#setDT(df) #if df is already a data frame
df[ , diff := value - shift(value), by = group]
# group value diff
#1: 1 10 NA
#2: 1 20 10
#3: 1 25 5
#4: 2 5 NA
#5: 2 10 5
#6: 2 15 5
setDF(df) #if you want to convert back to old data.frame syntax
Or using the lag function in dplyr:
library(dplyr)
df %>%
group_by(group) %>%
mutate(Diff = value - lag(value))
# group value Diff
# <int> <int> <int>
# 1 1 10 NA
# 2 1 20 10
# 3 1 25 5
# 4 2 5 NA
# 5 2 10 5
# 6 2 15 5
For alternatives predating data.table::shift and dplyr::lag, see edits.
R: Calculate difference between values in rows with group reference
Try the code below
transform(
df,
Diff = ave(value, group, FUN = function(x) c(NA, diff(x)))
)
which gives
group value Diff
1 1 10 NA
2 1 20 10
3 1 25 5
4 2 5 NA
5 2 10 5
6 2 15 5
Calculating time difference in hours between two different rows per group with a 'lag' in one column
You have a parenthesis out of place: .shift() was outside your parentheses. Fixed here:
df['LAYOVER']=df.groupby('CODE').apply(lambda x:x['DEPART']- x['ARRIVAL'].shift(1)).fillna('0').reset_index(drop=True)
df['LAYOVER'].apply(lambda x: pd.Timedelta(x).seconds)
0 0
1 5040
2 63360
3 2880
4 0
5 23700
6 0
Name: LAYOVER, dtype: int64
This is more concise:
df.groupby('CODE').apply(lambda x:x['DEPART']- x['ARRIVAL'].shift(1)).fillna('0').reset_index(drop=True).dt.seconds
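Since the question's data isn't shown, here is a runnable sketch of the same groupby/shift layover idea on invented rows (the column names follow the answer; the timestamps and codes are hypothetical):

```python
import pandas as pd

df = pd.DataFrame({
    "CODE": ["X", "X", "Y"],
    "ARRIVAL": pd.to_datetime(["2020-01-01 10:00", "2020-01-01 13:00", "2020-01-01 09:00"]),
    "DEPART":  pd.to_datetime(["2020-01-01 09:00", "2020-01-01 11:24", "2020-01-01 08:00"]),
})

# Layover = this row's departure minus the previous row's arrival, per CODE
layover = (df.groupby("CODE")
             .apply(lambda x: x["DEPART"] - x["ARRIVAL"].shift(1))
             .reset_index(drop=True)
             .fillna(pd.Timedelta(0)))
print(layover.dt.seconds.tolist())
```

The first row of each CODE group has no previous arrival, so its layover is filled with zero.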
How to calculate time difference in consecutive rows
When you just add default = strptime(v_time, "%d/%m/%Y %H:%M")[1] to the lag part:
df <- df %>%
arrange(visitor, v_time) %>%
group_by(visitor) %>%
mutate(diff = strptime(v_time, "%d/%m/%Y %H:%M") - lag(strptime(v_time, "%d/%m/%Y %H:%M"), default = strptime(v_time, "%d/%m/%Y %H:%M")[1]),
diff_secs = as.numeric(diff, units = 'secs'))
you get the result you expect:
> df
# A tibble: 8 x 6
# Groups: visitor [3]
visitor v_time payment items diff diff_secs
<fct> <fct> <dbl> <dbl> <time> <dbl>
1 David 1/2/2018 16:12 25. 2. 0 0.
2 David 1/2/2018 16:21 25. 5. 540 540.
3 Jack 1/2/2018 16:07 35. 3. 0 0.
4 Jack 1/2/2018 16:09 160. 1. 120 120.
5 Jack 1/2/2018 16:32 85. 5. 1380 1380.
6 Jack 1/2/2018 16:55 6. 2. 1380 1380.
7 Kate 1/2/2018 16:16 3. 3. 0 0.
8 Kate 1/2/2018 16:33 639. 3. 1020 1020.
Another option is to use difftime:
df <- df %>%
arrange(visitor, v_time) %>%
group_by(visitor) %>%
mutate(diff = difftime(strptime(v_time, "%d/%m/%Y %H:%M"), lag(strptime(v_time, "%d/%m/%Y %H:%M"), default = strptime(v_time, "%d/%m/%Y %H:%M")[1]), units = 'mins'),
diff_secs = as.numeric(diff, units = 'secs'))
Now the diff column is in minutes and the diff_secs column is in seconds:
> df
# A tibble: 8 x 6
# Groups: visitor [3]
visitor v_time payment items diff diff_secs
<fct> <fct> <dbl> <dbl> <time> <dbl>
1 David 1/2/2018 16:12 25. 2. 0 0.
2 David 1/2/2018 16:21 25. 5. 9 540.
3 Jack 1/2/2018 16:07 35. 3. 0 0.
4 Jack 1/2/2018 16:09 160. 1. 2 120.
5 Jack 1/2/2018 16:32 85. 5. 23 1380.
6 Jack 1/2/2018 16:55 6. 2. 23 1380.
7 Kate 1/2/2018 16:16 3. 3. 0 0.
8 Kate 1/2/2018 16:33 639. 3. 17 1020.
You can now save the result again with write.csv(df,"C:/output.csv", row.names = FALSE)
how to calculate time difference between dates by group
If we need to group by 'location':
library(dplyr)
df %>%
group_by(location) %>%
mutate(time.diff.mins = difftime(date.time, lag(date.time), units = 'mins'))