Grouping Every N Minutes with Dplyr

Grouping every n minutes with dplyr

Here is a lubridate/dplyr-esque solution.

library(lubridate)
library(dplyr)
d2 <- data.frame(interval = seq(ymd_hms('2010-05-21 00:00:00'), by = '3 min',length.out=(1440/3)))
d3 <- d1 %>%
mutate(interval = floor_date(date, unit="hour")+minutes(floor(minute(date)/3)*3)) %>%
group_by(interval) %>%
mutate(sumvalue=sum(value)) %>%
select(interval,sumvalue)
d4 <- merge(d2,d3, all=TRUE) # better if left_join is used
tail(d4)
# interval sumvalue
#475 2010-05-21 23:42:00 NA
#476 2010-05-21 23:45:00 NA
#477 2010-05-21 23:48:00 NA
#478 2010-05-21 23:51:00 NA
#479 2010-05-21 23:54:00 NA
#480 2010-05-21 23:57:00 NA
d4[450,]
# interval sumvalue
#450 2010-05-21 22:27:00 643426

If you are comfortable working with Date (I am not), you can dispense with lubridate, and replace the final merge with left_join.

Aggregating data at 15 minutes interval based on date and hour in R

Not exactly giving your expected results though, but perhaps usable.
You can see if this fits your needs.

library(data.table)
setDT(df)

df[, Time := ymd_hm(Time)]
df[, groups := lubridate::round_date(Time, "15 minutes")]
df[, .(Distance_m_sum = sum(Distance_m)), by = groups]

groups Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:15:00 324
4: 2021-08-30 08:30:00 162
5: 2021-08-31 02:45:00 469
6: 2021-08-31 03:00:00 137
7: 2021-08-31 07:45:00 302
8: 2021-08-31 06:00:00 42

More extended example

You have to define your quarters, I think. With the lubridate approach there are three options: round_date, floor_date and ceiling_date. Rethinking my own example, I would pick floor_date, as 2021-08-30 7:24 falls in the 7:15-7:30 group. To see all variants:

library(data.table)
setDT(df)

df[, Time := ymd_hm(Time)]
df[, round_date := lubridate::round_date(Time, "15 minutes")]
df[, floor_date := lubridate::floor_date(Time, "15 minutes")]
df[, ceiling_date := lubridate::ceiling_date(Time, "15 minutes")]

df[, .(Distance_m_sum = sum(Distance_m)), by = round_date]
round_date Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:15:00 324
4: 2021-08-30 08:30:00 162
5: 2021-08-31 02:45:00 469
6: 2021-08-31 03:00:00 137
7: 2021-08-31 07:45:00 302
8: 2021-08-31 06:00:00 42

df[, .(Distance_m_sum = sum(Distance_m)), by = floor_date]
floor_date Distance_m_sum
1: 2021-08-30 07:15:00 162
2: 2021-08-30 07:30:00 162
3: 2021-08-30 07:45:00 162
4: 2021-08-30 08:15:00 486
5: 2021-08-31 02:30:00 319
6: 2021-08-31 02:45:00 287
7: 2021-08-31 07:30:00 122
8: 2021-08-31 07:45:00 180
9: 2021-08-31 06:00:00 42

df[, .(Distance_m_sum = sum(Distance_m)), by = ceiling_date]
ceiling_date Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:30:00 486
4: 2021-08-31 02:45:00 319
5: 2021-08-31 03:00:00 287
6: 2021-08-31 07:45:00 224
7: 2021-08-31 08:00:00 78
8: 2021-08-31 06:15:00 42

dplyr: getting grouped min and max of columns in a for loop

Using the dplyr package, you can get:

df %>%
na.omit() %>%
pivot_longer(-group) %>%
group_by(group, name) %>%
summarise(min = min(value),
max = max(value)) %>%
arrange(name, group)

# group name min max
# <dbl> <chr> <int> <int>
# 1 1 a 1 3
# 2 2 a 4 5
# 3 1 b 6 8
# 4 2 b 9 10
# 5 1 c 11 13
# 6 2 c 14 15

Aggregate time to 10 minute

You can adapt one of these solutions to your data after converting the time column to POSIXct format.

library(dplyr)
library(lubridate)

df %>%
mutate(Time = dmy_hm(Time),
Time = ceiling_date(Time, '10 min')) %>%
group_by(Time) %>%
summarise(kW_Raw_Data = sum(kW_Raw_Data, na.rm = TRUE)) %>%
na.omit()

# Time kW_Raw_Data
# <dttm> <dbl>
#1 2015-01-06 23:40:00 25.8
#2 2015-01-06 23:50:00 27.1
#3 2015-01-07 00:00:00 23.1
#4 2015-01-07 00:10:00 12.1

How to randomly sample entire group based on multiple grouping conditions

You can use lubridate::floor_date to create groups and then filter one randomly sampled frame per group. You can manually set the interval you need in floor_date, here it's "1 minute".

df %>% 
mutate(datetime = ymd_hms(datetime),
fl = floor_date(datetime, "1 minute")) %>%
group_by(uniquename, fl) %>%
filter(frame == sample(unique(frame), 1))

output:

# A tibble: 11 × 5
# Groups: uniquename, fl [4]
uniquename frame id datetime fl
<chr> <dbl> <chr> <dttm> <dttm>
1 unique1 2 b1 2021-05-05 07:05:03 2021-05-05 07:05:00
2 unique1 2 b2 2021-05-05 07:05:03 2021-05-05 07:05:00
3 unique1 2 b3 2021-05-05 07:05:03 2021-05-05 07:05:00
4 unique1 3 b2 2021-05-05 07:07:03 2021-05-05 07:07:00
5 unique1 3 b4 2021-05-05 07:07:03 2021-05-05 07:07:00
6 unique2 1 b3 2021-06-06 09:17:25 2021-06-06 09:17:00
7 unique2 1 b4 2021-06-06 09:17:25 2021-06-06 09:17:00
8 unique2 16 b1 2021-06-06 09:20:59 2021-06-06 09:20:00
9 unique2 16 b2 2021-06-06 09:20:59 2021-06-06 09:20:00
10 unique2 16 b3 2021-06-06 09:20:59 2021-06-06 09:20:00
11 unique2 16 b4 2021-06-06 09:20:59 2021-06-06 09:20:00

R find min and max for each group based on other row

With tidyverse you can try the following approach. First, put your data into long form targeting your year columns. Then, group_by both group and name (which contains the year) and only include subgroups that have a value of x, and keep rows that have condition of 1. Then group_by just group and summarise to get the min and max years. Note, you may wish to convert your year data to numeric after removing x by filtering on condition.

library(tidyverse)

df1 %>%
pivot_longer(cols = -c(group, condition)) %>%
group_by(group, name) %>%
filter(any(value == "x"), condition == 1) %>%
group_by(group) %>%
summarise(min = min(value),
max = max(value))

Output

# A tibble: 3 x 3
group min max
<chr> <chr> <chr>
1 a 2010 2013
2 b 2011 2015
3 c 2010 2014

How to aggregate data in 10 minute-steps

We can use floor_date from the lubridate package to cut time every 10 minutes taking the lower bound, group by it, and sum the ZUL_T values.

library(dplyr)
library(lubridate)
library(tidyr)

df %>%
group_by(date = floor_date(ymd_hms(date), "10 mins")) %>%
summarise(ZUL_T = sum(ZUL_T))

# date ZUL_T
# <dttm> <dbl>
# 1 2019-01-01 00:00:00 23.3
# 2 2019-01-01 00:10:00 23.3
# 3 2019-01-01 00:20:00 19.9
# 4 2019-01-01 00:30:00 20.7
# 5 2019-01-01 00:40:00 21.9
# 6 2019-01-01 00:50:00 21.9
# 7 2019-01-01 01:10:00 18.8
# 8 2019-01-01 01:20:00 18.8
# 9 2019-01-01 01:30:00 20.7
#10 2019-01-01 01:40:00 21.6
#11 2019-01-01 01:50:00 19.2
#12 2019-01-01 02:00:00 19.2
#13 2019-01-01 02:10:00 19.6
#14 2019-01-01 02:20:00 19.6
#15 2019-01-01 02:40:00 20.5

and then use complete and fill to complete the missing combinations and fill the NA values with previous values.

df %>%
group_by(date = floor_date(ymd_hms(date), "10 mins")) %>%
summarise(ZUL_T = sum(ZUL_T)) %>%
complete(date = seq(min(date), max(date), "10 mins")) %>%
fill(ZUL_T)

# date ZUL_T
# <dttm> <dbl>
# 1 2019-01-01 00:00:00 23.3
# 2 2019-01-01 00:10:00 23.3
# 3 2019-01-01 00:20:00 19.9
# 4 2019-01-01 00:30:00 20.7
# 5 2019-01-01 00:40:00 21.9
# 6 2019-01-01 00:50:00 21.9
# 7 2019-01-01 01:00:00 21.9
# 8 2019-01-01 01:10:00 18.8
# 9 2019-01-01 01:20:00 18.8
#10 2019-01-01 01:30:00 20.7
#11 2019-01-01 01:40:00 21.6
#12 2019-01-01 01:50:00 19.2
#13 2019-01-01 02:00:00 19.2
#14 2019-01-01 02:10:00 19.6
#15 2019-01-01 02:20:00 19.6
#16 2019-01-01 02:30:00 19.6
#17 2019-01-01 02:40:00 20.5

data

df <- structure(list(date = structure(1:15, .Label = c("2019-01-01 00:04:00", 
"2019-01-01 00:15:00", "2019-01-01 00:26:00", "2019-01-01 00:37:00",
"2019-01-01 00:48:00", "2019-01-01 00:59:00", "2019-01-01 01:10:00",
"2019-01-01 01:22:00", "2019-01-01 01:33:00", "2019-01-01 01:44:00",
"2019-01-01 01:55:00", "2019-01-01 02:06:00", "2019-01-01 02:17:00",
"2019-01-01 02:29:00", "2019-01-01 02:40:00"), class = "factor"),
ZUL_T = c(23.3, 23.3, 19.9, 20.7, 21.9, 21.9, 18.8, 18.8,
20.7, 21.6, 19.2, 19.2, 19.6, 19.6, 20.5)),
class = "data.frame", row.names = c(NA,-15L))


Related Topics



Leave a reply



Submit