Dplyr: Grouping and Summarizing/Mutating Data with Rolling Time Windows

Using dplyr to count and mark based on type and rolling date in R

Here is a solution with dplyr:

Update based on OP edit

library(dplyr)
library(lubridate)
a <- data.frame("TYPE" = c("A", "A", "B", "B",
"C", "C", "C", "C",
"D", "D", "D", "D",
"D", "D", "D", "D"),
"DATE" = c("4/20/2018 11:47",
"4/25/2018 7:21",
"4/15/2018 6:11",
"4/19/2018 4:22",
"4/15/2018 17:46",
"4/16/2018 11:59",
"4/20/2018 7:50",
"4/26/2018 2:55",
"4/27/2018 11:46",
"4/27/2018 13:03",
"4/20/2018 7:31",
"4/22/2018 9:45",
"6/01/2018 9:07",
"6/03/2018 12:34",
"6/07/2018 1:57",
"6/10/2018 2:22"),
"CLASS" = c(1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4))

# a function to flag rows that are 4th or more within window w
count_window <- function(df, date, w, type){
min_date <- date - w
df2 <- df %>% filter(TYPE == type, YMD >= min_date, YMD <= date)
out <- n_distinct(df2$YMD)
res <- ifelse(out >= 4, 1, 0)
return(res)
}

v_count_window <- Vectorize(count_window, vectorize.args = c("date","type"))

res <- a %>% mutate(DATE = as.POSIXct(DATE, format = "%m/%d/%Y %H:%M")) %>%
mutate(YMD = date(DATE)) %>%
arrange(TYPE, YMD) %>%
#group_by(TYPE) %>%
mutate(min_date = YMD - 30,
count = v_count_window(., YMD, 30, TYPE)) %>%
group_by(TYPE) %>%
mutate(FLAG = case_when(
any(count == 1) & YMD >= min_date[match(1,count)] ~ 1,
TRUE ~ 0
))%>%
select(nms,FLAG)

I couldn't figure out how to use the group in a custom function so I hard coded the filtering by type into the function.

Mutate case_when date time data in R

You need to compare the data with the same class. Try the following :

library(dplyr)
library(magrittr)
library(lubridate)

data %<>%
mutate(period = case_when(
Fix_date_time >= ymd_hms('2021-01-15 19:00:00') &
Fix_date_time <= ymd_hms('2021-01-21 05:00:00') ~ "Period-1",
Fix_date_time >= ymd_hms('2021-01-21 19:00:00') &
Fix_date_time <= ymd_hms('2021-01-28 05:00:00') ~ "Period-2",
Fix_date_time >= ymd_hms('2021-01-29 19:00:00') &
Fix_date_time <= ymd_hms('2021-02-02 05:00:00') ~ "Period-3"))

R: Rolling sum on a non standard window

It's not possible to flag in advance your 3 month windows, because you want to go back 3 months from every date in your dataset and that means that your reference point (date) changes every time. Therefore you need a function that takes that into account and apply it on every row.

library(lubridate)
library(dplyr)

# sample dataset
dt = read.table(text="ID Operation date value
A 1 01/01/2017 0
A 2 01/02/2017 1
A 3 01/06/2017 1
A 4 01/09/2017 0
B 1 01/03/2017 0
B 2 01/05/2017 1
B 3 01/09/2017 1
B 4 01/10/2017 1", header=T, stringsAsFactors=F)

# function that goes 3 months back from a given date and a given ID
f = function(ID_input, date_input) {
enddate = date_input
startdate = date_input - months(3)
sum((dt %>% filter(ID == ID_input & date >= startdate & date <= enddate))$value) }

f = Vectorize(f)

# update date column
dt$date = dmy(dt$date)

# run function for every row
dt %>% mutate(sumvalue = f(ID, date))

# ID Operation date value sumvalue
# 1 A 1 2017-01-01 0 0
# 2 A 2 2017-02-01 1 1
# 3 A 3 2017-06-01 1 1
# 4 A 4 2017-09-01 0 1
# 5 B 1 2017-03-01 0 0
# 6 B 2 2017-05-01 1 1
# 7 B 3 2017-09-01 1 1
# 8 B 4 2017-10-01 1 2

Rolling window slider::slide() with grouped data

You have to first tidyr::nest the cases. Within the nested tibbles (accessed via purrr::map) you can then apply slide (same technique as with purrr::map). The important point is that you do not want to slide across cases, but only within cases.

library(dplyr)
library(tidyr)
library(purrr)
library(slider)

get_coef1 <- function(data) {
coef1 <- lm(data = data, r1 ~ r2 + r3) %>%
coef() %>%
.["r2"] %>%
unname()

return(coef1)
}

data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))

data %>%
# ungroup() %>%
group_by(case) %>% nest() %>%
mutate(rollreg = map(data, ~ .x %>% mutate(coef1 = slider::slide_dbl(., ~get_coef1(.x), .before = Inf, .complete = TRUE)))) %>%
select(-data) %>% unnest(rollreg)

I have been trying for a while to use the new dplyr::nest_by() from dplyr 1.0.0 trying to use summarise in combination with the rowwise cases but couldn't get that to work.

Using dplyr to average time series groups with individuals of different lengths

You could try:

library(ggplot2)
library(dplyr)

dat %>%
group_by(ID) %>%
mutate(maxtime = max(Time)) %>%
group_by(group) %>%
mutate(maxtime = min(maxtime)) %>%
group_by(group, Time) %>%
summarize(values = mean(values)) %>%
ggplot(aes(Time, values, colour = group)) + geom_line()

Sample Image

Is there a way to sum data grouping by date with a time period?

1) Base R Using the data shown reproducibly in the Note at the end lapply over erach row expanding the date range into a sequence of dates using seq. This gives a list with one component per input row and we rbind those together giving long. Then aggregate long by Date. No packages are used.

expand <- function(i, data) with(data[i, ], 
data.frame(Date = seq(START, END, "day"), NUMBER)
)

long <- do.call("rbind", lapply(1:nrow(DF), expand, data = DF))
result <- aggregate(NUMBER ~ Date, long, sum)

head(result)

giving:

        Date NUMBER
1 2020-03-16 12
2 2020-03-17 13
3 2020-03-18 13
4 2020-03-19 13
5 2020-03-20 13
6 2020-03-21 13

2) dplyr Expand each row in the rowwise code and then sum NUMBER over Date in the group_by code.

library(dplyr)

DF %>%
rowwise %>%
do(data.frame(Date = seq(.$START, .$END, "day"), NUMBER = .$NUMBER)) %>%
ungroup %>%
group_by(Date) %>%
summarize(NUMBER = sum(NUMBER)) %>%
ungroup

Note

Lines <- "       START        END NUMBER
1 2020-03-16 2020-05-31 5
2 2020-03-16 2020-06-30 7
3 2020-03-17 2020-08-31 1"
DF <- read.table(text = Lines)
DF[1:2] <- lapply(DF[1:2], as.Date)


Related Topics



Leave a reply



Submit