R: Calculate the Number of Occurrences of a Specific Event in a Specified Time Future

R: calculate the number of occurrences of a specific event in a specified time future

Adding another approach based on findInterval:

cs = cumsum(df$x) # cumulative number of occurences
data.frame(df,
plus14 = cs[findInterval(df$date + 14, df$date, left.open = TRUE)] - cs,
plus30 = cs[findInterval(df$date + 30, df$date, left.open = TRUE)] - cs)
# x date plus14 plus30
#1 1 2016-01-01 1 3
#2 0 2016-01-05 1 4
#3 1 2016-01-07 2 3
#4 0 2016-01-12 2 3
#5 0 2016-01-16 2 3
#6 1 2016-01-20 2 2
#7 1 2016-01-20 1 1
#8 0 2016-01-25 1 1
#9 0 2016-01-26 1 1
#10 1 2016-01-31 0 0

Calculate the number of occurrences of a specific event in the past AND future with groupings

Here's one way:

library(data.table)
orderDT = with(df, data.table(id = user_id, completed = !cancelled_order, d = order_date))

vec = list(minus = 14L, plus = 14L)
orderDT[, c("dplus", "dminus") := .(
orderDT[!(completed)][orderDT[, .(id, d_plus = d + vec$plus, d_tom = d + 1L)], on=.(id, d <= d_plus, d >= d_tom), .N, by=.EACHI]$N
,
orderDT[!(completed)][orderDT[, .(id, d_minus = d - vec$minus, d_yest = d - 1L)], on=.(id, d >= d_minus, d <= d_yest), .N, by=.EACHI]$N
)]

id completed d dplus dminus
1: 1 TRUE 2015-01-28 2 0
2: 1 FALSE 2015-01-31 1 0
3: 1 FALSE 2015-02-08 0 1
4: 1 TRUE 2015-02-23 0 0
5: 1 TRUE 2015-03-23 0 0
6: 2 TRUE 2015-01-25 2 0
7: 2 FALSE 2015-01-28 1 0
8: 2 FALSE 2015-02-06 0 1
9: 2 TRUE 2015-02-21 0 0
10: 2 TRUE 2015-03-26 0 0

(I found OP's column names cumbersome and so shortened them.)


How it works

Each of the columns can be run on its own, like

orderDT[!(completed)][orderDT[, .(id, d_plus = d + vec$plus, d_tom = d + 1L)], on=.(id, d <= d_plus, d >= d_tom), .N, by=.EACHI]$N

And this can be broken down into steps by simplifying:

orderDT[!(completed)][
orderDT[, .(id, d_plus = d + vec$plus, d_tom = d + 1L)],
on=.(id, d <= d_plus, d >= d_tom),
.N,
by=.EACHI]$N
# original version

orderDT[!(completed)][
orderDT[, .(id, d_plus = d + vec$plus, d_tom = d + 1L)],
on=.(id, d <= d_plus, d >= d_tom),
.N,
by=.EACHI]
# don't extract the N column of counts

orderDT[!(completed)][
orderDT[, .(id, d_plus = d + vec$plus, d_tom = d + 1L)],
on=.(id, d <= d_plus, d >= d_tom)]
# don't create the N column of counts

orderDT[!(completed)]
# don't do the join

orderDT[, .(id, d_plus = d + vec$plus, d_tom = d + 1L)]
# see the second table used in the join

This uses a "non-equi" join, taking inequalities to define the date ranges. For more details, see the documentation page found by typing ?data.table.

R: Calculating the number of occurrences within a specific time period in the past for each unique individual in a dataset in R

I'm not sure this meets your needs, but this is based on @Axeman's tidyverse solution you linked to. After group_by your CUSTOMER_ID you can sum all rows with trip_scheduled is 1 and dates fall between current date and 60 days prior. I would expect you could do something similar for same_day_cancel as well.

library(tidyverse)

test_set2 %>%
group_by(CUSTOMER_ID) %>%
mutate(schedule_60 = unlist(map(trip_date, ~sum(trip_scheduled == 1 & between(trip_date, . - 60, .))))) %>%
print(n=30)

# A tibble: 30 x 6
# Groups: CUSTOMER_ID [3]
tripID CUSTOMER_ID trip_date trip_scheduled same_day_cancel schedule_60
<chr> <int> <date> <dbl> <dbl> <int>
1 20180112-100037-674-101 100037 2018-01-12 1 1 3
2 20180112-100037-674-201 100037 2018-01-12 1 1 3
3 20180112-100037-674-301 100037 2018-01-12 1 1 3
4 20180113-100037-676-101 100037 2018-01-13 1 0 5
5 20180113-100037-676-201 100037 2018-01-13 1 0 5
6 20180115-100037-675-101 100037 2018-01-15 1 1 7
7 20180115-100037-675-201 100037 2018-01-15 1 1 7
8 20180116-100037-677-101 100037 2018-01-16 1 0 9
9 20180116-100037-677-201 100037 2018-01-16 1 0 9
10 20180131-100037-678-101 100037 2018-01-31 1 0 10
11 20180101-100146-707-101 100146 2018-01-01 1 1 2
12 20180101-100146-707-201 100146 2018-01-01 1 1 2
13 20180102-100146-708-101 100146 2018-01-02 1 1 4
14 20180102-100146-708-201 100146 2018-01-02 1 1 4
15 20180103-100146-709-101 100146 2018-01-03 1 1 6
16 20180103-100146-709-201 100146 2018-01-03 1 1 6
17 20180104-100146-710-101 100146 2018-01-04 1 1 8
18 20180104-100146-710-201 100146 2018-01-04 1 1 8
19 20180105-100146-711-101 100146 2018-01-05 1 1 10
20 20180105-100146-711-201 100146 2018-01-05 1 1 10
21 20180403-100532-223-101 100532 2018-04-03 1 0 2
22 20180403-100532-223-201 100532 2018-04-03 1 0 2
23 20180620-100532-224-101 100532 2018-06-20 1 0 2
24 20180620-100532-224-201 100532 2018-06-20 1 0 2
25 20180704-100532-225-101 100532 2018-07-04 1 0 4
26 20180704-100532-225-201 100532 2018-07-04 1 0 4
27 20180926-100532-228-101 100532 2018-09-26 1 0 2
28 20180926-100532-228-201 100532 2018-09-26 1 0 2
29 20180927-100532-226-101 100532 2018-09-27 1 0 4
30 20180927-100532-226-201 100532 2018-09-27 1 0 4

R: calculate number of distinct categories in the specified time frame

In the tidyverse, you can use map_int to iterate over a set of values and simplify to an integer à la sapply or vapply. Count distinct occurrences with n_distinct (like length(unique(...))) of an object subset by comparisons or the helper between, with a minimum set by the appropriate amount subtracted from that day, and you're set.

library(tidyverse)

df %>% group_by(user_id) %>%
mutate(distinct_7 = map_int(date, ~n_distinct(category[between(date, .x - 7, .x)])),
distinct_14 = map_int(date, ~n_distinct(category[between(date, .x - 14, .x)])))

## Source: local data frame [14 x 5]
## Groups: user_id [2]
##
## user_id date category distinct_7 distinct_14
## <int> <date> <fctr> <int> <int>
## 1 27 2016-01-01 apple 1 1
## 2 27 2016-01-03 apple 1 1
## 3 27 2016-01-05 pear 2 2
## 4 27 2016-01-07 plum 3 3
## 5 27 2016-01-10 apple 3 3
## 6 27 2016-01-14 pear 3 3
## 7 27 2016-01-16 plum 3 3
## 8 11 2016-01-01 apple 1 1
## 9 11 2016-01-03 pear 2 2
## 10 11 2016-01-05 pear 2 2
## 11 11 2016-01-07 pear 2 2
## 12 11 2016-01-10 apple 2 2
## 13 11 2016-01-14 apple 2 2
## 14 11 2016-01-16 apple 1 2

Count occurrences of a factor variable between two posixCT dates

It's generally a sub-optimal idea to use loops in r as any moderate dataset will get really bogged down. Reason for this is specified in other Stack Overflow answers if you are interested. Best practice in r is if you are thinking of using a for loop to go through all records in a data frame, use the apply family of functions instead.

I wrote some quick code below which should do the trick fairly quickly and builds on the code you supplied.

df$row <- rownames(df)

#Column 1: count occurrences +/- 2hrs within same city
df$col1 <- mapply(function(time, city, row) sum(df[df$row != row,"two.Before"] <= time & df[df$row != row,"two.After"] >= time & df[df$row != row,"loc"] == city),
df$time, df$loc, df$row)

#Column2: count occurrences +/- 2hrs within same city and on a bus
df$col2 <- mapply(function(time, city, bus, row) sum(df[df$row != row,"two.Before"] <= time & df[df$row != row,"two.After"] >= time & df[df$row != row,"loc"] == city & df[df$row != row,"type"] == "Bus"),
df$time, df$loc, df$type, df$row)

#Remove row index
df <- subset(df, select = -c(row))

Let me know if you have any questions. It seems to work as intended for a few data points I tested. This assumes you don't want to count that row in your two counts otherwise all col1 counts would be 1.

Edit

Code to only review the two hours prior to the incident is below (again builds on the code provided).

df$row <- as.numeric(rownames(df))

#Column 1: count occurrences +/- 2hrs within same city
df$col1 <- mapply(function(time, city, row) sum(df[df$row != row,"time"] <= time & df[df$row != row,"two.After"] >= time & df[df$row != row,"loc"] == city),
df$time, df$loc, df$row)

#Column2: count occurrences +/- 2hrs within same city and on a bus
df$col2 <- mapply(function(time, city, bus, row) sum(df[df$row != row,"time"] <= time & df[df$row != row,"two.After"] >= time & df[df$row != row,"loc"] == city & df[df$row != row,"type"] == "Bus"),
df$time, df$loc, df$type, df$row)

How do I compute the number of occurrences of a particular value in a row in R

There're two ways to use the apply family functions. Either you do

apply(mat, 1, sum, na.rm=TRUE)

if you want to apply the function sum()to each row, passing additional parameters like na.rm=TRUE. Or you can do

apply(mat, 1, foo)

where foo() is a function of your own, defined externally, e.g.

foo <- function(x) sum(x==0, na.rm=TRUE)

Note that NA handling might also be dealt with a parameter of the function itself, with default value set to TRUE, in the above definition, as in

foo2 <- function(x, no.na=TRUE) sum(x==0, na.rm=no.na)

and you can call it as apply(mat, 1, foo2, no.na=F) although it doesn't really make sense with the sum() function (unless you want to check if there're NA values, but in this case it's better to use is.na() :-).

Finally, you can define your function directly inline as

apply(mat, 1, function(x) sum(x==0, na.rm=TRUE))

In your case, it gives me

> apply(mat, 1, function(x) sum(x==0, na.rm=TRUE))
1 2 3 4 5 6
22 4 9 8 7 2

which is equivalent to apply(ex, 1, foo).

Count number of occurrence within time frame in R

Based on your comments, for any number the start of the period is the earliest call from that number.
Below is the commented code:

library(lubridate)                                                              
library(dplyr)

calls <- structure(list(Date_Time = structure(1:6, .Label = c("12/1/2015 12:00:01 AM",
"12/1/2015 12:00:29 AM", "12/1/2015 12:00:41 AM", "12/1/2015 12:00:43 AM",
"12/1/2015 9:00:02 AM", "12/2/2015 12:00:02 AM"), class = "factor"),
Number = structure(c(4L, 3L, 2L, 1L, 4L, 3L), .Label = c("12222222",
"22333333", "32211111", "92222222"), class = "factor")), .Names = c("Date_Time",
"Number"), row.names = c(NA, -6L), class = "data.frame")

count_freq <- function(timestamps){
#Given all the ocurrences of calls from a number find the
#earliest one and count how many occur within 24 hours
dtime <- sort(mdy_hms(timestamps))
start_time <- dtime[1]
end_time <- start_time + hours(24)
sum(dtime >= start_time & dtime <= end_time)
}

out <- group_by(calls, Number) %>%
summarise(freq = count_freq(Date_Time))

Conting events between sequential stages in a process using R

Here is a data.table apprioach, resulting in a list of alerts after a-b-c..

library(data.table)
# Make tables data.table format
setDT(TableA)
setDT(TableB)
# set TiMESTAP to numeric
TableA[, TIMESTAMP := as.numeric(TIMESTAMP)]
TableB[, TIMESTAMP := as.numeric(TIMESTAMP)]
# Create data.table with Stage intervals by test subject
DT.interval <- TableA[, .(start = min(TIMESTAMP)), by = .(TEST_SUBJECT, STAGE)]
# Perform rolling join
TableB[, Stage := DT.interval[TableB,
STAGE,
on = .(TEST_SUBJECT, start = TIMESTAMP),
roll = Inf]][]
# Split alerts by stage
split(TableB[,3:4], by = "Stage")
# $A
# ALERT_CODE Stage
# 1: AYUJ-151571406 A
# 2: AYUJ-487008829 A
# 3: AYUJ-1300211351 A
# 4: AYUJ-4454800551 A
# 5: AYUJ-1079921935 A
# 6: AYUJ-4778326278 A
#
# $B
# ALERT_CODE Stage
# 1: AYUJ-211990388 B
# 2: AYUJ-4177221842 B
# 3: AYUJ-3014305494 B
# 4: AYUJ-3348911727 B
# 5: AYUJ-2381219626 B
#
# $C
# ALERT_CODE Stage
# 1: AYUJ-3915716168 C


Related Topics



Leave a reply



Submit