Creating Sequence of Dates for Each Group in R

creating sequence of dates for each group in r

You could use data.table to get the sequence of Dates from 'created_at' to '2015-07-12', grouped by the 'ID' column.

 library(data.table)
# Convert df1 to a data.table by reference, then for every ID build one row
# per calendar day from its 'created_at' date through 2015-07-12.
setDT(df1)[,
  .(date = seq(from = created_at, to = as.Date("2015-07-12"), by = "day")),
  by = ID
]

If you need an option with dplyr, use do

 library(dplyr)
# For each ID, repeat the group's row once per day between its 'created_at'
# date and 2015-07-12; the day itself goes into the new 'Date' column.
df1 %>%
  group_by(ID) %>%
  do(data.frame(
    .,
    Date = seq(from = .$created_at, to = as.Date("2015-07-12"), by = "day")
  ))

If you have duplicate IDs, then we may need to group by row_number()

# Group by row number instead of ID so that repeated IDs are expanded
# independently, one daily sequence per input row.
df1 %>%
  group_by(rn = row_number()) %>%
  do(data.frame(
    ID = .$ID,
    Date = seq(from = .$created_at, to = as.Date("2015-07-12"), by = "day"),
    stringsAsFactors = FALSE
  ))

Update

Based on @Frank's comment, the new idiom for tidyverse is

library(tidyverse)
# Store each ID's daily sequence (created_at .. 2015-07-12) in a list-column
# `d`, drop `created_at`, then expand `d` into one row per day.
df1 %>%
  group_by(ID) %>%
  mutate(
    d = list(seq(created_at, as.Date("2015-07-12"), by = "1 day")),
    created_at = NULL
  ) %>%
  # Name the list-column explicitly: calling unnest() without columns is
  # deprecated in tidyr >= 1.0 and emits a warning.
  unnest(c(d))

In the case of data.table

# One daily sequence per input row: grouping by the row index lets repeated
# IDs be expanded independently.
setDT(df1)[,
  .(date = seq(from = created_at, to = as.Date("2015-07-12"), by = "day")),
  by = 1:nrow(df1)
]

data

# Example input: six IDs, each with a creation date given as days since
# 1970-01-01.
df1 <- data.frame(
  ID = sprintf("MUM-%04d", 1:6),
  created_at = as.Date(
    c(16176, 16084, 16177, 16172, 16178, 16177),
    origin = "1970-01-01"
  ),
  stringsAsFactors = FALSE
)

Creating sequence of dates in R by group, dependent on another variable

If there is more than one row, seq needs to be applied row by row. We can use map2 for that. Also, because the 'DATE' columns are stored as character strings, they have to be parsed first: either as.Date with a format argument, i.e. as.Date(ADATE, "%m/%d/%y"), or lubridate's mdy (assuming it is month/day/year format)

library(dplyr)
library(tidyr) # provides unnest()
library(purrr)
library(lubridate)

# Parse every column whose name ends in "DATE" as month/day/year, build one
# daily sequence per row from ADATE to DDATE, then expand the list-column
# into one row per day.
chf_jan15 %>%
  mutate(across(ends_with("DATE"), mdy)) %>%
  mutate(random_date = map2(ADATE, DDATE, seq, by = "day")) %>%
  unnest(c(random_date))
# A tibble: 4 x 4
# ID ADATE DDATE random_date
# <int> <date> <date> <date>
#1 1 2010-02-04 2010-02-07 2010-02-04
#2 1 2010-02-04 2010-02-07 2010-02-05
#3 1 2010-02-04 2010-02-07 2010-02-06
#4 1 2010-02-04 2010-02-07 2010-02-07

If there is only a single row, then after converting the columns to Date class, complete should work

library(tidyr)
# Single-row case: parse the "DATE" columns, seed NEW_DATE with ADATE, let
# complete() add one row per day up to DDATE, then fill the other columns
# downwards from the original row.
chf_jan15 %>%
  mutate(across(ends_with("DATE"), ~ as.Date(.x, format = "%m/%d/%y"))) %>%
  mutate(NEW_DATE = ADATE) %>%
  complete(NEW_DATE = seq(ADATE, DDATE, by = "day")) %>%
  fill(c(ID, ADATE, DDATE))
# A tibble: 4 x 4
# NEW_DATE ID ADATE DDATE
# <date> <int> <date> <date>
#1 2010-02-04 1 2010-02-04 2010-02-07
#2 2010-02-05 1 2010-02-04 2010-02-07
#3 2010-02-06 1 2010-02-04 2010-02-07
#4 2010-02-07 1 2010-02-04 2010-02-07

If there is a single row for each 'ID', then we can group_split and use complete

# One row per ID: split the data by ID, run the complete()/fill() expansion
# on every piece separately, and row-bind the pieces again.
chf_jan15 %>%
  mutate(across(ends_with("DATE"), ~ as.Date(.x, format = "%m/%d/%y"))) %>%
  mutate(NEW_DATE = ADATE) %>%
  group_split(ID) %>%
  map_dfr(~ .x %>%
    complete(NEW_DATE = seq(ADATE, DDATE, by = "day")) %>%
    fill(c(ID, ADATE, DDATE)))

data

# Example input: a single admission with start/end dates stored as
# month/day/year strings.
chf_jan15 <- data.frame(
  ID = 1L,
  ADATE = "02/04/10",
  DDATE = "02/07/10",
  stringsAsFactors = FALSE
)

Sequence a group of dates in R

You can try this

# Ten consecutive days; each day is repeated 26 times in a row.
rep(
  seq(from = as.Date("2020-01-01"), to = as.Date("2020-01-10"), by = "day"),
  each = 26
)

This will return a vector of dates from 2020-01-01 to 2020-01-10 in which each date is repeated 26 times

How to generate sequence of dates by group

We group by sequence of rows as there are duplicate elements for the "ID", "Tag" group.

# Per input row: three monthly dates starting at Begin, with that row's
# x, y and z carried along.
dt[,
  .(Date = seq(from = Begin, by = "1 month", length.out = 3), x, y, z),
  by = 1:nrow(dt)
]

Or as @David Arenburg mentioned, we replicate the rows by "N" and then group by "ID", "Tag" by selecting only the first observation of "Begin"

 # Repeat every row three times, then overwrite Begin within each (ID, Tag)
 # group with three monthly dates starting at the group's first Begin.
 dt[rep(seq_len(.N), each = 3)][,
  Begin := seq(from = Begin[1L], by = "1 month", length.out = 3),
  by = .(ID, Tag)
][]

group_by and create a sequence of monthly dates

If we need to create a sequence of dates for each 'start_date' and its corresponding 'end_date', it can be done with map2 and here it doesn't need any grouping as it gets the sequence from each corresponding 'start_date/end_date'

library(dplyr) # provides mutate()
library(purrr)

# Row-wise: pair each start_date with its end_date and store the daily
# sequence between them in the list-column Seq. No grouping is required.
df %>%
  mutate(Seq = map2(start_date, end_date, seq, by = "1 day"))

Update

Based on the OP's comments

# Within each cusip, number the rows and keep everything from the first row
# whose date lies inside [start_date, end_date] onwards; the running maximum
# stays at 1 once such a row has been seen.
df %>%
  group_by(cusip) %>%
  mutate(rn = row_number()) %>%
  filter(cummax(start_date <= date & end_date >= date) > 0)
# A tibble: 102 x 5
# Groups: cusip [1]
# cusip date start_date end_date rn
# <chr> <date> <date> <date> <int>
# 1 00036020 2011-07-29 2011-07-29 2012-06-30 7
# 2 00036020 2011-08-31 2011-07-29 2012-06-30 8
# 3 00036020 2011-09-30 2011-07-29 2012-06-30 9
# 4 00036020 2011-10-31 2011-07-29 2012-06-30 10
# 5 00036020 2011-11-30 2011-07-29 2012-06-30 11
# 6 00036020 2011-12-30 2011-07-29 2012-06-30 12
# 7 00036020 2012-01-31 2012-07-31 2013-06-30 13
# 8 00036020 2012-02-29 2012-07-31 2013-06-30 14
# 9 00036020 2012-03-30 2012-07-31 2013-06-30 15
#10 00036020 2012-04-30 2012-07-31 2013-06-30 16
# … with 92 more rows

-checking the first 24 rows

Creating Episodes for Groups based on Date Sequence

Here might be one approach. I added the newer output with group_by as Episode2 next to Episode created from the initial example. Hope this is helpful.

library(tidyverse)

# Number the runs of identical EpisodeTimeCriterian values within each ID:
# every run becomes one episode, counted 1, 2, 3, ... per ID.
df %>%
  group_by(ID) %>%
  mutate(
    Episode2 = {
      runs <- rle(EpisodeTimeCriterian)
      # replace each run's value by its position among the runs
      runs$values <- as.numeric(seq_along(runs$values))
      inverse.rle(runs)
    }
  ) %>%
  print(n = 66)

Output

# A tibble: 66 x 6
# Groups: ID [2]
ID Date days_until_next EpisodeTimeCriterian Episode Episode2
<chr> <date> <dbl> <lgl> <int> <dbl>
1 123 2013-10-08 7 TRUE 1 1
2 123 2013-10-15 7 TRUE 1 1
3 123 2013-10-22 7 TRUE 1 1
4 123 2013-10-29 7 TRUE 1 1
5 123 2013-11-05 7 TRUE 1 1
6 123 2013-11-12 7 TRUE 1 1
7 123 2013-11-19 7 TRUE 1 1
8 123 2013-11-26 7 TRUE 1 1
9 123 2013-12-03 14 TRUE 1 1
10 123 2013-12-17 10 TRUE 1 1
11 123 2013-12-27 11 TRUE 1 1
12 123 2014-01-07 7 TRUE 1 1
13 123 2014-01-14 7 TRUE 1 1
14 123 2014-01-21 2 TRUE 1 1
15 123 2014-01-23 1 TRUE 1 1
16 123 2014-01-24 4 TRUE 1 1
17 123 2014-01-28 7 TRUE 1 1
18 123 2014-02-04 1 TRUE 1 1
19 123 2014-02-05 27 TRUE 1 1
20 123 2014-03-04 1997 FALSE 2 2
21 123 2019-08-22 7 TRUE 3 3
22 123 2019-08-29 2 TRUE 3 3
23 123 2019-08-31 5 TRUE 3 3
24 123 2019-09-05 7 TRUE 3 3
25 123 2019-09-12 13 TRUE 3 3
26 123 2019-09-25 12 TRUE 3 3
27 123 2019-10-07 14 TRUE 3 3
28 123 2019-10-21 7 TRUE 3 3
29 123 2019-10-28 7 TRUE 3 3
30 123 2019-11-04 7 TRUE 3 3
31 123 2019-11-11 7 TRUE 3 3
32 123 2019-11-18 7 TRUE 3 3
33 123 2019-11-25 7 TRUE 3 3
34 123 2019-12-02 7 TRUE 3 3
35 123 2019-12-09 7 TRUE 3 3
36 123 2019-12-16 7 TRUE 3 3
37 123 2019-12-23 1 TRUE 3 3
38 123 2019-12-24 13 TRUE 3 3
39 123 2020-01-06 7 TRUE 3 3
40 123 2020-01-13 7 TRUE 3 3
41 123 2020-01-20 7 TRUE 3 3
42 123 2020-01-27 -2302 TRUE 3 3
43 456 2013-10-07 7 TRUE 3 1
44 456 2013-10-14 119 FALSE 4 2
45 456 2014-02-10 220 FALSE 4 2
46 456 2014-09-18 4 TRUE 5 3
47 456 2014-09-22 3 TRUE 5 3
48 456 2014-09-25 7 TRUE 5 3
49 456 2014-10-02 6 TRUE 5 3
50 456 2014-10-08 8 TRUE 5 3
51 456 2014-10-16 97 FALSE 6 4
52 456 2015-01-21 15 TRUE 7 5
53 456 2015-02-05 21 TRUE 7 5
54 456 2015-02-26 41 FALSE 8 6
55 456 2015-04-08 57 FALSE 8 6
56 456 2015-06-04 12 TRUE 9 7
57 456 2015-06-16 2 TRUE 9 7
58 456 2015-06-18 49 FALSE 10 8
59 456 2015-08-06 14 TRUE 11 9
60 456 2015-08-20 42 FALSE 12 10
61 456 2015-10-01 12 TRUE 13 11
62 456 2015-10-13 16 TRUE 13 11
63 456 2015-10-29 12 TRUE 13 11
64 456 2015-11-10 65 FALSE 14 12
65 456 2016-01-14 1 TRUE 15 13
66 456 2016-01-15 -830 TRUE 15 13

Edit (3/2/20):

If the rule is that a date difference of >= 30 days begins a new episode, I think this could be easier than the previous approach. See if this works for you:

library(tidyverse)

# Per ID: gap to the previous date (0 for the first row), then start a new
# episode whenever that gap is at least 30.
df %>%
  group_by(ID) %>%
  mutate(difftime = Date - lag(Date, default = first(Date))) %>%
  mutate(expected2 = 1 + cumsum(difftime >= 30)) %>%
  print(n = 24)

Output

# A tibble: 24 x 7
# Groups: ID [1]
ID Date days_until_next EpisodeTimeCrit~ expected difftime expected2
<chr> <date> <dbl> <lgl> <dbl> <time> <dbl>
1 456 2013-10-07 7 TRUE 1 0 days 1
2 456 2013-10-14 119 FALSE 1 7 days 1
3 456 2014-02-10 220 FALSE 2 119 days 2
4 456 2014-09-18 4 TRUE 3 220 days 3
5 456 2014-09-22 3 TRUE 3 4 days 3
6 456 2014-09-25 7 TRUE 3 3 days 3
7 456 2014-10-02 6 TRUE 3 7 days 3
8 456 2014-10-08 8 TRUE 3 6 days 3
9 456 2014-10-16 97 FALSE 3 8 days 3
10 456 2015-01-21 15 TRUE 4 97 days 4
11 456 2015-02-05 21 TRUE 4 15 days 4
12 456 2015-02-26 41 FALSE 4 21 days 4
13 456 2015-04-08 57 FALSE 5 41 days 5
14 456 2015-06-04 12 TRUE 6 57 days 6
15 456 2015-06-16 2 TRUE 6 12 days 6
16 456 2015-06-18 49 FALSE 6 2 days 6
17 456 2015-08-06 14 TRUE 7 49 days 7
18 456 2015-08-20 42 FALSE 7 14 days 7
19 456 2015-10-01 12 TRUE 8 42 days 8
20 456 2015-10-13 16 TRUE 8 12 days 8
21 456 2015-10-29 12 TRUE 8 16 days 8
22 456 2015-11-10 65 FALSE 8 12 days 8
23 456 2016-01-14 1 TRUE 9 65 days 9
24 456 2016-01-15 -830 TRUE 9 1 days 9

Generate sequence of dates for given frequency as per days of occurence

Working on larger sample, as discussed earlier in comments. Strategy followed -

  • Your Day column always starts from Mon, which generally does not match the weekday of start_date, so the rows have to be matched to weekdays.
  • So I converted the Day field to an ordered factor so that it can be manipulated as integers.
  • Arranged the data frame so that every group starts from the weekday of its start_date. Used modulo division %% for this.
  • After arranging, the task was rather easier. I created seven consecutive dates, one per weekday, for each group and each start_date.
  • Filtered out the rows with Y/N (y_n) equal to 0.
  • Now only the top row of each group is required, so slice_head() is used.
# Larger sample: five (Group, start_date) combinations, each with one row
# per weekday (Mon..Sun) and a 0/1 availability flag y_n.
df <- data.frame(
  stringsAsFactors = FALSE,
  Group = rep(c("foo", "bar"), times = c(21, 14)),
  start_date = rep(
    c("02-06-2021", "04-06-2021", "06-06-2021", "02-06-2021", "05-06-2021"),
    each = 7
  ),
  Day = rep(c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), times = 5),
  y_n = c(
    rep(c(0L, 1L, 0L, 1L, 1L, 1L, 0L), times = 3), # the three "foo" weeks
    rep(c(1L, 0L, 0L, 1L, 1L, 0L, 0L), times = 2)  # the two "bar" weeks
  )
)

library(lubridate)
library(tidyverse)

# For every (Group, start_date): find the first date on or after start_date
# whose weekday is flagged available (y_n != 0).
df %>%
  group_by(Group, start_date) %>%
  mutate(
    # Fixed calendar-order levels: the integer codes stay Mon = 1 .. Sun = 7
    # even if a group's rows are not stored Monday-first.
    Day = factor(
      Day,
      levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
      ordered = TRUE
    )
  ) %>%
  # Rotate each group so that its first row is the weekday of start_date.
  arrange(
    Group,
    (as.numeric(Day) + 7 - wday(dmy(start_date), week_start = 1)) %% 7,
    .by_group = TRUE
  ) %>%
  # Assumes exactly seven rows per group: row k gets start_date + (k - 1).
  mutate(next_available_date = dmy(start_date) + 0:6) %>%
  filter(y_n != 0) %>%
  slice_head(n = 1)
#> # A tibble: 5 x 5
#> # Groups: Group, start_date [5]
#> Group start_date Day y_n next_available_date
#> <chr> <chr> <ord> <int> <date>
#> 1 bar 02-06-2021 Thu 1 2021-06-03
#> 2 bar 05-06-2021 Mon 1 2021-06-07
#> 3 foo 02-06-2021 Thu 1 2021-06-03
#> 4 foo 04-06-2021 Fri 1 2021-06-04
#> 5 foo 06-06-2021 Tue 1 2021-06-08

On the data provided

# Data from the question: two groups sharing one start_date, each with one
# row per weekday (Mon..Sun) and a 0/1 availability flag y_n.
df <- data.frame(
  stringsAsFactors = FALSE,
  Group = rep(c("foo", "bar"), each = 7),
  start_date = rep("02-06-2021", times = 14),
  Day = rep(c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), times = 2),
  y_n = c(
    0L, 1L, 0L, 1L, 1L, 1L, 0L, # foo
    1L, 0L, 0L, 1L, 1L, 0L, 0L  # bar
  )
)

library(lubridate)
library(tidyverse)

# For every (Group, start_date): find the first date on or after start_date
# whose weekday is flagged available (y_n != 0).
df %>%
  group_by(Group, start_date) %>%
  mutate(
    # Fixed calendar-order levels: the integer codes stay Mon = 1 .. Sun = 7
    # even if a group's rows are not stored Monday-first.
    Day = factor(
      Day,
      levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
      ordered = TRUE
    )
  ) %>%
  # Rotate each group so that its first row is the weekday of start_date.
  arrange(
    Group,
    (as.numeric(Day) + 7 - wday(dmy(start_date), week_start = 1)) %% 7,
    .by_group = TRUE
  ) %>%
  # Assumes exactly seven rows per group: row k gets start_date + (k - 1).
  mutate(next_available_date = dmy(start_date) + 0:6) %>%
  filter(y_n != 0) %>%
  slice_head(n = 1)

#> # A tibble: 2 x 5
#> # Groups: Group, start_date [2]
#> Group start_date Day y_n next_available_date
#> <chr> <chr> <ord> <int> <date>
#> 1 bar 02-06-2021 Thu 1 2021-06-03
#> 2 foo 02-06-2021 Thu 1 2021-06-03

Created on 2021-06-02 by the reprex package (v2.0.0)

Group consecutive dates [duplicate question, but can't make it work with my data]

Remove time from date time

It's hard to tell exactly what the problem is without seeing your data (or similar example data), but my guess is that the date time format (the 00:00:00 part) is messing up as.Date

One solution would be to extract just the date part and then try again with just the date part:

# libraries needed below
library(stringr)
library(magrittr) # for pipes

# an example date-time string
date_time <- "2018-01-03 00:00:00"

# pattern: an optional space, then 4 digits, a dash, 2 digits, a dash and
# 2 digits, then another optional space
date_pattern <- " ?([0-9]{4}-[0-9]{2}-[0-9]{2}) ?"

# pull out the text matching the pattern and trim surrounding white space
date_new <- date_time %>%
  str_extract(date_pattern) %>%
  str_squish()

# the date part without the time, still as text
date_new

# finally convert the text to Date class
date_new <- as.Date(date_new)

See if converting your date column to just dates and then rerunning your grouping works.

If you have dates in different formats and need to adapt the regular expression, here's something about regular expressions: https://stackoverflow.com/a/49286794/16502170

Group dates

Let's start with an example data frame that contains a date column

# here's a bunch of example dates:
library(lubridate)
# 60 consecutive days starting on 2018-03-01
dates2 <- seq(as.Date("2018-03-01"), by = "day", length.out = 60)

# example data frame: the three animals and the numbers 1..3 cycle over
# the 60 days
exampl_df <- data.frame(
  animals = rep(c("cats", "dogs", "rabbits"), times = 20),
  dates = dates2,
  numbers = rep(1:3, times = 20)
)

Here's what it looks like:

head(exampl_df)
animals dates numbers
1 cats 2018-03-01 1
2 dogs 2018-03-02 2
3 rabbits 2018-03-03 3
4 cats 2018-03-04 1
5 dogs 2018-03-05 2
6 rabbits 2018-03-06 3

Then let's make a sequence of every day between the minimum and maximum date in the sequence. This step is important because there may be missing dates in our data that we still want counting towards the separation between days.

# daily calendar running from the earliest to the latest date in dates2
date_sequence <- seq(from = min(dates2), to = max(dates2), by = "day")

Then let's make a sequence of numbers each repeated seven times. If you wanted to group every three days, you could change each to 3. Then the length.out= length(date_sequence) tells R to make this vector have as many entries as the min to max date sequence has:

# group index per calendar day: 1 for the first seven days, 2 for the next
# seven, and so on, cut off at the length of date_sequence
groups <- rep(
  seq_along(date_sequence),
  each = 7,
  length.out = length(date_sequence)
)

Then let's attach the groups to the date_sequence to make a grouping index

# lookup table: calendar day (a) and its group index (b)
date_grouping_index <- data.frame(
  a = date_sequence,
  b = groups
)

then you can do a join to attach the groups to the original dataframe

library(dplyr)
# Attach the group index `b` to every row by matching `dates` against the
# lookup table's `a` column. The result must be named `example_df2` (no
# space): `example_df 2 <- ...` does not parse.
example_df2 <- exampl_df %>%
  inner_join(date_grouping_index, by = c("dates" = "a"))

This is what we get:

head(example_df2,n=10)
animals dates numbers b
1 cats 2018-03-01 1 1
2 dogs 2018-03-02 2 1
3 rabbits 2018-03-03 3 1
4 cats 2018-03-04 1 1
5 dogs 2018-03-05 2 1
6 rabbits 2018-03-06 3 1
7 cats 2018-03-07 1 1
8 dogs 2018-03-08 2 2
9 rabbits 2018-03-09 3 2
10 cats 2018-03-10 1 2

Then you should be able to group_by() or aggregate() your data using column b

Using the data provided in the question

# original data: nine dates (given as days since 1970-01-01) in two groups
df <- structure(
  list(
    Date = as.Date(
      c(17534:17537, 18279:18282, 17932),
      origin = "1970-01-01"
    ),
    group = rep(c(1, 2), times = c(4L, 5L))
  ),
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame")
)

# extra step: bump the group counter whenever the gap to the previous row's
# date is more than one day
df$group2 <- 1 + c(0, cumsum(as.numeric(diff(df$Date) > 1)))

Method described above

# daily calendar from the earliest to the latest Date in df
date_sequence <- seq(from = min(df$Date), to = max(df$Date), by = "day")
# seven-day group index for every calendar day
groups <- rep(
  seq_along(date_sequence),
  each = 7,
  length.out = length(date_sequence)
)
date_grouping_index <- data.frame(a = date_sequence, groups = groups)

# attach the group index to the data by matching Date against column a
example_df2 <- inner_join(df, date_grouping_index, by = c("Date" = "a"))

Looks like it worked?

example_df2
# A tibble: 9 x 4
Date group group2 groups
<date> <dbl> <dbl> <int>
1 2018-01-03 1 1 1
2 2018-01-04 1 1 1
3 2018-01-05 1 1 1
4 2018-01-06 1 1 1
5 2020-01-18 2 2 107
6 2020-01-19 2 2 107
7 2020-01-20 2 2 107
8 2020-01-21 2 2 107
9 2019-02-05 2 2 57

Here's something you could do to make group names with the date and year in them:

# Label such as "sampling number 107 (January-2020)": the group index
# followed by the month name and year of the row's Date.
example_df2$group_name <- paste0(
  "sampling number ", example_df2$groups,
  " (", month.name[month(example_df2$Date)], "-", year(example_df2$Date), ")"
)

Create sequence of dates in R with pipe %>%

We could block with {}

library(stringr)
library(dplyr)
# Split tmp on spaces, convert the pieces to Date, and build the daily
# sequence from the first piece to the second. The braces stop the pipe from
# inserting the vector as seq()'s first argument.
tmp %>%
  str_split(" ", simplify = TRUE) %>%
  as.vector() %>%
  as.Date() %>%
  {
    seq(from = .[1], to = .[2], by = "1 day")
  }


Related Topics



Leave a reply



Submit