creating sequence of dates for each group in r
You could use data.table
to get the sequence
of Dates from 'created_at' to '2015-07-12', grouped by the 'ID' column.
library(data.table)
setDT(df1)[, list(date=seq(created_at, as.Date('2015-07-12'), by='1 day')) , ID]
If you need an option with dplyr
, use do
library(dplyr)
df1 %>%
group_by(ID) %>%
do( data.frame(., Date= seq(.$created_at,
as.Date('2015-07-12'), by = '1 day')))
If you have duplicate IDs, then we may need to group by row_number()
df1 %>%
group_by(rn=row_number()) %>%
do(data.frame(ID= .$ID, Date= seq(.$created_at,
as.Date('2015-07-12'), by = '1 day'), stringsAsFactors=FALSE))
Update
Based on @Frank's comment, the new idiom for tidyverse
is
library(tidyverse)
# For every ID, store the full daily sequence from created_at to 2015-07-12
# in a list-column, drop the original column, then expand to one row per date.
df1 %>%
  group_by(ID) %>%
  mutate(d = list(seq(created_at, as.Date('2015-07-12'), by='1 day')), created_at = NULL) %>%
  # name the list-column explicitly: calling unnest() without `cols`
  # is deprecated since tidyr 1.0.0 and emits a warning
  unnest(cols = d)
In the case of data.table
setDT(df1)[, list(date=seq(created_at,
as.Date('2015-07-12'), by = '1 day')), by = 1:nrow(df1)]
data
df1 <- structure(list(ID = c("MUM-0001", "MUM-0002", "MUM-0003",
"MUM-0004",
"MUM-0005", "MUM-0006"), created_at = structure(c(16176, 16084,
16177, 16172, 16178, 16177), class = "Date")), .Names = c("ID",
"created_at"), row.names = c(NA, -6L), class = "data.frame")
Creating sequence of dates in R by group, dependent on another variable
If there is more than one row, the seq
needs to be looped. We can use map2
. Also, based on the format
of the 'DATE' columns, the as.Date
needs a format
argument i.e. as.Date(ADATE, "%m/%d/%y")
(assuming it is month/day/year format)
library(dplyr)
library(purrr)
library(lubridate)
chf_jan15 %>%
mutate_at(vars(ends_with("DATE")), mdy) %>%
mutate(random_date = map2(ADATE, DDATE, seq, by = "day")) %>%
unnest(c(random_date))
# A tibble: 4 x 4
# ID ADATE DDATE random_date
# <int> <date> <date> <date>
#1 1 2010-02-04 2010-02-07 2010-02-04
#2 1 2010-02-04 2010-02-07 2010-02-05
#3 1 2010-02-04 2010-02-07 2010-02-06
#4 1 2010-02-04 2010-02-07 2010-02-07
If there is only a single row, after converting to Date
class, the complete
should work
library(tidyr)
chf_jan15 %>%
mutate_at(vars(ends_with("DATE")), as.Date, format = "%m/%d/%y") %>%
mutate(NEW_DATE = ADATE) %>%
complete(NEW_DATE = seq(ADATE, DDATE, by = 'day')) %>%
fill(c(ID, ADATE, DDATE))
# A tibble: 4 x 4
# NEW_DATE ID ADATE DDATE
# <date> <int> <date> <date>
#1 2010-02-04 1 2010-02-04 2010-02-07
#2 2010-02-05 1 2010-02-04 2010-02-07
#3 2010-02-06 1 2010-02-04 2010-02-07
#4 2010-02-07 1 2010-02-04 2010-02-07
If there is a single row for each 'ID', then we can group_split
and use complete
chf_jan15 %>%
mutate_at(vars(ends_with("DATE")), as.Date, format = "%m/%d/%y") %>%
mutate(NEW_DATE = ADATE) %>%
group_split(ID) %>%
map_dfr(~ .x %>%
complete(NEW_DATE = seq(ADATE, DDATE, by = 'day')) %>%
fill(c(ID, ADATE, DDATE)))
data
chf_jan15 <- structure(list(ID = 1L, ADATE = "02/04/10",
DDATE = "02/07/10"), class = "data.frame", row.names = c(NA,
-1L))
Sequence a group of dates in R
You can try this
rep(seq(as.Date("2020-01-01"),as.Date("2020-01-10"),1),each=26)
This will return a vector of dates from 2020-01-01 to 2020-01-10 where each date is repeated 26 times.
How to generate sequence of dates by group
We group by sequence of rows as there are duplicate elements for the "ID", "Tag" group.
dt[, list(Date = seq(Begin, length.out=3, by = '1 month'), x,y,z), by = 1:nrow(dt)]
Or as @David Arenburg mentioned, we replicate the rows by "N" and then group by "ID", "Tag" by selecting only the first observation of "Begin"
dt[rep(1:.N, each = 3)][, Begin := seq(Begin[1L],
length.out=3, by = '1 month'), by = .(ID, Tag)][]
group_by and create a sequence of monthly dates
If we need to create a sequence of dates for each 'start_date' and its corresponding 'end_date', it can be done with map2
and here it doesn't need any grouping as it gets the sequence from each corresponding 'start_date/end_date'
library(purrr)
df %>%
mutate(Seq = map2(start_date, end_date, seq, by = '1 day'))
Update
Based on the OP's comments
df %>%
group_by(cusip) %>%
mutate(rn = row_number()) %>%
filter(cummax(date >= start_date & date <= end_date) > 0)
# A tibble: 102 x 5
# Groups: cusip [1]
# cusip date start_date end_date rn
# <chr> <date> <date> <date> <int>
# 1 00036020 2011-07-29 2011-07-29 2012-06-30 7
# 2 00036020 2011-08-31 2011-07-29 2012-06-30 8
# 3 00036020 2011-09-30 2011-07-29 2012-06-30 9
# 4 00036020 2011-10-31 2011-07-29 2012-06-30 10
# 5 00036020 2011-11-30 2011-07-29 2012-06-30 11
# 6 00036020 2011-12-30 2011-07-29 2012-06-30 12
# 7 00036020 2012-01-31 2012-07-31 2013-06-30 13
# 8 00036020 2012-02-29 2012-07-31 2013-06-30 14
# 9 00036020 2012-03-30 2012-07-31 2013-06-30 15
#10 00036020 2012-04-30 2012-07-31 2013-06-30 16
# … with 92 more rows
-checking the first 24 rows
Creating Episodes for Groups based on Date Sequence
Here might be one approach. I added the newer output with group_by
as Episode2
next to Episode
created from the initial example. Hope this is helpful.
library(tidyverse)
df %>%
group_by(ID) %>%
mutate(
Episode2 = {
r <- rle(EpisodeTimeCriterian)
r$values <- cumsum(rep(1, length(r$values)))
inverse.rle(r)
}
) %>%
print(n=66)
Output
# A tibble: 66 x 6
# Groups: ID [2]
ID Date days_until_next EpisodeTimeCriterian Episode Episode2
<chr> <date> <dbl> <lgl> <int> <dbl>
1 123 2013-10-08 7 TRUE 1 1
2 123 2013-10-15 7 TRUE 1 1
3 123 2013-10-22 7 TRUE 1 1
4 123 2013-10-29 7 TRUE 1 1
5 123 2013-11-05 7 TRUE 1 1
6 123 2013-11-12 7 TRUE 1 1
7 123 2013-11-19 7 TRUE 1 1
8 123 2013-11-26 7 TRUE 1 1
9 123 2013-12-03 14 TRUE 1 1
10 123 2013-12-17 10 TRUE 1 1
11 123 2013-12-27 11 TRUE 1 1
12 123 2014-01-07 7 TRUE 1 1
13 123 2014-01-14 7 TRUE 1 1
14 123 2014-01-21 2 TRUE 1 1
15 123 2014-01-23 1 TRUE 1 1
16 123 2014-01-24 4 TRUE 1 1
17 123 2014-01-28 7 TRUE 1 1
18 123 2014-02-04 1 TRUE 1 1
19 123 2014-02-05 27 TRUE 1 1
20 123 2014-03-04 1997 FALSE 2 2
21 123 2019-08-22 7 TRUE 3 3
22 123 2019-08-29 2 TRUE 3 3
23 123 2019-08-31 5 TRUE 3 3
24 123 2019-09-05 7 TRUE 3 3
25 123 2019-09-12 13 TRUE 3 3
26 123 2019-09-25 12 TRUE 3 3
27 123 2019-10-07 14 TRUE 3 3
28 123 2019-10-21 7 TRUE 3 3
29 123 2019-10-28 7 TRUE 3 3
30 123 2019-11-04 7 TRUE 3 3
31 123 2019-11-11 7 TRUE 3 3
32 123 2019-11-18 7 TRUE 3 3
33 123 2019-11-25 7 TRUE 3 3
34 123 2019-12-02 7 TRUE 3 3
35 123 2019-12-09 7 TRUE 3 3
36 123 2019-12-16 7 TRUE 3 3
37 123 2019-12-23 1 TRUE 3 3
38 123 2019-12-24 13 TRUE 3 3
39 123 2020-01-06 7 TRUE 3 3
40 123 2020-01-13 7 TRUE 3 3
41 123 2020-01-20 7 TRUE 3 3
42 123 2020-01-27 -2302 TRUE 3 3
43 456 2013-10-07 7 TRUE 3 1
44 456 2013-10-14 119 FALSE 4 2
45 456 2014-02-10 220 FALSE 4 2
46 456 2014-09-18 4 TRUE 5 3
47 456 2014-09-22 3 TRUE 5 3
48 456 2014-09-25 7 TRUE 5 3
49 456 2014-10-02 6 TRUE 5 3
50 456 2014-10-08 8 TRUE 5 3
51 456 2014-10-16 97 FALSE 6 4
52 456 2015-01-21 15 TRUE 7 5
53 456 2015-02-05 21 TRUE 7 5
54 456 2015-02-26 41 FALSE 8 6
55 456 2015-04-08 57 FALSE 8 6
56 456 2015-06-04 12 TRUE 9 7
57 456 2015-06-16 2 TRUE 9 7
58 456 2015-06-18 49 FALSE 10 8
59 456 2015-08-06 14 TRUE 11 9
60 456 2015-08-20 42 FALSE 12 10
61 456 2015-10-01 12 TRUE 13 11
62 456 2015-10-13 16 TRUE 13 11
63 456 2015-10-29 12 TRUE 13 11
64 456 2015-11-10 65 FALSE 14 12
65 456 2016-01-14 1 TRUE 15 13
66 456 2016-01-15 -830 TRUE 15 13
Edit (3/2/20):
I think that if the rule is that a date difference of >= 30 days begins a new episode, this could be easier than the previous approach. See if this works for you:
library(tidyverse)
df %>%
group_by(ID) %>%
mutate(difftime = Date - lag(Date, default = first(Date)),
expected2 = cumsum(difftime >= 30) + 1) %>%
print(n=24)
Output
# A tibble: 24 x 7
# Groups: ID [1]
ID Date days_until_next EpisodeTimeCrit~ expected difftime expected2
<chr> <date> <dbl> <lgl> <dbl> <time> <dbl>
1 456 2013-10-07 7 TRUE 1 0 days 1
2 456 2013-10-14 119 FALSE 1 7 days 1
3 456 2014-02-10 220 FALSE 2 119 days 2
4 456 2014-09-18 4 TRUE 3 220 days 3
5 456 2014-09-22 3 TRUE 3 4 days 3
6 456 2014-09-25 7 TRUE 3 3 days 3
7 456 2014-10-02 6 TRUE 3 7 days 3
8 456 2014-10-08 8 TRUE 3 6 days 3
9 456 2014-10-16 97 FALSE 3 8 days 3
10 456 2015-01-21 15 TRUE 4 97 days 4
11 456 2015-02-05 21 TRUE 4 15 days 4
12 456 2015-02-26 41 FALSE 4 21 days 4
13 456 2015-04-08 57 FALSE 5 41 days 5
14 456 2015-06-04 12 TRUE 6 57 days 6
15 456 2015-06-16 2 TRUE 6 12 days 6
16 456 2015-06-18 49 FALSE 6 2 days 6
17 456 2015-08-06 14 TRUE 7 49 days 7
18 456 2015-08-20 42 FALSE 7 14 days 7
19 456 2015-10-01 12 TRUE 8 42 days 8
20 456 2015-10-13 16 TRUE 8 12 days 8
21 456 2015-10-29 12 TRUE 8 16 days 8
22 456 2015-11-10 65 FALSE 8 12 days 8
23 456 2016-01-14 1 TRUE 9 65 days 9
24 456 2016-01-15 -830 TRUE 9 1 days 9
Generate sequence of dates for given frequency as per days of occurence
Working on larger sample, as discussed earlier in comments. Strategy followed -
- As your day column always starts from Mon, which is not equal to start_date, matching the column to the weekday of start_date is required.
- So I converted the day field to an ordered factor type so that it can be manipulated as integers.
- Arranged the dataframe in such a way that every group starts from that day only. Used modulo division (%%) for this.
- After arranging, the task was rather easier. I created seven dates, one for each weekday, for each group and each start_date.
- Filtered out rows with Y/N as 0 anywhere.
- Now you require only the top row, so used slice_head()
df <- data.frame(
stringsAsFactors = FALSE,
Group = c("foo","foo","foo",
"foo","foo","foo","foo","foo","foo","foo",
"foo","foo","foo","foo","foo","foo","foo",
"foo","foo","foo","foo","bar","bar","bar",
"bar","bar","bar","bar","bar","bar","bar","bar",
"bar","bar","bar"),
start_date = c("02-06-2021",
"02-06-2021","02-06-2021","02-06-2021","02-06-2021",
"02-06-2021","02-06-2021","04-06-2021",
"04-06-2021","04-06-2021","04-06-2021","04-06-2021",
"04-06-2021","04-06-2021","06-06-2021","06-06-2021",
"06-06-2021","06-06-2021","06-06-2021",
"06-06-2021","06-06-2021","02-06-2021","02-06-2021",
"02-06-2021","02-06-2021","02-06-2021","02-06-2021",
"02-06-2021","05-06-2021","05-06-2021",
"05-06-2021","05-06-2021","05-06-2021","05-06-2021",
"05-06-2021"),
Day = c("Mon","Tue","Wed",
"Thu","Fri","Sat","Sun","Mon","Tue","Wed",
"Thu","Fri","Sat","Sun","Mon","Tue","Wed",
"Thu","Fri","Sat","Sun","Mon","Tue","Wed",
"Thu","Fri","Sat","Sun","Mon","Tue","Wed","Thu",
"Fri","Sat","Sun"),
y_n = c(0L,1L,0L,1L,1L,
1L,0L,0L,1L,0L,1L,1L,1L,0L,0L,1L,0L,1L,
1L,1L,0L,1L,0L,0L,1L,1L,0L,0L,1L,0L,
0L,1L,1L,0L,0L)
)
library(lubridate)
library(tidyverse)
df %>% group_by(Group, start_date) %>%
  # the levels are taken from Day itself, so the factor order is the row order
  # within each group (Mon..Sun in the sample data defined above)
  mutate(Day = factor(Day, levels = Day, ordered = TRUE)) %>%
  # rotate each group so that its first row is the weekday of start_date:
  # the sort key is 0 for that weekday, 1 for the next one, and so on
  arrange(Group, (as.numeric(Day) + 7 - wday(dmy(start_date), week_start = 1)) %% 7, .by_group = TRUE) %>%
  # after the rotation, row k of a group corresponds to start_date + (k - 1) days
  mutate(next_available_date = dmy(start_date) + 0:6) %>%
  # keep only the days flagged as available, then the earliest one per group
  filter(y_n != 0) %>%
  slice_head()
#> # A tibble: 5 x 5
#> # Groups: Group, start_date [5]
#> Group start_date Day y_n next_available_date
#> <chr> <chr> <ord> <int> <date>
#> 1 bar 02-06-2021 Thu 1 2021-06-03
#> 2 bar 05-06-2021 Mon 1 2021-06-07
#> 3 foo 02-06-2021 Thu 1 2021-06-03
#> 4 foo 04-06-2021 Fri 1 2021-06-04
#> 5 foo 06-06-2021 Tue 1 2021-06-08
On the data provided
df <- data.frame(
stringsAsFactors = FALSE,
Group = c("foo","foo","foo",
"foo","foo","foo","foo","bar","bar","bar",
"bar","bar","bar","bar"),
start_date = c("02-06-2021",
"02-06-2021","02-06-2021","02-06-2021","02-06-2021",
"02-06-2021","02-06-2021","02-06-2021",
"02-06-2021","02-06-2021","02-06-2021","02-06-2021",
"02-06-2021","02-06-2021"),
Day = c("Mon","Tue","Wed",
"Thu","Fri","Sat","Sun","Mon","Tue","Wed",
"Thu","Fri","Sat","Sun"),
y_n = c(0L,1L,0L,1L,1L,
1L,0L,1L,0L,0L,1L,1L,0L,0L)
)
library(lubridate)
library(tidyverse)
df %>% group_by(Group, start_date) %>%
  # the levels are taken from Day itself, so the factor order is the row order
  # within each group (Mon..Sun in the data defined above)
  mutate(Day = factor(Day, levels = Day, ordered = TRUE)) %>%
  # rotate each group so that its first row is the weekday of start_date:
  # the sort key is 0 for that weekday, 1 for the next one, and so on
  arrange(Group, (as.numeric(Day) + 7 - wday(dmy(start_date), week_start = 1)) %% 7, .by_group = TRUE) %>%
  # after the rotation, row k of a group corresponds to start_date + (k - 1) days
  mutate(next_available_date = dmy(start_date) + 0:6) %>%
  # keep only the days flagged as available, then the earliest one per group
  filter(y_n != 0) %>%
  slice_head()
#> # A tibble: 2 x 5
#> # Groups: Group, start_date [2]
#> Group start_date Day y_n next_available_date
#> <chr> <chr> <ord> <int> <date>
#> 1 bar 02-06-2021 Thu 1 2021-06-03
#> 2 foo 02-06-2021 Thu 1 2021-06-03
Created on 2021-06-02 by the reprex package (v2.0.0)
Group consecutive dates [duplicate question, but can't make it work with my data]
Remove time from date time
It's hard to tell exactly what the problem is without seeing your data (or similar example data), but my guess is that the date time format (the 00:00:00 part) is messing up as.Date
One solution would be to extract just the date part and then try again with just the date part:
# here are your date times
date_time <- "2018-01-03 00:00:00"
# this looks for 4 digits between 0 and 9, followed by a dash, followed by 2 digits between 0 and 9,followed by a dash, followed by 2 digits between 0 and 9
date_pattern <- " ?([0-9]{4}-[0-9]{2}-[0-9]{2}) ?"
#need this library
library(stringr)
library(magrittr) #for pipes
#this pulls out text matching the pattern we specified in date pattern
date_new <- str_extract(date_time, date_pattern) %>%
str_squish() # this removes white space
# this is the new date without the time
date_new
# then we convert to as date
date_new <- as.Date(date_new)
See if converting your date column to just dates and then rerunning your grouping works.
If you have dates in different formats and need to adapt the regular expression, here's something about regular expressions: https://stackoverflow.com/a/49286794/16502170
Group dates
Let's start with an example data frame that contains a date column
# here's a bunch of example dates:
library(lubridate)
dates2 <- seq.Date(as.Date("2018-03-01"),by="days",length.out = 60)
#here's the dataframe
exampl_df <- data.frame(animals = rep(c("cats","dogs","rabbits"),20), dates=dates2,
numbers= rep(1:3,20))
Here's what it looks like:
head(exampl_df)
animals dates numbers
1 cats 2018-03-01 1
2 dogs 2018-03-02 2
3 rabbits 2018-03-03 3
4 cats 2018-03-04 1
5 dogs 2018-03-05 2
6 rabbits 2018-03-06 3
Then let's make a sequence of every day between the minimum and maximum date in the sequence. This step is important because there may be missing dates in our data that we still want counting towards the separation between days.
# this is a day by day sequence from the earliest day in your data to the latest day
date_sequence <- seq.Date(from = min(dates2),max(dates2),by="day")
Then let's make a sequence of numbers each repeated seven times. If you wanted to group every three days, you could change each to 3. Then the length.out= length(date_sequence) tells R to make this vector have as many entries as the min to max date sequence has:
# and then if you want a new group every seven days you can make this number sequence
# (seq_along() is the safe spelling of 1:length(), which yields c(1, 0) for an empty vector)
groups <- rep(seq_along(date_sequence), each = 7, length.out = length(date_sequence))
Then let's attach the groups to the date_sequence to make a grouping index
date_grouping_index <- data.frame(a=date_sequence,b=groups)
then you can do a join to attach the groups to the original dataframe
library(dplyr)
# attach the group index `b` to every row by matching its date against the
# day-by-day index; the result name must be a single identifier (example_df2)
example_df2 <- exampl_df %>%
  inner_join(date_grouping_index, by = c("dates" = "a"))
This is what we get:
head(example_df2,n=10)
animals dates numbers b
1 cats 2018-03-01 1 1
2 dogs 2018-03-02 2 1
3 rabbits 2018-03-03 3 1
4 cats 2018-03-04 1 1
5 dogs 2018-03-05 2 1
6 rabbits 2018-03-06 3 1
7 cats 2018-03-07 1 1
8 dogs 2018-03-08 2 2
9 rabbits 2018-03-09 3 2
10 cats 2018-03-10 1 2
Then you should be able to group_by()
or aggregate()
your data using column b
Using the data provided in the question
#original data
df <- structure(list(Date = structure(c(17534, 17535, 17536, 17537,
18279, 18280, 18281, 18282, 17932), class = "Date"), group = c(1,
1, 1, 1, 2, 2, 2, 2, 2)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
#plus extra step
df$group2 <- 1 + c(0, cumsum(ifelse(diff(df$Date) > 1, 1, 0)))
Method described above
# day-by-day sequence spanning the earliest to the latest date in the data
date_sequence <- seq.Date(from = min(df$Date), to = max(df$Date), by = "day")
# one group number per block of seven consecutive calendar days
# (seq_along() instead of 1:length(), which misbehaves on an empty vector)
groups <- rep(seq_along(date_sequence), each = 7, length.out = length(date_sequence))
# lookup table: calendar date -> seven-day group number
date_grouping_index <- data.frame(a = date_sequence, groups = groups)
# attach the group number to each observation by its date
example_df2 <- df %>%
  inner_join(date_grouping_index, by = c("Date" = "a"))
Looks like it worked?
example_df2
# A tibble: 9 x 4
Date group group2 groups
<date> <dbl> <dbl> <int>
1 2018-01-03 1 1 1
2 2018-01-04 1 1 1
3 2018-01-05 1 1 1
4 2018-01-06 1 1 1
5 2020-01-18 2 2 107
6 2020-01-19 2 2 107
7 2020-01-20 2 2 107
8 2020-01-21 2 2 107
9 2019-02-05 2 2 57
Here's something you could do to make group names with the date and year in them:
example_df2$group_name <- paste0("sampling number ",
example_df2$groups,
" (",
month.name[month(example_df2$Date)],
"-",
year(example_df2$Date),
")")
Create sequence of dates in R with pipe %>%
We could block with {}
library(stringr)
library(dplyr)
tmp %>%
str_split(., " ", simplify = TRUE) %>%
as.vector() %>%
as.Date %>%
{seq(from = magrittr::extract(., 1),
to = magrittr::extract(., 2), by = "1 day")}
Related Topics
Extract English Words from a Text in R
Solve Homogenous System Ax = 0 for Any M * N Matrix a in R (Find Null Space Basis for A)
R Programming: Read.Csv() Skips Lines Unexpectedly
Grouped Bar Graph Custom Colours
How to Merge Two Data Frame Based on Partial String Match with R
R: How to Create Grid-Graphics
Reshape Data from Long to Wide Format - More Than One Variable
Using Italic() with a Variable in Ggplot2 Title Expression
Bar Plot for Count Data by Group in R
Separate a Column into Multiple Columns Using Tidyr::Separate with Sep=""
Calculating the Distance Between Points in Different Data Frames
R Function That Uses Its Output as Its Own Input Repeatedly
How to Use Geom_Rect with Discrete Axis Values
Downgrade R Version (No Issues with Bioconductor Installation)