R: Expand and Fill Data Frame by Date in Series

R: expand and fill data frame by date in series

First, make sure the date column is stored as a Date object:

# Parse the ISO-formatted (year-month-day) strings into Date values in place.
demo_df$date <- as.Date(x = demo_df$date, format = "%Y-%m-%d")

Then, using the tidyverse, we first complete the date sequence and then fill the group column downward:

library(tidyverse)

# complete() adds a row for every calendar day between the first and last
# date; fill() then copies igroup downward into the newly added rows.
demo_df %>%
  complete(
    date = seq.Date(from = min(date), to = max(date), by = "day")
  ) %>%
  fill(igroup, .direction = "down")

Expand dataframe with sequential dates based on a column of dates in R

I think this should work fine:

# One row per day across the observed date range ...
calendar <- data.frame(
  Date = seq.Date(from = min(df$Date), to = max(df$Date), by = "day")
)
# ... left-joined to the data, so days without observations appear as NA rows.
merge(calendar, df, by = "Date", all.x = TRUE)
# Date Group Draw
# 1 2006-05-11 bb TRUE
# 2 2006-05-11 bb FALSE
# 3 2006-05-12 <NA> NA
# 4 2006-05-13 <NA> NA
# 5 2006-05-14 aa TRUE
# 6 2006-05-15 <NA> NA
# 7 2006-05-16 aa TRUE
# 8 2006-05-17 <NA> NA
# 9 2006-05-18 <NA> NA
# 10 2006-05-19 <NA> NA
# 11 2006-05-20 cc FALSE
# 12 2006-05-20 bb FALSE
# 13 2006-05-21 aa TRUE

All this is doing is creating a date sequence spanning the range of your actual data, and then performing a left join.


And the same idea, using data.table:

# Single-column table holding every day between the first and last Date in dt.
all_days <- dt[, .(Date = seq.Date(min(Date), max(Date), by = "day"))]
# Look up each calendar day in dt; days with no match come back as NA rows.
dt[all_days, on = .(Date)]
# Date Group Draw
# 1: 2006-05-11 bb TRUE
# 2: 2006-05-11 bb FALSE
# 3: 2006-05-12 NA NA
# 4: 2006-05-13 NA NA
# 5: 2006-05-14 aa TRUE
# 6: 2006-05-15 NA NA
# 7: 2006-05-16 aa TRUE
# 8: 2006-05-17 NA NA
# 9: 2006-05-18 NA NA
# 10: 2006-05-19 NA NA
# 11: 2006-05-20 cc FALSE
# 12: 2006-05-20 bb FALSE
# 13: 2006-05-21 aa TRUE

# Example data as whitespace-separated text. The header has one field fewer
# than the data rows, so read.table() uses the first field as row names.
zz <- "Date Group Draw
1 2006-05-11 bb T
2 2006-05-11 bb F
3 2006-05-14 aa T
4 2006-05-16 aa T
5 2006-05-20 cc F
6 2006-05-20 bb F
7 2006-05-21 aa T"

df <- read.table(text = zz, header = TRUE)
# Convert the Date column from text to Date so it can be used with seq.Date().
df$Date <- as.Date(df$Date)

library(data.table)
# Same example data as a data.table, with Date converted by reference.
dt <- data.table(read.table(text = zz, header = TRUE))
dt[, Date := as.Date(Date)]

R - Expanding a value between a sequence of dates and add as columns to data.table

We reshape to wide format with pivot_wider, then group by ISSUE_ID and append a 'Mean' row by concatenating each column's observations with their mean. Since dplyr 1.0, summarise can return more than one row per group; from dplyr 1.1.0 onward that usage is deprecated in favour of reframe, which exists for exactly this purpose.

library(dplyr)
library(tidyr)

# Reshape to one column per RATING_DATE, then append a "Mean" row for each
# ISSUE_ID by concatenating every numeric column with its own mean.
# reframe() (dplyr >= 1.1.0) is the supported verb for results with more than
# one row per group; summarise() is deprecated for that use. reframe() always
# returns an ungrouped result, so no `.groups` argument is needed.
ratings.dt %>%
  select(-VALID_THRU_DATE, -RATING_DATE_SEQ) %>%
  pivot_wider(names_from = RATING_DATE, values_from = RATING) %>%
  group_by(ISSUE_ID) %>%
  reframe(
    RATING_TYPE = c(RATING_TYPE, "Mean"),
    across(where(is.numeric), ~ c(.x, mean(.x, na.rm = TRUE)))
  )

-output

# A tibble: 4 x 11
# ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 123 FR 3.33 4 3.66 2 2.33 3 NA
#2 123 MR 2.66 NA 3 3.33 NA NA 2.33
#3 123 SP NA NA NA NA NA NA 3.33
#4 123 Mean 3.00 4 3.33 2.66 2.33 3 2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>


Or using data.table

library(data.table)
# Wide table: one row per ISSUE_ID / RATING_TYPE, one column per RATING_DATE.
dt1 <- dcast(
  setDT(ratings.dt),
  ISSUE_ID + RATING_TYPE ~ RATING_DATE,
  value.var = "RATING"
)
# Per-ISSUE_ID column means over every column except the first two (the ids),
# labelled "Mean" so the row can be stacked underneath the originals.
mean_rows <- dt1[
  ,
  c(list(RATING_TYPE = "Mean"), lapply(.SD, mean, na.rm = TRUE)),
  by = ISSUE_ID,
  .SDcols = -(1:2)
]
rbind(dt1, mean_rows)
# ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1: 123 FR NA 3.330 4 3.66 2.33 2.000 NA NA
#2: 123 MR 2.33 2.660 NA 3.00 NA 3.330 NA 3
#3: 123 SP 3.33 NA NA NA NA NA 3 NA
#4: 123 Mean 2.83 2.995 4 3.33 2.33 2.665 3 3
# 2004-01-31
#1: 3
#2: NA
#3: NA
#4: 3

data

# Example ratings: 13 rows for a single issue, covering three rating types.
# The date columns are deliberately left as character strings.
ratings.dt <- data.frame(
  ISSUE_ID = rep(123L, 13L),
  RATING_TYPE = rep(c("FR", "MR", "SP"), times = c(6L, 5L, 2L)),
  RATING = c(
    3.33, 4, 3.66, 2, 2.33, 3,
    2.33, 2.66, 3, 3.33, 3,
    3.33, 3
  ),
  RATING_DATE = c(
    "2000-01-31", "2000-05-31", "2001-03-31", "2001-05-31", "2001-04-30",
    "2004-01-31", "1999-04-31", "2000-01-31", "2001-03-31", "2001-05-31",
    "2003-02-28", "1999-04-31", "2002-04-31"
  ),
  VALID_THRU_DATE = c(
    "2000-04-31", "2000-02-28", "2001-04-31", "2001-04-30", "2003-12-31",
    "2004-06-30", "1999-12-31", "2000-04-31", "2001-04-30", "2003-01-31",
    "2003-07-31", "2002-03-31", "2003-05-31"
  ),
  RATING_DATE_SEQ = c(1:6, 1:5, 1:2),
  stringsAsFactors = FALSE
)

Expand rows by date range using start and end date

Using data.table:

library(data.table) ## 1.9.2+

# Group by row number so that seq() receives a single start and end per call.
# seq_len() is used instead of 1:nrow(df), which would produce c(1, 0) for an
# empty table; the grouping column is named `nrow` explicitly.
setDT(df)[, list(idnum = idnum, month = seq(start, end, by = "month")),
          by = list(nrow = seq_len(nrow(df)))]

# you may use dot notation as a shorthand alias of list in j:
setDT(df)[, .(idnum = idnum, month = seq(start, end, by = "month")),
          by = .(nrow = seq_len(nrow(df)))]

setDT converts df to a data.table in place. Then, grouping by row number (the by argument) so that each row is handled on its own, we create idnum and month as required.

R Expand time series data based on start and end point

Using a tidyverse solution could look like:

library(dplyr)
library(stringr)
library(purrr)
library(tidyr)

data <- tibble(
  place = c(1, 2),
  start_date = c("2007-09-01", "2013-09-27"),
  end_date = c("2010-10-12", "2015-10-11"),
  value = c(0.5, 0.7)
)

# For every row, build the vector of years from the start year to the end
# year (the first four-digit run in each date string) as a list-column, then
# unnest it into one row per year. unnest() expands the list-column directly,
# so there is no need to split a deparsed "c(...)" string and filter out the
# leftover fragments.
data %>%
  mutate(
    year = map2(
      start_date,
      end_date,
      ~ as.character(seq(
        as.integer(str_extract(.x, "\\d{4}")),
        as.integer(str_extract(.y, "\\d{4}"))
      ))
    )
  ) %>%
  unnest(year) %>%
  select(place, year, value)

# place year value
# <dbl> <chr> <dbl>
# 1 1 2007 0.5
# 2 1 2008 0.5
# 3 1 2009 0.5
# 4 1 2010 0.5
# 5 2 2013 0.7
# 6 2 2014 0.7
# 7 2 2015 0.7

I'm having problems understanding the third paragraph of your question ("There are ..."). It seems to me to be a separate question. If that is the case, please consider moving the question to a separate post here on SO. If it is not a separate question, please reformulate the paragraph.

Fill dates by groups in a data frame R

Assuming that your dates are actually in date format rather than character format (we can't tell from the table in the question), and assuming that the 7th row has the wrong year in it (2021 as opposed to 2022), you can do:

library(tidyverse)

# Pad one person's rows so every day between their first and last date is
# present. name and code are filled from that person's first row; the other
# columns are left as NA on the added days.
pad_person_days <- function(person) {
  complete(
    person,
    expand(person, date = seq(min(person$date), max(person$date), by = "day")),
    fill = list(name = person$name[1], code = person$code[1])
  )
}

# Handle each name separately, then stack the padded pieces back together.
bind_rows(lapply(split(df, df$name), pad_person_days))
#> # A tibble: 16 x 5
#> date name code usage result
#> <date> <chr> <int> <int> <int>
#> 1 2021-11-04 Jennifer Aniston 23211 345 1
#> 2 2021-11-05 Jennifer Aniston 23211 260 1
#> 3 2021-11-06 Jennifer Aniston 23211 230 0
#> 4 2021-11-07 Jennifer Aniston 23211 0 0
#> 5 2022-01-01 Lisa Kudrow 55120 132 0
#> 6 2022-01-02 Lisa Kudrow 55120 125 0
#> 7 2022-01-03 Lisa Kudrow 55120 NA NA
#> 8 2022-01-04 Lisa Kudrow 55120 345 1
#> 9 2022-01-05 Lisa Kudrow 55120 NA NA
#> 10 2022-01-06 Lisa Kudrow 55120 321 1
#> 11 2022-01-07 Lisa Kudrow 55120 431 1
#> 12 2022-10-01 Matthew Perry 44215 312 1
#> 13 2022-10-02 Matthew Perry 44215 NA NA
#> 14 2022-10-03 Matthew Perry 44215 NA NA
#> 15 2022-10-04 Matthew Perry 44215 230 0
#> 16 2022-10-05 Matthew Perry 44215 232 0

Created on 2022-06-02 by the reprex package (v2.0.1)


Data taken from question in reproducible format

# Example usage data: 12 rows for three people, with gaps in the dates.
df <- data.frame(
  name = rep(
    c("Jennifer Aniston", "Matthew Perry", "Lisa Kudrow"),
    times = c(4L, 3L, 5L)
  ),
  code = rep(c(23211L, 44215L, 55120L), times = c(4L, 3L, 5L)),
  date = as.Date(c(
    "2021-11-04", "2021-11-05", "2021-11-06", "2021-11-07",
    "2022-10-01", "2022-10-04", "2022-10-05",
    "2022-01-01", "2022-01-02", "2022-01-04", "2022-01-06", "2022-01-07"
  )),
  usage = c(345L, 260L, 230L, 0L, 312L, 230L, 232L, 132L, 125L, 345L, 321L, 431L),
  result = c(1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L),
  stringsAsFactors = FALSE
)

expand a series, then fill remaining columns

library(dplyr)
library(tidyr)

# Within each group, add a row for every step between the smallest and the
# largest val that is not already present. The result stays grouped by group.
complete(
  group_by(df, group),
  val = seq(from = min(val), to = max(val))
)

# # A tibble: 10 x 2
# # Groups: group [2]
# group val
# <fct> <dbl>
# 1 A 1
# 2 A 2
# 3 A 3
# 4 A 4
# 5 A 5
# 6 B 1
# 7 B 2
# 8 B 3
# 9 B 4
# 10 B 5

Extend a dataframe based on the last known value

Here is a base R idea,

# Value of y in the last row, to be repeated for the appended rows.
last_y <- df$y[nrow(df)]
# New rows for x = 6..10, renamed to match df's columns so rbind() lines up.
extension <- setNames(data.frame(x = seq(6, 10), y = last_y), names(df))
rbind(df, extension)

# x y
#1 1 0.4
#2 2 0.6
#3 3 0.2
#4 4 0.2
#5 5 0.1
#6 6 0.1
#7 7 0.1
#8 8 0.1
#9 9 0.1
#10 10 0.1

How to fill dates between two dates

Here, we may need to group by a sequence of row numbers (in addition to name and value) so that each row is expanded separately:

library(data.table)
setDT(df)
# rn gives every input row its own group, so seq() always receives a single
# start and end date; name and value are grouping columns so they are kept.
expanded <- df[
  ,
  .(date = seq(as.Date(start_date), as.Date(end_date), by = "day")),
  by = .(rn = seq_len(nrow(df)), name, value)
]
# Drop the helper column; the trailing [] prints the result.
expanded[, rn := NULL][]

Alternatively, create a list column by using Map to loop over the corresponding elements of 'start_date' and 'end_date', building one sequence of dates per row, and then unnest that list column:

library(tidyr)
library(magrittr)
# Build one daily sequence per row as a list-column, then unnest it so each
# date gets its own row alongside that row's name and value.
setDT(df)[
  ,
  .(
    name,
    date = Map(
      function(from, to) seq(from, to, by = "1 day"),
      as.Date(start_date),
      as.Date(end_date)
    ),
    value
  )
] %>%
  unnest(date)
# A tibble: 731 x 3
# name date value
# <chr> <date> <dbl>
# 1 A 2020-01-23 8.1
# 2 A 2020-01-24 8.1
# 3 A 2020-01-25 8.1
# 4 A 2020-01-26 8.1
# 5 A 2020-01-27 8.1
# 6 A 2020-01-28 8.1
# 7 A 2020-01-29 8.1
# 8 A 2020-01-30 8.1
# 9 A 2020-01-31 8.1
#10 A 2020-02-01 8.1
# … with 721 more rows


Related Topics



Leave a reply



Submit