R: Find and Add Missing (/Non Existing) Rows in Time Related Data Frame

R: Find and add missing (/non existing) rows in time related data frame

Use expand.grid and merge:

# Build every YearWeek x ProductID x CustomerID combination that occurs in
# `test`.
vals <- expand.grid(YearWeek = unique(test$YearWeek),
                    ProductID = unique(test$ProductID),
                    CustomerID = unique(test$CustomerID))
# Outer-join the grid with the data on their shared columns; combinations that
# have no row in `test` show up with NA in the remaining columns.
merge(vals, test, all = TRUE)
#   YearWeek ProductID CustomerID Quantity
# 1  2012-01         1          a        5
# 2  2012-01         1          b        7
# 3  2012-01         2          a        6
# 4  2012-01         2          b        8
# 5  2012-02         1          a        9
# 6  2012-02         1          b       11
# 7  2012-02         2          a       10
# 8  2012-02         2          b       NA

The NAs can be replaced after the fact with whatever values you choose using subsetting and is.na.

R Add Missing Rows by Condition

dplyr/tidyr

library(dplyr)
library(tidyr)
# Expand `dat` to every Species x Date x Site combination; missing values of
# `n` are filled with 0 rather than left as NA.
complete(dat, Species, Date, Site, fill = list(n = 0))
# # A tibble: 15 x 4
#    Species Date      Site      n
#    <chr>   <chr>     <chr> <dbl>
#  1 AMCR    6/1/2021  BMA       1
#  2 AMCR    6/1/2021  SVA       0
#  3 AMCR    6/1/2021  SVC      14
#  4 AMCR    6/15/2021 BMA       0
#  5 AMCR    6/15/2021 SVA       9
#  6 AMCR    6/15/2021 SVC       0
#  7 AMCR    6/21/2021 BMA       0
#  8 AMCR    6/21/2021 SVA      18
#  9 AMCR    6/21/2021 SVC       0
# 10 AMCR    6/29/2021 BMA       0
# 11 AMCR    6/29/2021 SVA      18
# 12 AMCR    6/29/2021 SVC       0
# 13 AMCR    6/7/2021  BMA       0
# 14 AMCR    6/7/2021  SVA       2
# 15 AMCR    6/7/2021  SVC       0

base R

# Cross the distinct values of the first three columns of `dat`, then
# outer-join that grid onto `dat` by those columns. Combinations without an
# observation get NA in the remaining column.
dat2 <- merge(
  dat,
  expand.grid(lapply(dat[1:3], unique)),
  by = names(dat)[1:3],
  all = TRUE
)
dat2
#    Species      Date Site  n
# 1     AMCR  6/1/2021  BMA  1
# 2     AMCR  6/1/2021  SVA NA
# 3     AMCR  6/1/2021  SVC 14
# 4     AMCR 6/15/2021  BMA NA
# 5     AMCR 6/15/2021  SVA  9
# 6     AMCR 6/15/2021  SVC NA
# 7     AMCR 6/21/2021  BMA NA
# 8     AMCR 6/21/2021  SVA 18
# 9     AMCR 6/21/2021  SVC NA
# 10    AMCR 6/29/2021  BMA NA
# 11    AMCR 6/29/2021  SVA 18
# 12    AMCR 6/29/2021  SVC NA
# 13    AMCR  6/7/2021  BMA NA
# 14    AMCR  6/7/2021  SVA  2
# 15    AMCR  6/7/2021  SVC NA
# Replace the NA counts introduced by the outer join with 0.
dat2$n[is.na(dat2$n)] <- 0
dat2
#    Species      Date Site  n
# 1     AMCR  6/1/2021  BMA  1
# 2     AMCR  6/1/2021  SVA  0
# 3     AMCR  6/1/2021  SVC 14
# 4     AMCR 6/15/2021  BMA  0
# 5     AMCR 6/15/2021  SVA  9
# 6     AMCR 6/15/2021  SVC  0
# 7     AMCR 6/21/2021  BMA  0
# 8     AMCR 6/21/2021  SVA 18
# 9     AMCR 6/21/2021  SVC  0
# 10    AMCR 6/29/2021  BMA  0
# 11    AMCR 6/29/2021  SVA 18
# 12    AMCR 6/29/2021  SVC  0
# 13    AMCR  6/7/2021  BMA  0
# 14    AMCR  6/7/2021  SVA  2
# 15    AMCR  6/7/2021  SVC  0

Data

# Sample data: six rows with character columns Species, Date and Site plus an
# integer column n.
dat <- data.frame(
  Species = rep("AMCR", 6L),
  Date = c(
    "6/1/2021", "6/1/2021", "6/7/2021",
    "6/15/2021", "6/21/2021", "6/29/2021"
  ),
  Site = c("SVC", "BMA", "SVA", "SVA", "SVA", "SVA"),
  n = c(14L, 1L, 2L, 9L, 18L, 18L),
  stringsAsFactors = FALSE
)

How to fill in missing rows by extending last available row?

I will give you a simple, pure tidyverse solution. First, you have to expand the grid to all combinations of the variables; you can use expand() or complete() to turn implicit missing values into explicit ones. Then you want to do LOCF (last observation carried forward); this can be achieved either by the fill argument inside complete() or by using the fill() function. All these functions are in the tidyr package.

library(tidyverse)

# Example panel: ID 1 has two periods recorded, ID 2 has four.
data <- data.frame(
  ID = rep(c(1, 2), times = c(2, 4)),
  Period_1 = sprintf("2020-%02d", c(3, 4, 1, 2, 3, 4)),
  Period_2 = c(1, 2, 1, 2, 3, 4),
  ColA = c(10, 20, 30, 40, 50, 52),
  ColB = c(15, 25, 35, 45, 55, 87)
)

# Add the ID x Period_2 combinations that are missing from `data`, then carry
# the last observed ColA/ColB forward into the new rows.
# The fill is done per ID: on an ungrouped frame fill() would also copy the
# last row of one ID into leading gaps of the next ID.
data %>%
  tidyr::complete(ID, nesting(Period_2)) %>%
  dplyr::group_by(ID) %>%
  tidyr::fill(ColA, ColB, .direction = "down") %>%
  dplyr::ungroup()
#> # A tibble: 8 x 5
#>      ID Period_2 Period_1  ColA  ColB
#>   <dbl>    <dbl> <chr>    <dbl> <dbl>
#> 1     1        1 2020-03     10    15
#> 2     1        2 2020-04     20    25
#> 3     1        3 <NA>        20    25
#> 4     1        4 <NA>        20    25
#> 5     2        1 2020-01     30    35
#> 6     2        2 2020-02     40    45
#> 7     2        3 2020-03     50    55
#> 8     2        4 2020-04     52    87

Created on 2022-01-21 by the reprex package (v2.0.1)

In R: is there an elegant way to split data.frame row by , and add to existing rows matching the splitted strings?

We can use separate_rows to split the column, then do a group by operation to get the sum

library(dplyr)
library(tidyr)
# Put each name of a delimited `sample_names` entry on its own row (the value
# is repeated for every name), then total `sample_values` per name.
df %>%
  separate_rows(sample_names) %>%
  group_by(sample_names) %>%
  summarise(
    sample_values = sum(sample_values),
    .groups = "drop"
  )

-output

# A tibble: 2 x 2
#  sample_names sample_values
#  <chr>                <dbl>
#1 bar                      8
#2 foo                      4

Or with base R by splitting the column with strsplit into a list of vectors, then use tapply to do a group by sum

# Split every comma-separated entry into its individual names. "\\s*" makes
# the whitespace after the comma optional, so "foo,bar" is split as well as
# "foo, bar"; as.character() keeps strsplit() working on factor columns.
lst1 <- strsplit(as.character(df$sample_names), ",\\s*")
# Repeat each value once per name in its entry, then sum the values by name.
tapply(rep(df$sample_values, lengths(lst1)), unlist(lst1), FUN = sum)

Detect missing (non existing) rows within a dataframe and replace them with NA

library(lubridate)
library(dplyr)

Set up sample data:

# Parse the whitespace-delimited sample records; the first line of the text
# supplies the column names (Station, Date, Day).
dat <- read.table(
  text = "Station Date        Day
7002    17/12/1966  77
                 7002    05/05/1968  582
                 7002    30/10/1968  760
                 7002    16/08/1970  1415    
                 7003    02/12/1966  62
                 7003    05/05/1968  582
                 7003    31/10/1968  761
                 8004    04/07/1968  4294
                 8004    15/11/1968  4428
                 8006    13/10/1966  5856
                 8006    23/09/1967  6567
                 8006    01/09/1968  6910",
  header = TRUE,
  stringsAsFactors = FALSE
)

# The dates are written day/month/year; convert the strings to Date objects.
dat$Date <- as.Date(dat$Date, format = "%d/%m/%Y")

Add water year: I've assumed that the water year is named by the year of the start of the water year. For example, water year 01/10/1967 - 30/09/1968 is water year 1967.

# A water year is labelled by the calendar year in which it starts, so dates
# in January-September belong to the previous calendar year: subtract 1 for
# months 1-9 and 0 otherwise.
dat$water.year <- year(dat$Date) - as.integer(month(dat$Date) < 10)

Add rows for missing years: I do this by merging with a new data frame that includes all combinations of Station and water.year.

# Build the Station x water-year grid for 1966-1969, attach the observed
# records, and sort; grid cells without a record keep NA in Date and Day.
expand.grid(Station = unique(dat$Station), water.year = 1966:1969) %>%
  full_join(dat, by = c("Station", "water.year")) %>%
  arrange(Station, water.year)

   Station water.year       Date  Day
1     7002       1966 1966-12-17   77
2     7002       1967 1968-05-05  582
3     7002       1968 1968-10-30  760
4     7002       1969 1970-08-16 1415
5     7003       1966 1966-12-02   62
6     7003       1967 1968-05-05  582
7     7003       1968 1968-10-31  761
8     7003       1969       <NA>   NA
9     8004       1966       <NA>   NA
10    8004       1967 1968-07-04 4294
11    8004       1968 1968-11-15 4428
12    8004       1969       <NA>   NA
13    8006       1966 1966-10-13 5856
14    8006       1966 1967-09-23 6567
15    8006       1967 1968-09-01 6910
16    8006       1968       <NA>   NA
17    8006       1969       <NA>   NA

R: Find and Add Missing (/Non Existing) Rows in Time Related Data Frame