R: Find and Add Missing (/Non Existing) Rows in Time Related Data Frame

R: Find and add missing (/non existing) rows in time related data frame

Use expand.grid and merge:

# Build every YearWeek x ProductID x CustomerID combination seen in `test`.
vals <- expand.grid(
  YearWeek = unique(test$YearWeek),
  ProductID = unique(test$ProductID),
  CustomerID = unique(test$CustomerID)
)
# Outer-join the full grid with the data: combinations that have no row in
# `test` are kept and get NA in the remaining columns (here Quantity).
merge(vals, test, all = TRUE)
YearWeek ProductID CustomerID Quantity
1 2012-01 1 a 5
2 2012-01 1 b 7
3 2012-01 2 a 6
4 2012-01 2 b 8
5 2012-02 1 a 9
6 2012-02 1 b 11
7 2012-02 2 a 10
8 2012-02 2 b NA

The NAs can be replaced after the fact with whatever values you choose using subsetting and is.na.

R Add Missing Rows by Condition

dplyr/tidyr

library(dplyr)
library(tidyr)
# Expand `dat` to every Species x Date x Site combination; combinations that
# were not observed get n = 0 instead of NA.
complete(dat, Species, Date, Site, fill = list(n = 0))
# # A tibble: 15 x 4
# Species Date Site n
# <chr> <chr> <chr> <dbl>
# 1 AMCR 6/1/2021 BMA 1
# 2 AMCR 6/1/2021 SVA 0
# 3 AMCR 6/1/2021 SVC 14
# 4 AMCR 6/15/2021 BMA 0
# 5 AMCR 6/15/2021 SVA 9
# 6 AMCR 6/15/2021 SVC 0
# 7 AMCR 6/21/2021 BMA 0
# 8 AMCR 6/21/2021 SVA 18
# 9 AMCR 6/21/2021 SVC 0
# 10 AMCR 6/29/2021 BMA 0
# 11 AMCR 6/29/2021 SVA 18
# 12 AMCR 6/29/2021 SVC 0
# 13 AMCR 6/7/2021 BMA 0
# 14 AMCR 6/7/2021 SVA 2
# 15 AMCR 6/7/2021 SVC 0

base R

# Cross the unique values of the first three columns (the key columns) and
# outer-join that grid with `dat`; unobserved combinations get NA in `n`.
dat2 <- merge(
  x = dat,
  y = expand.grid(lapply(dat[1:3], unique)),
  by = names(dat)[1:3],
  all = TRUE
)
dat2
# Species Date Site n
# 1 AMCR 6/1/2021 BMA 1
# 2 AMCR 6/1/2021 SVA NA
# 3 AMCR 6/1/2021 SVC 14
# 4 AMCR 6/15/2021 BMA NA
# 5 AMCR 6/15/2021 SVA 9
# 6 AMCR 6/15/2021 SVC NA
# 7 AMCR 6/21/2021 BMA NA
# 8 AMCR 6/21/2021 SVA 18
# 9 AMCR 6/21/2021 SVC NA
# 10 AMCR 6/29/2021 BMA NA
# 11 AMCR 6/29/2021 SVA 18
# 12 AMCR 6/29/2021 SVC NA
# 13 AMCR 6/7/2021 BMA NA
# 14 AMCR 6/7/2021 SVA 2
# 15 AMCR 6/7/2021 SVC NA
# Replace the NA counts in `n` with explicit zeros.
dat2[["n"]] <- ifelse(is.na(dat2[["n"]]), 0, dat2[["n"]])
dat2
# Species Date Site n
# 1 AMCR 6/1/2021 BMA 1
# 2 AMCR 6/1/2021 SVA 0
# 3 AMCR 6/1/2021 SVC 14
# 4 AMCR 6/15/2021 BMA 0
# 5 AMCR 6/15/2021 SVA 9
# 6 AMCR 6/15/2021 SVC 0
# 7 AMCR 6/21/2021 BMA 0
# 8 AMCR 6/21/2021 SVA 18
# 9 AMCR 6/21/2021 SVC 0
# 10 AMCR 6/29/2021 BMA 0
# 11 AMCR 6/29/2021 SVA 18
# 12 AMCR 6/29/2021 SVC 0
# 13 AMCR 6/7/2021 BMA 0
# 14 AMCR 6/7/2021 SVA 2
# 15 AMCR 6/7/2021 SVC 0

Data

# Example data: six observed rows, one species, five dates, three sites.
dat <- data.frame(
  Species = rep("AMCR", 6),
  Date = c(
    "6/1/2021", "6/1/2021", "6/7/2021",
    "6/15/2021", "6/21/2021", "6/29/2021"
  ),
  Site = c("SVC", "BMA", "SVA", "SVA", "SVA", "SVA"),
  n = c(14L, 1L, 2L, 9L, 18L, 18L),
  stringsAsFactors = FALSE
)

How to fill in missing rows by extending last available row?

I will give you a simple, pure tidyverse solution. First, expand the data to every combination of the variables; you can use expand() or complete() to turn implicit missing values into explicit ones. Then you want LOCF (last observation carried forward), which is done with the fill() function; note that the fill argument of complete() only substitutes a fixed value per column and does not carry observations forward. All these functions are in the tidyr package.

library(tidyverse)

# Example panel: ID 1 has only periods 1-2, ID 2 has periods 1-4.
data <- data.frame(
  ID = rep(c(1, 2), times = c(2, 4)),
  Period_1 = sprintf("2020-%02d", c(3, 4, 1, 2, 3, 4)),
  Period_2 = c(1, 2, 1, 2, 3, 4),
  ColA = c(10, 20, 30, 40, 50, 52),
  ColB = c(15, 25, 35, 45, 55, 87)
)

# Make the missing ID x Period_2 rows explicit, then carry the last observed
# ColA/ColB forward. Grouping by ID keeps fill() from carrying the last values
# of one ID into the leading gaps of the next ID; ungroup() returns a plain
# tibble afterwards. Period_1 is not filled and stays NA in the added rows.
data %>%
  tidyr::complete(ID, nesting(Period_2)) %>%
  dplyr::group_by(ID) %>%
  tidyr::fill(ColA, ColB, .direction = "down") %>%
  dplyr::ungroup()
#> # A tibble: 8 x 5
#> ID Period_2 Period_1 ColA ColB
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 1 1 2020-03 10 15
#> 2 1 2 2020-04 20 25
#> 3 1 3 <NA> 20 25
#> 4 1 4 <NA> 20 25
#> 5 2 1 2020-01 30 35
#> 6 2 2 2020-02 40 45
#> 7 2 3 2020-03 50 55
#> 8 2 4 2020-04 52 87

Created on 2022-01-21 by the reprex package (v2.0.1)

In R: is there an elegant way to split data.frame row by , and add to existing rows matching the splitted strings?

We can use separate_rows to split the column, then do a group by operation to get the sum

library(dplyr)
library(tidyr)
# One row per individual name, then total the values for each name.
separate_rows(df, sample_names) %>%
  group_by(sample_names) %>%
  summarise(sample_values = sum(sample_values), .groups = "drop")

-output

# A tibble: 2 x 2
# sample_names sample_values
# <chr> <dbl>
#1 bar 8
#2 foo 4

Or with base R by splitting the column with strsplit into a list of vectors, then use tapply to do a group by sum

# Split each entry on commas, tolerating any (or no) whitespace around the
# comma so that "foo,bar" and "foo, bar" are both split.
lst1 <- strsplit(df$sample_names, "\\s*,\\s*")
# Repeat each row's value once per name it contained, then sum by name.
tapply(rep(df$sample_values, lengths(lst1)), unlist(lst1), FUN = sum)

Detect missing (non existing) rows within a dataframe and replace them with NA

library(lubridate)
library(dplyr)

Set up sample data:

# Sample station records, parsed from an inline whitespace-separated table.
dat <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Station Date        Day
7002 17/12/1966 77
7002 05/05/1968 582
7002 30/10/1968 760
7002 16/08/1970 1415
7003 02/12/1966 62
7003 05/05/1968 582
7003 31/10/1968 761
8004 04/07/1968 4294
8004 15/11/1968 4428
8006 13/10/1966 5856
8006 23/09/1967 6567
8006 01/09/1968 6910"
)

# Dates are written day/month/year; convert the text column to Date.
dat$Date <- as.Date(dat$Date, format = "%d/%m/%Y")

Add water year: I've assumed that the water year is named by the year of the start of the water year. For example, water year 01/10/1967 - 30/09/1968 is water year 1967.

# January-September dates belong to the water year that started the previous
# October, so subtract one from the calendar year for those months.
dat$water.year <- year(dat$Date) - (month(dat$Date) %in% 1:9)

Add rows for missing years: I do this by merging with a new data frame that includes all combinations of Station and water.year.

# Build every Station x water.year combination, taking the span of years from
# the data instead of hard-coding it, then join the observations onto that
# grid. Station/year pairs with no record are kept with NA in Date and Day.
water_years <- seq(min(dat$water.year), max(dat$water.year))
station_years <- expand.grid(
  Station = unique(dat$Station),
  water.year = water_years
)
full_join(station_years, dat, by = c("Station", "water.year")) %>%
  arrange(Station, water.year)
   Station water.year       Date  Day
1 7002 1966 1966-12-17 77
2 7002 1967 1968-05-05 582
3 7002 1968 1968-10-30 760
4 7002 1969 1970-08-16 1415
5 7003 1966 1966-12-02 62
6 7003 1967 1968-05-05 582
7 7003 1968 1968-10-31 761
8 7003 1969 <NA> NA
9 8004 1966 <NA> NA
10 8004 1967 1968-07-04 4294
11 8004 1968 1968-11-15 4428
12 8004 1969 <NA> NA
13 8006 1966 1966-10-13 5856
14 8006 1966 1967-09-23 6567
15 8006 1967 1968-09-01 6910
16 8006 1968 <NA> NA
17 8006 1969 <NA> NA


Related Topics



Leave a reply



Submit