R - Insert Row for Missing Monthly Data and Interpolate

r - insert row for missing monthly data and interpolate

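Using the data.table and zoo packages you can easily expand your data set and interpolate the gaps, as long as the missing values are not at the very start or end of the data: leading and trailing NAs have no neighbour on one side, so they cannot be interpolated and are left as NA.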

Expand the data set

library(data.table)
library(zoo)
## Convert df to a data.table by reference, then re-index every Year onto the
## full month grid 1..12. Months that a year does not contain come back as
## rows whose non-grouping columns are all NA.
setDT(df)
res <- df[, .SD[match(seq_len(12L), Month)], by = Year]

Interpolate on whatever column you want

## Month is fully determined by row position after the expansion (every Year
## now has exactly 12 rows, in month order), so rebuild it instead of
## interpolating it. Interpolating Month gives a wrong value whenever a gap
## touches a year boundary (11, NA, 1 would become 6) and cannot fill
## trailing gaps at all.
res[, Month := seq_len(.N), by = Year]

## Linearly interpolate the continuous columns. na.rm = FALSE keeps leading and
## trailing NAs in place so each result has the same length as its column.
cols <- c("DecimDate", "TWS")
res[, (cols) := lapply(.SD, na.approx, na.rm = FALSE), .SDcols = cols]

res
# Year GridNo GridIndex Lon Lat DecimDate Month TWS
# 1: 2003 GR72 72 35.5 -4.5 2003.000 1 14.2566781
# 2: 2003 GR72 72 35.5 -4.5 2003.083 2 5.0413706
# 3: 2003 GR72 72 35.5 -4.5 2003.167 3 3.8192721
# 4: 2003 GR72 72 35.5 -4.5 2003.250 4 5.8706026
# 5: 2003 GR72 72 35.5 -4.5 2003.333 5 7.8461188
# 6: 2003 NA NA NA NA 2003.417 6 5.1141516
# 7: 2003 GR72 72 35.5 -4.5 2003.500 7 2.3821844
# 8: 2003 GR72 72 35.5 -4.5 2003.583 8 0.1995629
# 9: 2003 GR72 72 35.5 -4.5 2003.667 9 -1.8353604
# 10: 2003 GR72 72 35.5 -4.5 2003.750 10 -2.0410653
# 11: 2003 GR72 72 35.5 -4.5 2003.833 11 -1.4029813
# 12: 2003 GR72 72 35.5 -4.5 2003.917 12 -0.2206872
# 13: 2004 GR72 72 35.5 -4.5 2004.000 1 -0.5090872
# 14: 2004 GR72 72 35.5 -4.5 2004.083 2 -0.4887118
# 15: 2004 GR72 72 35.5 -4.5 2004.167 3 -0.7725966
# 16: 2004 GR72 72 35.5 -4.5 2004.250 4 4.1831581
# 17: 2004 GR72 72 35.5 -4.5 2004.333 5 2.5651040
# 18: 2004 GR72 72 35.5 -4.5 2004.417 6 -2.2511409
# 19: 2004 GR72 72 35.5 -4.5 2004.500 7 -1.6484375
# 20: 2004 GR72 72 35.5 -4.5 2004.583 8 -4.6508982
# 21: 2004 GR72 72 35.5 -4.5 2004.667 9 -5.0053745
# 22: 2004 NA NA NA NA NA 10 NA
# 23: 2004 NA NA NA NA NA 11 NA
# 24: 2004 NA NA NA NA NA 12 NA

Insert rows for missing data and interpolate

This could help using a spline:

library(zoo)

# Data: eight irregularly spaced observations of an accumulated total
df <- structure(list(Date = structure(c(17075, 17106, 17318, 17471,
18017, 18109, 18201, 18414), class = "Date"), Accumulated = c(6902000L,
9033000L, 15033000L, 24033000L, 24533000L, 25033000L, 27533000L,
29033000L)), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8"), class = "data.frame")

# Create the complete monthly sequence between the first and last observation
df$Date <- as.Date(df$Date)
dfm <- data.frame(Date = seq(min(df$Date), max(df$Date), by = '1 month'))
# Left join onto the monthly grid: every month is kept, and months without an
# observation get NA. Spell out TRUE; T is an ordinary variable that can be
# reassigned.
dfmerged <- merge(dfm, df, by = 'Date', all.x = TRUE)
# Fill the NAs by spline interpolation over the row positions.
# NOTE(review): a spline can overshoot between sparse points; the printed
# result is not monotone even though the observed totals only increase.
dfmerged$Interpolation <- na.spline(dfmerged$Accumulated)

It will produce:

         Date Accumulated Interpolation
1 2016-10-01 6902000 6902000
2 2016-11-01 9033000 9033000
3 2016-12-01 NA 10525685
4 2017-01-01 NA 11534406
5 2017-02-01 NA 12222432
6 2017-03-01 NA 12753035
7 2017-04-01 NA 13289484
8 2017-05-01 NA 13995049
9 2017-06-01 15033000 15033000
10 2017-07-01 NA 16511487
11 2017-08-01 NA 18318181
12 2017-09-01 NA 20285631
13 2017-10-01 NA 22246387
14 2017-11-01 24033000 24033000
15 2017-12-01 NA 25510428
16 2018-01-01 NA 26673271
17 2018-02-01 NA 27548534
18 2018-03-01 NA 28163225
19 2018-04-01 NA 28544352
20 2018-05-01 NA 28718923
21 2018-06-01 NA 28713943
22 2018-07-01 NA 28556422
23 2018-08-01 NA 28273365
24 2018-09-01 NA 27891781
25 2018-10-01 NA 27438677
26 2018-11-01 NA 26941060
27 2018-12-01 NA 26425938
28 2019-01-01 NA 25920317
29 2019-02-01 NA 25451205
30 2019-03-01 NA 25045611
31 2019-04-01 NA 24730540
32 2019-05-01 24533000 24533000
33 2019-06-01 NA 24484346
34 2019-07-01 NA 24633317
35 2019-08-01 25033000 25033000
36 2019-09-01 NA 25709290
37 2019-10-01 NA 26579313
38 2019-11-01 27533000 27533000
39 2019-12-01 NA 28465321
40 2020-01-01 NA 29291385
41 2020-02-01 NA 29931341
42 2020-03-01 NA 30305333
43 2020-04-01 NA 30333510
44 2020-05-01 NA 29936017
45 2020-06-01 29033000 29033000

Interpolate and insert missing rows into dataframe R

one approach, adapt to your case as appropriate:

library(dplyr)
library(lubridate) ## facilitates date-time manipulations

## example data: six observations on six distinct, randomly chosen days.
## Both columns must have the same length; a longer `value` vector would
## silently recycle the dates and create duplicate days.
patchy_data <- data.frame(date = as.Date('2021-11-01') + sample(1:10, 6),
                          value = rnorm(6)) %>%
  arrange(date)

## create vector of -only!- missing dates:
## (setdiff may drop the Date class, hence the conversion back with as.Date)
missing_dates <-
  setdiff(
    seq.Date(from = min(patchy_data$date),
             to = max(patchy_data$date),
             by = '1 day'
    ),
    patchy_data$date
  ) %>% as.Date(origin = '1970-01-01')

## extend initial dataframe with rows per missing date:
full_data <-
  patchy_data %>%
  bind_rows(data.frame(date = missing_dates,
                       value = NA)
  ) %>%
  arrange(date)

## group by month and impute missing data from monthwise statistic:
full_data %>%
  mutate(month = lubridate::month(date)) %>%
  group_by(month) %>%
  ## coalesce conveniently replaces ifelse-constructs to replace NAs.
  ## Use the bare column `value` (the current group's values); `.$value` is
  ## the whole piped-in data frame, which ignores the grouping and fails with
  ## a size mismatch as soon as there is more than one month.
  mutate(imputed = coalesce(value, mean(value, na.rm = TRUE))) %>%
  ## drop the grouping so later steps do not silently operate per month
  ungroup()

edit
One way to combine the generated rows (the missing dates) with an additional parameter (e.g. measuring depths) is to use expand.grid, which creates one row for every date × depth combination. Assuming the object names from the previous code:

## example depths at which a measurement is taken every day:
observation_depths <- c(0.5, 1.1, 1.5)

## one row per (missing date, depth) combination, then name the two columns:
missing_dates_and_depths <- expand.grid(missing_dates, observation_depths)
names(missing_dates_and_depths) <- c('date','depthR')

## append the generated rows to the observed data and sort by date, as above
## (columns that exist in only one of the two data frames are filled with NA):
full_data <- arrange(bind_rows(patchy_data, missing_dates_and_depths), date)

Insert rows for missing dates/times

I think the easiest approach is to convert the timestamp to a date-time first (as already described), convert the data to a zoo object, and then merge it with a complete time sequence:

## parse the timestamp strings (month/day/two-digit year, hour:minute)
df$timestamp <- as.POSIXct(df$timestamp, format = "%m/%d/%y %H:%M")

## the first column becomes the zoo index, the remaining columns the data
## (presumably timestamp is the first column of df; verify for your data)
df1.zoo <- zoo(x = df[, -1], order.by = df[, 1])

## merge with an empty zoo series holding every minute between the first and
## last observation; all = TRUE keeps the time points that exist only in the
## sequence
df2 <- merge(
  df1.zoo,
  zoo(order.by = seq(from = start(df1.zoo), to = end(df1.zoo), by = "min")),
  all = TRUE
)

The start and end of the sequence are taken from df1.zoo (your original data), and `by` sets the step of the sequence: "min" (one minute) in this example; change it as you need. all=TRUE keeps every time point of the sequence and sets the values at the missing times to NA.

'Interpolation' of a missing date/value in R?

We can use complete

library(dplyr)
library(tidyr)
## Build the full monthly grid between the first and last observed month, add
## a row for every month missing from df1, then carry the last observed
## CumulativeSum forward into the new rows.
df1 %>%
  complete(Month = seq.Date(from = min(Month), to = max(Month), by = '1 month')) %>%
  fill(CumulativeSum, .direction = "down")

-output

# A tibble: 7 x 2
# Month CumulativeSum
# <date> <int>
#1 2019-02-01 40
#2 2019-03-01 70
#3 2019-04-01 80
#4 2019-05-01 80
#5 2019-06-01 80
#6 2019-07-01 100
#7 2019-08-01 120

data

## Example input: cumulative sum observed on the first day of five months;
## May and June 2019 are missing.
df1 <- data.frame(
  Month = as.Date(c("2019-02-01", "2019-03-01", "2019-04-01",
                    "2019-07-01", "2019-08-01")),
  CumulativeSum = c(40L, 70L, 80L, 100L, 120L)
)

Insert new rows of imputed data into data table by group

You can use tidyr::complete to expand the years and zoo::na.approx for interpolation of the values.

library(dplyr)

## Per (ID, attrib1, attrib2) group: add a row for every Year between the
## group's first and last year, then linearly interpolate `value`.
dt %>%
  group_by(ID, attrib1, attrib2) %>%
  tidyr::complete(Year = min(Year):max(Year)) %>%
  ## na.rm = FALSE keeps leading/trailing NAs, so the result always has one
  ## value per row; without it mutate() fails with a length mismatch whenever
  ## a group starts or ends with NA
  mutate(value = zoo::na.approx(value, na.rm = FALSE)) %>%
  ## drop the grouping so later steps do not silently operate per group
  ungroup()

# ID attrib1 attrib2 Year value
# <int> <chr> <chr> <dbl> <dbl>
# 1 1 sdf 444 1990 12
# 2 1 sdf 444 1991 10.8
# 3 1 sdf 444 1992 9.6
# 4 1 sdf 444 1993 8.4
# 5 1 sdf 444 1994 7.2
# 6 1 sdf 444 1995 6
# 7 1 sdf 444 1996 7
# 8 2 gghgf 222 1990 6
# 9 2 gghgf 222 1991 5.4
#10 2 gghgf 222 1992 4.8
# … with 11 more rows

Interpolate zoo object with missing Dates

Merge with an "empty" object that has all the dates you want, then use na.approx (or na.spline, etc.) to fill in the missing values.

## Merge `serie` with an empty zoo series that has one index entry per day
## between its first and last date (the missing days become NA), then fill
## those NAs by linear interpolation.
x <- na.approx(
  merge(
    serie,
    zoo(order.by = seq(from = start(serie), to = end(serie), by = "day")),
    all = TRUE
  )
)

linear interpolate missing values in time series

Here is one way. I created a data frame with a sequence of date using the first and last date. Using full_join() in the dplyr package, I merged the data frame and mydf. I then used na.approx() in the zoo package to handle the interpolation in the mutate() part.

## Example data: irregularly spaced dates, one value already missing
mydf <- data.frame(
  date = as.Date(c("2015-10-05", "2015-10-08", "2015-10-09",
                   "2015-10-12", "2015-10-14")),
  value = c(8, 3, 9, NA, 5)
)

library(dplyr)
library(zoo)

## Daily grid from the earliest to the latest date. min()/max() instead of the
## first/last row, so the grid is also correct when mydf is not sorted by date.
data.frame(date = seq(min(mydf$date), max(mydf$date), by = 1)) %>%
  full_join(mydf, by = "date") %>%
  ## na.rm = FALSE keeps leading/trailing NAs, so the result always has one
  ## value per row; without it mutate() fails with a length mismatch when the
  ## series starts or ends with NA
  mutate(approx = na.approx(value, na.rm = FALSE))

# date value approx
#1 2015-10-05 8 8.000000
#2 2015-10-06 NA 6.333333
#3 2015-10-07 NA 4.666667
#4 2015-10-08 3 3.000000
#5 2015-10-09 9 9.000000
#6 2015-10-10 NA 8.200000
#7 2015-10-11 NA 7.400000
#8 2015-10-12 NA 6.600000
#9 2015-10-13 NA 5.800000
#10 2015-10-14 5 5.000000


Related Topics



Leave a reply



Submit