Interpolate NA Values

Linearly interpolate missing values in a time series

Here is one way. I created a data frame containing the full sequence of dates between the first and last date. Using full_join() from the dplyr package, I merged that data frame with mydf, so every date that was absent from mydf gets an NA value. I then used na.approx() from the zoo package inside mutate() to do the interpolation.

library(dplyr)
library(zoo)

# Example series: irregularly spaced dates with one explicit NA.
mydf <- data.frame(
  date = as.Date(c("2015-10-05", "2015-10-08", "2015-10-09",
                   "2015-10-12", "2015-10-14")),
  value = c(8, 3, 9, NA, 5)
)

# Expand to a complete daily calendar so that every missing day becomes an
# NA row, then fill the gaps by linear interpolation.
# - min()/max() make the date range independent of the row order of mydf.
# - na.rm = FALSE keeps leading/trailing NAs in place; without it na.approx()
#   drops them and returns a shorter vector, which mutate() rejects.
data.frame(date = seq(min(mydf$date), max(mydf$date), by = "day")) %>%
  full_join(mydf, by = "date") %>%
  mutate(approx = na.approx(value, na.rm = FALSE))

#         date value   approx
#1  2015-10-05     8 8.000000
#2  2015-10-06    NA 6.333333
#3  2015-10-07    NA 4.666667
#4  2015-10-08     3 3.000000
#5  2015-10-09     9 9.000000
#6  2015-10-10    NA 8.200000
#7  2015-10-11    NA 7.400000
#8  2015-10-12    NA 6.600000
#9  2015-10-13    NA 5.800000
#10 2015-10-14     5 5.000000

Interpolate NA values

Using the zoo package:

library(zoo)
# Convert C into a zoo (indexed, ordered observations) object.
Cz <- zoo(C)
# Re-index the series by its first column
# (presumably the time/ordering variable -- confirm against the data in C).
index(Cz) <- Cz[,1]
# Replace NAs by linear interpolation along the index.
Cz_approx <- na.approx(Cz)

Interpolate NA values when column ends on NA

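Add na.rm = FALSE so that leading and trailing NAs are kept and the result has the same length as the input; this avoids the length-mismatch error from mutate(). Add rule = 2 to fill those NAs at the ends with the nearest non-NA value.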

# Diam_intpl:  linear interpolation; na.rm = FALSE leaves leading/trailing NAs
#              in place so the result keeps the length of Diam_av.
# Diam_intpl2: same, but rule = 2 fills NAs at the ends with the nearest
#              observed value.
# FALSE is spelled out because the alias F can be reassigned.
df %>%
  mutate(Diam_intpl = na.approx(Diam_av, na.rm = FALSE),
         Diam_intpl2 = na.approx(Diam_av, na.rm = FALSE, rule = 2))

   Diam_av Diam_intpl Diam_intpl2
1    12.30      12.30       12.30
2    13.00      13.00       13.00
3    15.50      15.50       15.50
4       NA      15.14       15.14
5       NA      14.78       14.78
6       NA      14.42       14.42
7       NA      14.06       14.06
8    13.70      13.70       13.70
9       NA      12.77       12.77
10      NA      11.84       11.84
11      NA      10.91       10.91
12    9.98       9.98        9.98
13    4.00       4.00        4.00
14    0.00       0.00        0.00
15    8.76       8.76        8.76
16      NA         NA        8.76
17      NA         NA        8.76
18      NA         NA        8.76

data.table linearly interpolating NA values without groups

The code is explained inline. You can delete the temporary columns using df[,dist_before := NULL], for example.

library(data.table)
# Example data: consecutive time stamps with runs of missing distances.
df <- data.table(
  time = seq(7173, 7195, 1),
  dist = c(31091.33, NA, 31100.00, 31103.27, NA, NA, NA, NA, 31124.98, NA,
           31132.81, NA, NA, NA, NA, 31154.19, NA, 31161.47, NA, NA, NA, NA,
           31182.97)
)
df
#>     time     dist
#>  1: 7173 31091.33
#>  2: 7174       NA
#>  3: 7175 31100.00
#>  4: 7176 31103.27
#>  5: 7177       NA
#>  6: 7178       NA
#>  7: 7179       NA
#>  8: 7180       NA
#>  9: 7181 31124.98
#> 10: 7182       NA
#> 11: 7183 31132.81
#> 12: 7184       NA
#> 13: 7185       NA
#> 14: 7186       NA
#> 15: 7187       NA
#> 16: 7188 31154.19
#> 17: 7189       NA
#> 18: 7190 31161.47
#> 19: 7191       NA
#> 20: 7192       NA
#> 21: 7193       NA
#> 22: 7194       NA
#> 23: 7195 31182.97
#>     time     dist
# Last observation carried forward: the closest known dist at or above a row.
df[, dist_before := nafill(dist, type = "locf")]
# Next observation carried backward: the closest known dist at or below a row.
df[, dist_after := nafill(dist, type = "nocb")]
# rleid() labels runs of consecutive identical values, so e.g. four NAs in a
# row share one run id.
df[, rle := rleid(dist)]
# Number of steps between the two known values surrounding a run: the run
# length plus one, so that the last NA of a run still lies below the next
# known value.
df[, missings := max(.N + 1, 2), by = rle]
df[]
#>     time     dist dist_before dist_after rle missings
#>  1: 7173 31091.33    31091.33   31091.33   1        2
#>  2: 7174       NA    31091.33   31100.00   2        2
#>  3: 7175 31100.00    31100.00   31100.00   3        2
#>  4: 7176 31103.27    31103.27   31103.27   4        2
#>  5: 7177       NA    31103.27   31124.98   5        5
#>  6: 7178       NA    31103.27   31124.98   5        5
#>  7: 7179       NA    31103.27   31124.98   5        5
#>  8: 7180       NA    31103.27   31124.98   5        5
#>  9: 7181 31124.98    31124.98   31124.98   6        2
#> 10: 7182       NA    31124.98   31132.81   7        2
#> 11: 7183 31132.81    31132.81   31132.81   8        2
#> 12: 7184       NA    31132.81   31154.19   9        5
#> 13: 7185       NA    31132.81   31154.19   9        5
#> 14: 7186       NA    31132.81   31154.19   9        5
#> 15: 7187       NA    31132.81   31154.19   9        5
#> 16: 7188 31154.19    31154.19   31154.19  10        2
#> 17: 7189       NA    31154.19   31161.47  11        2
#> 18: 7190 31161.47    31161.47   31161.47  12        2
#> 19: 7191       NA    31161.47   31182.97  13        5
#> 20: 7192       NA    31161.47   31182.97  13        5
#> 21: 7193       NA    31161.47   31182.97  13        5
#> 22: 7194       NA    31161.47   31182.97  13        5
#> 23: 7195 31182.97    31182.97   31182.97  14        2
#>     time     dist dist_before dist_after rle missings
# Within each run of NAs, seq_len(.N) numbers the rows 1, 2, ..., so the k-th
# missing value lies k steps of size (dist_after - dist_before) / missings
# above dist_before. For example, row 5 dist is calculated as
# dist_before + 1 * (dist_after - dist_before) / 5
df[is.na(dist),
   dist := dist_before + seq_len(.N) * (dist_after - dist_before) / missings,
   by = rle]
df[]
#>     time     dist dist_before dist_after rle missings
#>  1: 7173 31091.33    31091.33   31091.33   1        2
#>  2: 7174 31095.67    31091.33   31100.00   2        2
#>  3: 7175 31100.00    31100.00   31100.00   3        2
#>  4: 7176 31103.27    31103.27   31103.27   4        2
#>  5: 7177 31107.61    31103.27   31124.98   5        5
#>  6: 7178 31111.95    31103.27   31124.98   5        5
#>  7: 7179 31116.30    31103.27   31124.98   5        5
#>  8: 7180 31120.64    31103.27   31124.98   5        5
#>  9: 7181 31124.98    31124.98   31124.98   6        2
#> 10: 7182 31128.90    31124.98   31132.81   7        2
#> 11: 7183 31132.81    31132.81   31132.81   8        2
#> 12: 7184 31137.09    31132.81   31154.19   9        5
#> 13: 7185 31141.36    31132.81   31154.19   9        5
#> 14: 7186 31145.64    31132.81   31154.19   9        5
#> 15: 7187 31149.91    31132.81   31154.19   9        5
#> 16: 7188 31154.19    31154.19   31154.19  10        2
#> 17: 7189 31157.83    31154.19   31161.47  11        2
#> 18: 7190 31161.47    31161.47   31161.47  12        2
#> 19: 7191 31165.77    31161.47   31182.97  13        5
#> 20: 7192 31170.07    31161.47   31182.97  13        5
#> 21: 7193 31174.37    31161.47   31182.97  13        5
#> 22: 7194 31178.67    31161.47   31182.97  13        5
#> 23: 7195 31182.97    31182.97   31182.97  14        2
#>     time     dist dist_before dist_after rle missings

Pandas interpolation function fails to interpolate after replacing values with .nan

If you first convert every value that is not numeric to NaN, so that the column ends up with a numeric dtype, that should fix your issue.

#Import modules
import pandas as pd
import numpy as np

#Import data
df = pd.read_csv('example.csv')

# Convert the column to a numeric dtype and keep the result: interpolate()
# only fills gaps in numeric columns. errors='coerce' turns every entry that
# cannot be parsed as a number into NaN, while valid decimals and negative
# numbers (e.g. "3.5", "-2") are preserved -- a "contains a non-digit
# character" regex would have wiped those out as well.
df['example'] = pd.to_numeric(df['example'], errors='coerce')

Interpolation of missing values

Using Pandas you can use the interpolate() method.

import pandas as pd                                                                                                                                                                                                                                                                                                                    

# Load input.csv into a DataFrame.
df = pd.read_csv("input.csv")

The dataframe now looks like this:

         Date  Column_1  Column_2  Column_3
0  2020-06-26         1       3.0         5
1  2020-06-27         2       NaN         4
2  2020-06-28         5       NaN         6
3  2020-06-29         7       8.0        10

Using interpolate() on the column with missing data fills the gaps.

# Returns a new Series with the NaN entries filled in; df itself is not modified.
df['Column_2'].interpolate()
0    3.000000
1    4.666667
2    6.333333
3    8.000000
Name: Column_2, dtype: float64

Now we can assign that back into the dataframe

# Overwrite the column with its interpolated version.
df['Column_2'] = df['Column_2'].interpolate()

Results in

         Date  Column_1  Column_2  Column_3
0  2020-06-26         1  3.000000         5
1  2020-06-27         2  4.666667         4
2  2020-06-28         5  6.333333         6
3  2020-06-29         7  8.000000        10

Interpolation of missing values in 3D data-array in python

The problem is that the NaN data may form blocks, where you cannot interpolate from the neighbors.

A solution is to use Gauss–Seidel iteration to solve the Laplace equation on the missing cells, which fills the gaps with values that minimize the curvature of the function.

This code finds the NaN values and does a 3D interpolation. I do not have access to your data, so it is done over synthetic data.

import numpy as np
import matplotlib.pyplot as plt


# create data
print("Creating data...")
size = 10  # edge length: the volume holds size**3 samples
# integer coordinate arrays of the x, y, z grid (same range on every axis)
x, y, z = np.meshgrid(*(np.arange(size) for _ in range(3)))


def f(x, y, z):
    """Synthetic test field: the sine of half the coordinate sum."""
    half_sum = (x + y + z) / 2
    return np.sin(half_sum)


# evaluate the synthetic function on every grid node
data = np.zeros((size,) * 3)
data[x, y, z] = f(x, y, z)

# create corrupted data
# number of random draws: 20% of the samples (repeated draws hit the same
# cell, so the count of distinct NaN cells can be lower)
sizeCorruptedData = int(0.2 * data.size)
# one row of random indices per axis
xc, yc, zc = np.random.randint(0, size, size=(3, sizeCorruptedData))

corruptedData = np.array(data)  # independent copy of the clean volume
corruptedData[xc, yc, zc] = np.nan

# Interpolate on NaN values
print("Interpolating...")

# indices (one array per axis) of every NaN cell in the corrupted volume
nanIndex = np.isnan(corruptedData).nonzero()

# Start from the corrupted volume itself rather than from the pristine `data`:
# real inputs come without ground truth, and this makes it explicit that only
# the observed samples feed the interpolation.
interpolatedData = corruptedData.copy()
# initial guess for the unknown cells: mean of the observed (non-NaN) values
interpolatedData[nanIndex] = np.nanmean(corruptedData)


def sign(x):
    """Return the two neighbour offsets to average over along one axis.

    Interior positions use both sides ([-1, 1]); a position on the lower or
    upper boundary uses its single inner neighbour twice.
    """
    if x == 0:
        return [1, 1]
    if x == size-1:
        return [-1, -1]
    return [-1, 1]

# calculate kernels for the averages on boundaries/non boundary elements:
# for each axis, look up the neighbour offsets of every NaN cell and append
# the resulting arrays (one row of two offsets per cell) after the index arrays
nanIndex = (
    *nanIndex,
    *[np.array([sign(pos) for pos in axisIndex]) for axisIndex in nanIndex],
)

# gauss seidel iteration to interpolate Nan values with neighbors
# https://en.wikipedia.org/wiki/Gauss%E2%80%93Seidel_method
# Runs a fixed 100 sweeps (there is no convergence test). Each NaN cell is
# overwritten in place with the average of six neighbour values, so cells
# updated earlier in a sweep already feed the cells updated later.
for _ in range(100):
    # x, y, z: position of one NaN cell; dx, dy, dz: its pair of neighbour
    # offsets per axis as produced by sign().
    # NOTE: this rebinds the module-level x, y, z grid arrays.
    for x, y, z, dx, dy, dz in zip(*nanIndex):
        interpolatedData[x, y, z] = (
            (interpolatedData[x+dx[0], y, z] + interpolatedData[x+dx[1], y, z] +
             interpolatedData[x, y+dy[0], z] + interpolatedData[x, y+dy[1], z] +
             interpolatedData[x, y, z+dz[0]] + interpolatedData[x, y, z+dz[1]]) / 6)


# plot results: the same z-slice (index 1) of every volume on a 2x2 grid.
# The figure handle is called `fig` so that it does not overwrite the
# synthetic-data function f defined earlier in the script.
fig, axarr = plt.subplots(2, 2)
panels = [
    (data[:, :, 1], 'Original Data'),
    (corruptedData[:, :, 1], 'Corrupted Data'),
    (interpolatedData[:, :, 1], 'Fixed Data'),
    (data[:, :, 1]-interpolatedData[:, :, 1], 'Error = Original-Fixed'),
]
# axarr.flat walks the axes row by row, matching the panel order above
for ax, (image, title) in zip(axarr.flat, panels):
    ax.imshow(image)
    ax.title.set_text(title)
fig.tight_layout()
plt.show()

Sample Image

Non linear interpolation to find the missing values using R

Ultimately, you have to decide the interpolation procedure based on scientific background. However, in order to avoid producing negative values, the log-transformation is useful. In the following, I combine that with a spline interpolation.

library(data.table)
# Two groups of eight observations each; b holds the measurements, with NA
# marking the values to be estimated.
test_dt <- data.table(
  group = rep(c("B1", "C1"), each = 8),
  a = c(165, 170, 185, 195, 200, 210, 220, 240,
        1, 1.5, 2, 4.5, 5, 5.5, 7, 10),
  b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9,
        3.775, 3.225, 2.75, 0.255, 0.04, NA, NA, NA)
)

library(zoo)
# Per group: fit the spline to log(b) over a, then transform back with exp()
# so that the filled-in values stay positive.
test_dt[, c := {
  log_fit <- na.spline(log(b), x = a, na.rm = FALSE)
  exp(log_fit)
}, by = group]

library(ggplot2)
# Line and "interpolated" points: the spline result (c);
# group-coloured points drawn last: the original observations (b).
ggplot(data = test_dt, mapping = aes(x = a, colour = group)) +
  geom_line(mapping = aes(y = c)) +
  geom_point(mapping = aes(y = c, colour = "interpolated")) +
  geom_point(mapping = aes(y = b))

resulting plot showing original and interpolated values

Interpolating data for missing values pandas python

Try this, assuming the first column of your csv is the one with date strings:

# Parse the first CSV column as datetimes and use it as the index;
# method='time' below needs a datetime index to weight by elapsed time.
# infer_datetime_format is omitted: it is deprecated since pandas 2.0, where
# consistent format inference is already the default behaviour.
df = pd.read_csv(filename, index_col=0, parse_dates=[0])
# Interpolate proportionally to the time between observations;
# limit_direction='both' lets the fill run backwards as well as forwards, so
# NaNs before the first valid value are handled too.
df2 = df.interpolate(method='time', limit_direction='both')

In theory, this should 1) convert your first column into actual datetime objects and 2) set the index of the dataframe to that datetime column, all in one step. The infer_datetime_format=True argument is optional: if your datetimes are in a standard format, it can speed up parsing considerably on older pandas versions (it is deprecated since pandas 2.0, where format inference is already the default).

The limit_direction='both' should back fill any NaNs in the first row, but because you haven't provided a copy-paste-able sample of your data, I cannot confirm on my end.

Reading the documentation can be incredibly helpful and can usually answer questions faster than you'll get answers from Stack Overflow!