Interpolating Timeseries

Interpolating timeseries

I would use zoo (or xts) and do it like this:

library(zoo)
# Create zoo objects
zc <- zoo(calib$value, calib$time) # low freq
zs <- zoo(sample$value, sample$time) # high freq
# Merge series into one object
z <- merge(zs,zc)
# Interpolate calibration data (na.spline could also be used)
z$zc <- na.approx(z$zc, rule=2)
# Only keep index values from sample data
Z <- z[index(zs),]
Z
#                      zs       zc
# 2012-10-25 01:00:52 256 252.3000
# 2012-10-25 01:03:02 254 251.1142
# 2012-10-25 01:05:23 255 249.9617
# 2012-10-25 01:07:42 257 252.7707
# 2012-10-25 01:10:12 256 255.6000
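
For readers who want the same merge-and-interpolate approach in Python, a rough pandas sketch follows (it assumes calib and sample are DataFrames with time and value columns, mirroring the R example above; names and details are illustrative rather than a drop-in translation):

import pandas as pd

# Hypothetical pandas frames mirroring the R example above:
# a low-frequency calibration series and a high-frequency sample series
zc = calib.set_index('time')['value']     # low freq
zs = sample.set_index('time')['value']    # high freq

# Merge both series on the union of their timestamps
z = pd.concat({'zs': zs, 'zc': zc}, axis=1).sort_index()

# Interpolate the calibration column in time; ffill/bfill extends the end
# values, roughly what rule=2 does for na.approx in zoo
z['zc'] = z['zc'].interpolate(method='time').ffill().bfill()

# Only keep index values from the sample data
Z = z.loc[zs.index]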

How to interpolate time series based on time gap between non null values in PySpark

Having this input dataframe:

df = spark.createDataFrame([
    (1642205833225, 58.00), (1642205888654, float('nan')),
    (1642205899657, float('nan')), (1642205899970, 55.00),
    (1642206338180, float('nan')), (1642206353652, 56.45),
    (1642206853451, float('nan')), (1642207353652, 80.45)
], ["timestamp", "value"])

# replace NaN values with nulls
df = df.replace(float("nan"), None, ["value"])

You can use window functions (first, last) to get the next and previous non-null value for each row and to calculate the time gap, like this:

from pyspark.sql import functions as F, Window

w1 = Window.orderBy("timestamp").rowsBetween(1, Window.unboundedFollowing)
w2 = Window.orderBy("timestamp").rowsBetween(Window.unboundedPreceding, -1)

df = (
    df.withColumn("rn", F.row_number().over(Window.orderBy("timestamp")))
    .withColumn("next_val", F.first("value", ignorenulls=True).over(w1))
    .withColumn("next_rn", F.first(F.when(F.col("value").isNotNull(), F.col("rn")), ignorenulls=True).over(w1))
    .withColumn("prev_val", F.last("value", ignorenulls=True).over(w2))
    .withColumn("prev_rn", F.last(F.when(F.col("value").isNotNull(), F.col("rn")), ignorenulls=True).over(w2))
    .withColumn(
        "timegap_to_next",
        F.when(
            F.col("value").isNotNull(),
            F.min(F.when(F.col("value").isNotNull(), F.col("timestamp"))).over(w1) - F.col("timestamp")
        )
    )
)

Now you can do the linear interpolation of the value column, depending on your threshold, using a when expression:

w3 = Window.orderBy("timestamp").rowsBetween(Window.unboundedPreceding, Window.currentRow)

df = df.withColumn(
    "value",
    F.coalesce(
        "value",
        F.when(
            F.last("timegap_to_next", ignorenulls=True).over(w3) < 500000,  # your gap threshold
            F.col("prev_val")
            + (F.col("next_val") - F.col("prev_val"))
            / (F.col("next_rn") - F.col("prev_rn"))
            * (F.col("rn") - F.col("prev_rn"))
        )
    )
).select("timestamp", "value", "timegap_to_next")

df.show()

#+-------------+------+---------------+
#|    timestamp| value|timegap_to_next|
#+-------------+------+---------------+
#|1642205833225|  58.0|          66745|
#|1642205888654|  57.0|           null|
#|1642205899657|  56.0|           null|
#|1642205899970|  55.0|         453682|
#|1642206338180|55.725|           null|
#|1642206353652| 56.45|        1000000|
#|1642206853451|  null|           null|
#|1642207353652| 80.45|           null|
#+-------------+------+---------------+
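
For clarity, the expression inside when is plain linear interpolation between the previous and next non-null rows, weighted by row number. A tiny standalone Python check (not part of the Spark job) against the 55.725 value above:

def lerp_by_row(prev_val, next_val, prev_rn, next_rn, rn):
    """Linear interpolation between two known rows, weighted by row position."""
    return prev_val + (next_val - prev_val) / (next_rn - prev_rn) * (rn - prev_rn)

# one missing row (rn=5) between 55.00 (rn=4) and 56.45 (rn=6)
print(lerp_by_row(55.00, 56.45, 4, 6, 5))   # 55.725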

Interpolating a time series with interp1d using only numpy

The data type of each element in T is np.datetime64, not np.timedelta64.

Thus, convert the elements of T to np.timedelta64 by creating a NumPy array with dtype 'm' (the timedelta64 type code):

T = np.array([
    np.datetime64('2020-01-01T00:00:00.000000000'),
    np.datetime64('2020-01-02T00:00:00.000000000'),
    np.datetime64('2020-01-03T00:00:00.000000000'),
    np.datetime64('2020-01-05T00:00:00.000000000'),
    np.datetime64('2020-01-06T00:00:00.000000000'),
    np.datetime64('2020-01-09T00:00:00.000000000'),
    np.datetime64('2020-01-13T00:00:00.000000000'),
], dtype='m')

Then, as the documentation suggests, we have to pass x and y values that are convertible to floats to scipy.interpolate.interp1d to get an interpolation function. We'll use a method suggested in this answer to do that:

# Get an interpolation function f
f = scipy.interpolate.interp1d(x=T/np.timedelta64(1, 's'), y=Z)

Finally, we can use the interpolation function for plotting as follows:

ax.plot(T, f(T/np.timedelta64(1, 's')), '-')

Combining everything, we get the following output:

[Figure: the sample points with the interpolated line through them]

The full code to reproduce the image:

import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

T = np.array([
    np.datetime64('2020-01-01T00:00:00.000000000'),
    np.datetime64('2020-01-02T00:00:00.000000000'),
    np.datetime64('2020-01-03T00:00:00.000000000'),
    np.datetime64('2020-01-05T00:00:00.000000000'),
    np.datetime64('2020-01-06T00:00:00.000000000'),
    np.datetime64('2020-01-09T00:00:00.000000000'),
    np.datetime64('2020-01-13T00:00:00.000000000'),
], dtype='m')

Z = [543, 234, 435, 765, 564, 235, 345]

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot()
ax.plot(T, Z, 'o')

f = interp1d(x=T/np.timedelta64(1, 's'), y=Z)

ax.plot(T, f(T/np.timedelta64(1, 's')), '-')
plt.show()
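
Since f works in float seconds, it can also be evaluated at new timestamps by applying the same conversion. A small illustration with an arbitrary query time inside the fitted range:

# Arbitrary query time; subtracting the epoch yields a timedelta on the same
# "seconds since epoch" scale that was used to build f above.
t_query = np.datetime64('2020-01-04T12:00:00') - np.datetime64('1970-01-01T00:00:00')
print(f(t_query / np.timedelta64(1, 's')))   # 682.5, between the 435 and 765 samples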

Resample/interpolate time series with DatetimeIndex

A simple new_df = pd.concat((df1, df2), axis=1) retains all information and timestamps, and you can then resample or interpolate new_df however you wish (a sketch of that follows the output below).

In this specific case, you can do:

pd.concat((df1, df2.groupby(df2.index.floor('H')).mean()), axis=1)

Output:

                          a      b      c     d          e
idx
2013-01-01 06:00:00    NaN    NaN    NaN   NaN   9.000000
2013-01-01 07:00:00   0.45  24.33   9.04  0.00   9.000000
2013-01-01 08:00:00   0.55  23.11  11.60  0.06  10.666667
2013-01-01 09:00:00   0.69  27.23  18.18  0.03  12.666667
2013-01-01 10:00:00   0.64  26.58  31.46  0.06  14.666667
2013-01-01 11:00:00   0.36  17.50  42.58  0.29  14.000000
2013-01-01 12:00:00   0.32  15.39  50.30  0.17  14.000000
2013-01-01 13:00:00   0.41  17.73  51.45  0.13  14.000000
2013-01-01 14:00:00   0.50  19.48  50.50  0.05  14.000000
2013-01-01 15:00:00   0.48  18.32  51.51  0.03  13.333333
2013-01-01 16:00:00   0.50  18.49  50.70  0.02  13.000000
2013-01-01 17:00:00   1.13  32.89  40.07  0.20  12.000000
2013-01-01 18:00:00   1.81  59.64  16.59  0.37  11.000000
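
If you would rather keep every timestamp of df2 and fill the hourly columns of df1 onto them, instead of averaging df2 per hour, here is a sketch of that opposite direction (assuming the same df1/df2 as above and that time-based interpolation is acceptable for your data):

import pandas as pd

# Outer-join both frames on the union of their timestamps, then interpolate
# the hourly columns of df1 in time so every df2 timestamp gets a value.
new_df = pd.concat((df1, df2), axis=1).sort_index()
new_df[df1.columns] = new_df[df1.columns].interpolate(method='time')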

Pandas timeseries resampling and interpolating together

d = df.set_index('tstamp')
t = d.index
# one target row per minute over a full day
r = pd.date_range(t.min().date(), periods=24*60, freq='T')

# reindex to the union of original and target timestamps, interpolate by
# index (i.e. by time), then keep only the regular minutely timestamps
d.reindex(t.union(r)).interpolate('index').loc[r]

[Image: output of the code above]


Note: periods=24*60 assumes a full day of minutely data, so it does not match the sample provided in the question; for that sample, periods=6 will work.
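
A self-contained version of the same reindex-and-interpolate idea, with a small made-up frame (timestamps and values are purely illustrative) and periods=6 as suggested above:

import pandas as pd

# made-up irregular data; only the technique matters
df = pd.DataFrame({
    'tstamp': pd.to_datetime(['2017-01-01 00:00:30', '2017-01-01 00:01:10',
                              '2017-01-01 00:02:45', '2017-01-01 00:04:05',
                              '2017-01-01 00:05:20']),
    'val': [1.0, 2.0, 4.0, 3.0, 5.0],
})

d = df.set_index('tstamp')
t = d.index
r = pd.date_range(t.min().date(), periods=6, freq='T')   # six minutely targets

# interpolate on the union of the original and target timestamps,
# then keep only the regular minutely timestamps
print(d.reindex(t.union(r)).interpolate('index').loc[r])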


