Machine Learning for Time Series Data in Python
Chris Holdgraf
Fellow, Berkeley Institute for Data Science
# Boolean mask that is True wherever a value is missing
missing = prices.isna()

# Linearly interpolate across the missing values
prices_interp = prices.interpolate(method='linear')

# Draw the interpolated data in red, then overlay the original data
# (which still contains the missing values) in black on the same axes
ax = prices_interp.plot(c='r')
prices.plot(ax=ax, c='k', lw=2)
def percent_change(values):
    """Return the % change between the last value and the mean of the rest.

    Parameters
    ----------
    values : array-like
        One-dimensional sequence of numbers (list, NumPy array, pandas
        Series, ...). The last element is compared against the mean of
        every element that precedes it.

    Returns
    -------
    float
        ``(last - mean(previous)) / mean(previous)``. With a single
        element there are no previous values, so the mean (and therefore
        the result) is ``nan``.

    Raises
    ------
    IndexError
        If ``values`` is empty.
    """
    # Work on a plain ndarray so that [-1] and [:-1] are always positional.
    # On a pandas Series, [-1] is a *label* lookup and raises KeyError for
    # an integer index (and is deprecated for other index types).
    values = np.asarray(values)

    # Separate the last value from the mean of all previous values
    previous_mean = np.mean(values[:-1])
    last_value = values[-1]

    # Relative difference between the last value and the earlier mean
    return (last_value - previous_mean) / previous_mean
# Plot the raw data on the left-hand axes
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
ax = prices.plot(ax=axs[0])

# Calculate the % change over a rolling 20-sample window. The result is kept
# in a named variable because the outlier plots further down read
# `prices_perc_change`.
prices_perc_change = prices.rolling(window=20).aggregate(percent_change)
ax = prices_perc_change.plot(ax=axs[1])

# Hide the legend on the % change plot. `get_legend()` is the public
# accessor and returns None when no legend was drawn, so guard against that
# instead of dereferencing `ax.legend_` directly.
legend = ax.get_legend()
if legend is not None:
    legend.set_visible(False)
Be very careful about doing this - often it is difficult to determine what is a legitimately extreme value vs. an aberration.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for data, ax in zip([prices, prices_perc_change], axs):
    # Mean and standard deviation of this dataset
    this_mean = data.mean()
    this_std = data.std()
    band = this_std * 3

    # Plot the data itself ...
    data.plot(ax=ax)

    # ... and mark 3 standard deviations above and below the mean
    # with dashed red horizontal lines
    for level in (this_mean + band, this_mean - band):
        ax.axhline(level, ls='--', c='r')
# Subtract the mean so the data is centered on 0
outlier_mean = prices_outlier_perc.mean()
prices_outlier_centered = prices_outlier_perc - outlier_mean

# Standard deviation of the data
std = prices_outlier_perc.std()

# A datapoint is an outlier when its absolute (centered) value
# is more than 3 standard deviations
threshold = std * 3
outliers = prices_outlier_centered.abs() > threshold

# Replace outliers with the median value.
# np.nanmedian is used since there may be nans around the outliers.
median_value = np.nanmedian(prices_outlier_centered)
prices_outlier_fixed = prices_outlier_centered.copy()
prices_outlier_fixed[outliers] = median_value
# Show the centered data (left) next to the outlier-corrected data (right)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for panel, series in zip(axs, (prices_outlier_centered, prices_outlier_fixed)):
    series.plot(ax=panel)
Machine Learning for Time Series Data in Python