Anomaly Detection in Python
Bekhruz (Bex) Tuychiev
Kaggle Master, Data Science Content Creator
import pandas as pd
# Load the Google stock prices from google.csv using pandas' default parsing.
google = pd.read_csv("google.csv")
# Preview the first rows of the loaded frame (head() defaults to five rows).
google.head()
Open High Low Close Volume
Date
2006-01-03 211.47 218.05 209.32 217.83 13137450
2006-01-04 222.17 224.70 220.09 222.84 15292353
2006-01-05 223.22 226.00 220.97 225.85 10815661
2006-01-06 228.66 235.49 226.85 233.06 17759521
2006-01-09 233.44 236.94 230.70 233.68 12795837
# Inspect how pandas stored the Date column after the plain read_csv call.
date_dtype = google["Date"].dtype
print(date_dtype)
object
# Parse the Date column into datetimes, write it back, then list every
# column's dtype to confirm the conversion.
parsed_dates = pd.to_datetime(google["Date"])
google["Date"] = parsed_dates
print(google.dtypes)
Date datetime64[ns]
Open float64
High float64
...
# Derive calendar features from the datetime Date column via the .dt accessor.
# Each assignment must be its own statement; the columns are created in the
# order day_of_week, month, day_of_month so the frame matches the printed
# sample below.
google['day_of_week'] = google['Date'].dt.day_of_week  # Monday=0 ... Sunday=6
google['month'] = google['Date'].dt.month
google['day_of_month'] = google['Date'].dt.day

# Show five randomly chosen rows to check the new columns.
google.sample(5)
Date Open Low Close Volume day_of_week month day_of_month
2016-02-29 721.00 716.84 717.22 2237474 0 2 29
2007-01-24 242.46 241.89 249.78 6074077 2 1 24
2007-05-24 237.81 235.99 237.40 4200474 3 5 24
2008-09-16 213.19 212.96 221.69 6991767 1 9 16
2008-03-31 218.04 216.22 220.46 4446368 0 3 31
# Move Date from a regular column into the index, modifying the frame in place,
# so rows can be selected by date label.
google.set_index("Date", inplace=True)
google.head()
Open High Low Close Volume day_of_week month \
Date
2006-01-03 211.47 218.05 209.32 217.83 13137450 1 1
2006-01-04 222.17 224.70 220.09 222.84 15292353 2 1
2006-01-05 223.22 226.00 220.97 225.85 10815661 3 1
2006-01-06 228.66 235.49 226.85 233.06 17759521 4 1
2006-01-09 233.44 236.94 230.70 233.68 12795837 0 1
# Partial-string slicing on the datetime index: year-only endpoints select
# every row from the start of 2008 through the end of 2010.
google["2008": "2010"].head()
Open High Low Close Volume day_of_week month \
Date
2008-01-02 346.78 349.03 339.20 342.94 4306848 2 1
2008-01-03 342.97 343.77 338.60 343.01 3252846 3 1
2008-01-04 340.18 340.82 327.83 328.83 5359834 4 1
2008-01-07 327.30 331.47 318.99 324.95 6404945 0 1
2008-01-08 326.83 330.31 315.82 316.16 5341949 1 1
# The two endpoints may use different resolutions: a year-month start
# ("2012-03") and a full-date end ("2015-10-04").
google["2012-03": "2015-10-04"].head()
Open High Low Close Volume day_of_week month \
Date
2012-03-01 311.44 313.16 309.38 311.51 2238010 3 3
2012-03-02 311.31 312.31 310.47 310.94 1573214 4 3
2012-03-05 310.53 311.56 306.00 307.43 1593250 0 3
2012-03-06 304.33 304.71 297.22 302.78 3175216 1 3
2012-03-07 304.83 305.90 303.23 303.70 1264892 2 3
# Reload the CSV, parsing Date as datetimes and using it as the index in a
# single read_csv call.
google = pd.read_csv("google.csv", parse_dates=["Date"], index_col="Date")
google.head()
Open High Low Close Volume
Date
2006-01-03 211.47 218.05 209.32 217.83 13137450
2006-01-04 222.17 224.70 220.09 222.84 15292353
2006-01-05 223.22 226.00 220.97 225.85 10815661
2006-01-06 228.66 235.49 226.85 233.06 17759521
2006-01-09 233.44 236.94 230.70 233.68 12795837
import matplotlib.pyplot as plt
# Line plot of the closing price over the whole dataset, with a two-line title.
closing_prices = google["Close"]
closing_prices.plot(color="red")
plt.title("Closing prices of Google\nstocks from 2006 to 2018.")
plt.show()
# Restrict the frame to the start of 2010 through 2010-07-01 and plot only
# the closing price for that window.
first_half_2010 = google["2010":"2010-07-01"]
first_half_2010["Close"].plot(color="green")
plt.title("Closing prices of Google stocks from January 2010 to July 2010.")
plt.show()
# Plot the traded volume on a wide figure.
google['Volume'].plot(color='red', figsize=(12, 4))
plt.title("The number of traded Google stocks from 2006 to 2018.")
# Render the figure explicitly, as the other plotting snippets do; outside a
# notebook nothing is displayed without this call.
plt.show()
from pyod.models.mad import MAD

# Fit the Median Absolute Deviation detector on Volume only. The double
# brackets keep the selection a one-column DataFrame (2-D input) rather than
# a Series.
mad = MAD().fit(google[['Volume']])

# labels_ holds the training labels; rows flagged 1 are the outliers.
is_outlier = mad.labels_ == 1
print(len(google[is_outlier]))
236
# Derive the calendar features straight from the datetime index; the index
# exposes these attributes directly, without the .dt accessor.
google['day_of_week'] = google.index.day_of_week
google['month'] = google.index.month
google['day_of_month'] = google.index.day
google.head()
Open High Low Close Volume day_of_week month \
Date
2006-01-03 211.47 218.05 209.32 217.83 13137450 1 1
2006-01-04 222.17 224.70 220.09 222.84 15292353 2 1
2006-01-05 223.22 226.00 220.97 225.85 10815661 3 1
2006-01-06 228.66 235.49 226.85 233.06 17759521 4 1
2006-01-09 233.44 236.94 230.70 233.68 12795837 0 1
from pyod.models.iforest import IForest

# Fit an Isolation Forest on every column of the frame (prices, volume and
# the calendar features).
iforest = IForest().fit(google)

# Generate probabilities
probs = iforest.predict_proba(google)

# Isolate the outliers: column 1 of probs is compared against a 0.75
# threshold, and rows above it are kept as outliers.
is_outlier = probs[:, 1] > 0.75
outliers = google[is_outlier]
print(len(outliers))
60
Anomaly Detection in Python