Anomaly Detection in Python
Bekhruz (Bex) Tuychiev
Kaggle Master, Data Science Content Creator
import pandas as pd
# Load the Big Mart sales data from a CSV file into a DataFrame.
big_mart = pd.read_csv("big_mart_sales.csv")
# Print a summary of the frame: row count, columns, non-null counts and dtypes.
big_mart.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7060 entries, 0 to 7059
Data columns (total 5 columns):
# Column Non-Null Count Dtype
0 weight 7060 non-null float64
1 fat_content 7060 non-null object
2 type 7060 non-null object
3 max_retail_price 7060 non-null float64
4 sales 7060 non-null float64
dtypes: float64(3), object(2)
# Replace the frame with its dummy-encoded version: the categorical columns
# become indicator columns (e.g. fat_content_low_fat / fat_content_regular).
big_mart = pd.get_dummies(big_mart)
weight max_retail_price sales fat_content_low_fat fat_content_regular
0 9.30 249.8092 3735.1380 1 0
1 5.92 48.2692 443.4228 0 1
2 17.50 141.6180 2097.2700 1 0
3 19.20 182.0950 732.3800 0 1
4 8.93 53.8614 994.7052 1 0
def evaluate_outlier_classifier(model, data):
    """Fit an outlier classifier on ``data`` and return the inlier rows.

    Args:
        model: Estimator exposing ``fit_predict(data)`` that returns one
            label per row, where ``0`` marks an inlier.
        data: Dataset to fit on; must support boolean-mask indexing
            (e.g. a pandas DataFrame).

    Returns:
        The rows of ``data`` whose predicted label equals ``0``.
    """
    # Get labels
    labels = model.fit_predict(data)

    # Return inliers
    return data[labels == 0]
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
def evaluate_regressor(inliers):
    """Fit a linear regression on ``inliers`` and return its test-set RMSE.

    Args:
        inliers: DataFrame containing a ``sales`` target column; every other
            column is used as a feature.

    Returns:
        Root mean squared error on the held-out split, rounded to 3 decimals.
    """
    X = inliers.drop("sales", axis=1)
    y = inliers[["sales"]]

    # A fixed random_state is passed so the train/test split is seeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    preds = lr.predict(X_test)
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and later removed, so
    # the explicit form works on both old and new releases.
    rmse = mean_squared_error(y_test, preds) ** 0.5

    return round(rmse, 3)
# Candidate contamination values to compare, and the RMSE obtained for each.
contaminations = [0.05, 0.1, 0.2, 0.3]
scores = dict()

# NOTE(review): IForest is not imported in the visible code — presumably
# `from pyod.models.iforest import IForest`; confirm.
for c in contaminations:
    # Instantiate IForest with the current c
    iforest = IForest(contamination=c, random_state=10)

    # Get inliers with the current IForest
    inliers = evaluate_outlier_classifier(iforest, big_mart)

    # Calculate and store RMSE into scores
    scores[c] = evaluate_regressor(inliers)

print(scores)
{0.05: 1148.555, 0.1: 1147.48, 0.2: 1082.307, 0.3: 1029.33}
from itertools import product

# Candidate values for the two hyperparameters being tuned.
estimators = [100, 200, 300]
# NOTE(review): the last entry is the int 1, not the float 1.0 — an estimator
# that treats an integer max_samples as an absolute sample count would draw a
# single sample per tree; confirm 1.0 was not intended.
max_samples = [0.6, 0.8, 1]
# Empty mapping, ready to hold one score per combination.
scores = {}

# Every (estimators, max_samples) pairing: 3 x 3 = 9 combinations.
list(product(estimators, max_samples))
[(100, 0.6),
(100, 0.8),
(100, 1),
(200, 0.6),
(200, 0.8),
(200, 1),
(300, 0.6),
(300, 0.8),
(300, 1)]
estimators = [100, 200, 300]
# NOTE(review): the last entry is the int 1, not the float 1.0 — an estimator
# that treats an integer max_samples as an absolute sample count would draw a
# single sample per tree; confirm 1.0 was not intended.
max_samples = [0.6, 0.8, 1]
scores = dict()

# NOTE(review): no random_state is passed here, so repeated runs may not
# reproduce the same scores — confirm whether a seed should be set.
for e, m in product(estimators, max_samples):
    # Instantiate an IForest
    iforest = IForest(n_estimators=e, max_samples=m, contamination=.3)

    # Get the inliers with the current IForest
    inliers = evaluate_outlier_classifier(iforest, big_mart)

    # Calculate and store RMSE into scores
    scores[(e, m)] = evaluate_regressor(inliers)

print(scores)
{(100, 0.6): 959.398,
(100, 0.8): 986.056,
(100, 1): 1195.875,
(200, 0.6): 947.628,
(200, 0.8): 933.115,
(200, 1): 1195.875,
(300, 0.6): 949.412,
(300, 0.8): 935.962,
(300, 1): 1195.875}
# Faster computation with n_jobs=-1
# NOTE(review): -1 presumably requests all available CPU cores — confirm
# against the IForest documentation.
iforest = IForest(n_estimators=1000, n_jobs=-1)
# Fit the 1000-estimator forest on big_mart.
iforest.fit(big_mart)
Anomaly Detection in Python