Anomaly Detection in Python
Bekhruz (Bex) Tuychiev
Kaggle Master, Data Science Content Creator
# Keep only the rows the fitted detector labeled 1 (PyOD convention: 1 = outlier)
outliers = airbnb_df[iforest.labels_ == 1]
# predict_proba returns one row per sample: column 0 = P(inlier), column 1 = P(outlier)
outlier_probs = iforest.predict_proba(outliers)
# Peek at the first 10 probability pairs
outlier_probs[:10]
array([[0.51999538, 0.48000462],
[0.61789522, 0.38210478],
[0.61802032, 0.38197968],
[0.35184434, 0.64815566],
[0.57533286, 0.42466714],
[0.59038933, 0.40961067],
[0.57677613, 0.42322387],
[0.54158826, 0.45841174],
[0.49118093, 0.50881907],
[0.21387357, 0.78612643]])
Threshold:
google.head()
Open High Low Close Volume day_of_week month day
Date
2006-01-03 211.47 218.05 209.32 217.83 13137450 1 1 3
2006-01-04 222.17 224.70 220.09 222.84 15292353 2 1 4
2006-01-05 223.22 226.00 220.97 225.85 10815661 3 1 5
2006-01-06 228.66 235.49 226.85 233.06 17759521 4 1 6
2006-01-09 233.44 236.94 230.70 233.68 12795837 0 1 9
from sklearn.preprocessing import QuantileTransformer

# Define the cols to be scaled
to_scale = ['Open', 'High', 'Low', 'Close', 'Volume']

# Initiate the transformer; map each feature to a normal distribution so
# distance-based detectors (KNN/LOF) are not dominated by Volume's scale
qt = QuantileTransformer(output_distribution="normal")

# Scale and store the columns back in place
google.loc[:, to_scale] = qt.fit_transform(google[to_scale])
# Create a list of estimators for the ensemble
estimators = [KNN(n_neighbors=20), LOF(n_neighbors=20), IForest()]

# Create an empty array: one column of outlier probabilities per estimator
shape = (len(google), len(estimators))
probability_scores = np.empty(shape=shape)

# Loop over the estimators, fit each, and collect its probabilities
for index, est in enumerate(estimators):
    est.fit(google)
    # predict_proba: column 0 = P(inlier), column 1 = P(outlier)
    probs = est.predict_proba(google)
    # Store only the outlier-probability column
    probability_scores[:, index] = probs[:, 1]
# Average the per-estimator outlier probabilities row-wise (one score per sample)
mean_scores = probability_scores.mean(axis=1)
mean_scores
array([0.20699869, 0.21455413, 0.17166271, ..., 0.31255075, 0.33553513,
0.32217186])
# Row-wise median of the per-estimator outlier probabilities.
# BUG FIX: the original called np.mean here, contradicting the variable name
# (and producing output identical to mean_scores); np.median is the intent.
median_scores = np.median(probability_scores, axis=1)
median_scores
array([0.20699869, 0.21455413, 0.17166271, ..., 0.31255075, 0.33553513,
0.32217186])
# Create a boolean mask with a 75% probability threshold
is_outlier = median_scores > 0.75

# Filter the outliers out of the original frame
outliers = google[is_outlier]
len(outliers)
3
# Full recap: build the ensemble, score every row, filter by threshold.
# Create a list of estimators
estimators = [KNN(n_neighbors=20), LOF(n_neighbors=20), IForest()]
probability_scores = np.empty(shape=(len(google), len(estimators)))

for index, est in enumerate(estimators):
    # Fit and generate probabilities
    est.fit(google)
    probs = est.predict_proba(google)
    # Store the outlier-probability column (index 1)
    probability_scores[:, index] = probs[:, 1]

# Average the scores across estimators
mean_scores = np.mean(probability_scores, axis=1)

# Filter with a 75% threshold
outliers = google[mean_scores > 0.75]
print(len(outliers))
3
Anomaly Detection in Python