Supervised Learning with scikit-learn
George Boorman
Core Curriculum Manager, DataCamp
Some guiding principles
Regression model performance: RMSE, R-squared
Classification model performance: accuracy, confusion matrix, precision, recall, F1-score (a short sketch of computing these metrics follows)
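A minimal sketch (not from the slides) of where these metrics live in scikit-learn, using synthetic data from make_regression and make_classification purely for illustration:

from sklearn.datasets import make_regression, make_classification
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Regression metrics: RMSE and R-squared
X, y = make_regression(n_samples=200, n_features=5, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("RMSE: {:.2f}".format(mean_squared_error(y_test, y_pred) ** 0.5))
print("R-squared: {:.2f}".format(r2_score(y_test, y_pred)))

# Classification metrics: accuracy, confusion matrix, precision, recall, F1-score
X, y = make_classification(n_samples=200, n_features=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))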
Train several models and evaluate performance out of the box
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
X = music.drop("genre", axis=1).values
y = music["genre"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
models = {"Logistic Regression": LogisticRegression(),
          "KNN": KNeighborsClassifier(),
          "Decision Tree": DecisionTreeClassifier()}
results = []
for model in models.values():
    kf = KFold(n_splits=6, random_state=42, shuffle=True)
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
    results.append(cv_results)
plt.boxplot(results, labels=models.keys())
plt.show()
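A small hedged addition (not in the slides): printing each model's mean and standard deviation of the cross-validation scores alongside the boxplot makes the comparison explicit; this assumes the results list and models dictionary defined above.

import numpy as np

# Mean and spread of the 6 cross-validation scores per model
for name, cv_results in zip(models.keys(), results):
    print("{}: mean={:.3f}, std={:.3f}".format(name, np.mean(cv_results), np.std(cv_results)))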
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)
    print("{} Test Set Accuracy: {}".format(name, test_score))
Logistic Regression Test Set Accuracy: 0.844
KNN Test Set Accuracy: 0.82
Decision Tree Test Set Accuracy: 0.832
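As a hedged follow-up (not part of the original slides), the same loop can store the test scores so the strongest out-of-the-box model is selected programmatically; this assumes the models dictionary and the scaled train/test splits defined above.

# Collect each model's test set accuracy, then pick the best performer
test_scores = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    test_scores[name] = model.score(X_test_scaled, y_test)

best_name = max(test_scores, key=test_scores.get)
print("Best out of the box: {} ({:.3f})".format(best_name, test_scores[best_name]))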