Extreme Gradient Boosting with XGBoost
Sergey Fogelson
Head of Data Science, TelevisaUnivision
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import GridSearchCV
housing_data = pd.read_csv("ames_housing_trimmed_processed.csv")
X, y = housing_data[housing_data.columns.tolist()[:-1]], housing_data[housing_data.columns.tolist()[-1]]
housing_dmatrix = xgb.DMatrix(data=X, label=y)
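Note that housing_dmatrix is built here but is not used by the scikit-learn search interfaces below, which work on X and y directly through XGBRegressor. The DMatrix only matters for xgboost's native API; a minimal sketch of cross-validating one fixed configuration that way (the parameter values are illustrative, not tuned):

params = {'objective': 'reg:squarederror', 'learning_rate': 0.1, 'subsample': 0.5}
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=4,
                    num_boost_round=200, metrics='rmse', seed=123)
print(cv_results['test-rmse-mean'].tail(1))  # CV RMSE after the final boosting round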
gbm_param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 0.9],
    'n_estimators': [200],
    'subsample': [0.3, 0.5, 0.9]}
gbm = xgb.XGBRegressor()
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                        scoring='neg_mean_squared_error', cv=4, verbose=1)
grid_mse.fit(X, y)
print("Best parameters found: ",grid_mse.best_params_) print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
Best parameters found: {'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.5}
Lowest RMSE found: 28530.1829341
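The grid above contains 4 * 1 * 3 = 12 parameter combinations, and with cv=4 each one is fit four times, so verbose=1 reports 48 fits in total. Because GridSearchCV refits the best configuration on all of X and y by default (refit=True), the tuned model can be reused directly; a minimal sketch:

best_model = grid_mse.best_estimator_   # XGBRegressor refit with the best parameters
preds = best_model.predict(X)           # in-sample predictions, for illustration only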
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

housing_data = pd.read_csv("ames_housing_trimmed_processed.csv")
X, y = housing_data[housing_data.columns.tolist()[:-1]], housing_data[housing_data.columns.tolist()[-1]]
housing_dmatrix = xgb.DMatrix(data=X, label=y)
gbm_param_grid = {
    'learning_rate': np.arange(0.05, 1.05, 0.05),
    'n_estimators': [200],
    'subsample': np.arange(0.05, 1.05, 0.05)}
gbm = xgb.XGBRegressor()
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,
                                    n_iter=25, scoring='neg_mean_squared_error',
                                    cv=4, verbose=1)
randomized_mse.fit(X, y)
print("Best parameters found: ",randomized_mse.best_params_) print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))
Best parameters found: {'subsample': 0.60000000000000009, 'n_estimators': 200, 'learning_rate': 0.20000000000000001}
Lowest RMSE found: 28300.2374291
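For comparison, exhausting this grid would mean 20 * 1 * 20 = 400 parameter combinations (1,600 fits at cv=4), whereas n_iter=25 samples only 25 of them. RandomizedSearchCV also accepts continuous distributions in place of fixed value lists; a sketch using scipy.stats.uniform, where the distribution bounds are assumptions chosen to mirror the np.arange ranges above:

from scipy.stats import uniform

gbm_param_dist = {
    'learning_rate': uniform(0.05, 0.95),   # samples uniformly from [0.05, 1.0)
    'n_estimators': [200],
    'subsample': uniform(0.05, 0.95)}
randomized_mse = RandomizedSearchCV(estimator=xgb.XGBRegressor(),
                                    param_distributions=gbm_param_dist,
                                    n_iter=25, scoring='neg_mean_squared_error',
                                    cv=4, verbose=1)
randomized_mse.fit(X, y)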