Extreme Gradient Boosting with XGBoost
Sergey Fogelson
Head of Data Science, TelevisaUnivision
# Tune an XGBoost regressor inside a scikit-learn Pipeline with
# RandomizedSearchCV on the Boston housing data, then report the best
# cross-validated RMSE and the best fitted pipeline.
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Column names for the headerless Boston housing CSV; the last column
# ("med_price") is the regression target.
names = ["crime", "zone", "industry", "charles", "no",
         "rooms", "age", "distance", "radial", "tax",
         "pupil", "aam", "lower", "med_price"]
data = pd.read_csv("boston_housing.csv", names=names)
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# BUG FIX: the transcript had `Pipeline[...]` (square brackets), which
# subscripts the class and raises a TypeError. Pipeline must be *called*
# with a list of (name, estimator) steps.
xgb_pipeline = Pipeline([("st_scaler", StandardScaler()),
                         ("xgb_model", xgb.XGBRegressor())])

# Search distributions; keys use the `<step_name>__<param>` convention so
# they address the XGBRegressor step inside the pipeline.
gbm_param_grid = {
    'xgb_model__subsample': np.arange(.05, 1, .05),
    'xgb_model__max_depth': np.arange(3, 20, 1),
    'xgb_model__colsample_bytree': np.arange(.1, 1.05, .05),
}

# Randomized search: 10 sampled configurations, 4-fold cross-validation,
# scored with negative MSE (scikit-learn maximizes scores, hence negated).
randomized_neg_mse = RandomizedSearchCV(estimator=xgb_pipeline,
                                        param_distributions=gbm_param_grid,
                                        n_iter=10,
                                        scoring='neg_mean_squared_error',
                                        cv=4)

randomized_neg_mse.fit(X, y)

# best_score_ is a negative MSE; take abs then sqrt to report RMSE.
print("Best rmse: ", np.sqrt(np.abs(randomized_neg_mse.best_score_)))
print("Best model: ", randomized_neg_mse.best_estimator_)
Best rmse: 3.9966784203040677
print("Best model: ", randomized_neg_mse.best_estimator_)
Best model: Pipeline(steps=[('st_scaler', StandardScaler(copy=True,
with_mean=True, with_std=True)),
('xgb_model', XGBRegressor(base_score=0.5, colsample_bylevel=1,
colsample_bytree=0.95000000000000029, gamma=0, learning_rate=0.1,
max_delta_step=0, max_depth=8, min_child_weight=1, missing=None,
n_estimators=100, nthread=-1, objective='reg:squarederror', reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
subsample=0.90000000000000013))])
Extreme Gradient Boosting with XGBoost