Winning a Kaggle Competition in Python
Yauhen Babakhin
Kaggle Grandmaster
import pandas as pd

# Read data
taxi_train = pd.read_csv('taxi_train.csv')
taxi_test = pd.read_csv('taxi_test.csv')

from sklearn.model_selection import train_test_split

# Create local validation: hold out 30% of the training rows so models can be
# scored locally; the fixed random_state keeps the split reproducible.
validation_train, validation_test = train_test_split(
    taxi_train,
    test_size=0.3,
    random_state=123,
)
import numpy as np

# Baseline 1: assign the mean training fare amount to all the test observations
taxi_test['fare_amount'] = np.mean(taxi_train.fare_amount)

# Write predictions to the file
taxi_test[['id','fare_amount']].to_csv('mean_sub.csv', index=False)
Validation RMSE | Public LB RMSE | Public LB Position |
---|---|---|
9.986 | 9.409 | 1449 / 1500 |
# Baseline 2: calculate the mean fare amount by passenger_count group
naive_prediction_groups = taxi_train.groupby('passenger_count').fare_amount.mean()

# Make predictions on the test set by looking up each row's group mean.
# NOTE: Series.map yields NaN for any passenger_count value that does not
# appear in taxi_train, so check for missing predictions before submitting.
taxi_test['fare_amount'] = taxi_test.passenger_count.map(naive_prediction_groups)

# Write predictions to the file
taxi_test[['id','fare_amount']].to_csv('mean_group_sub.csv', index=False)
Validation RMSE | Public LB RMSE | Public LB Position |
---|---|---|
9.978 | 9.407 | 1411 / 1500 |
# Select only numeric features
features = ['pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

from sklearn.ensemble import GradientBoostingRegressor

# Train a Gradient Boosting model (default hyperparameters) on the full
# training set, using the selected features to predict fare_amount
gb = GradientBoostingRegressor()
gb.fit(taxi_train[features], taxi_train.fare_amount)

# Make predictions on the test data
taxi_test['fare_amount'] = gb.predict(taxi_test[features])

# Write predictions to the file
taxi_test[['id','fare_amount']].to_csv('gb_sub.csv', index=False)
Validation RMSE | Public LB RMSE | Public LB Position |
---|---|---|
5.996 | 4.595 | 1109 / 1500 |
Model | Validation RMSE | Public LB RMSE |
---|---|---|
Simple Mean | 9.986 | 9.409 |
Group Mean | 9.978 | 9.407 |
Gradient Boosting | 5.996 | 4.595 |
Model | Validation RMSE | Public LB RMSE |
---|---|---|
Model A | 3.500 | 3.800 |
Model B | 3.300 | 4.100 |
Model C | 3.200 | 3.900 |
Model | Validation RMSE | Public LB RMSE |
---|---|---|
Model A | 3.400 | 3.900 |
Model B | 3.100 | 3.400 |
Model C | 2.900 | 3.300 |
Winning a Kaggle Competition in Python