Winning a Kaggle Competition in Python
Yauhen Babakhin
Kaggle Grandmaster
# pandas supplies read_csv; the `pd` alias is used by the snippets that follow
import pandas as pd

# Read train data
taxi_train = pd.read_csv('taxi_train.csv')
# List the column names (shown as cell output in an interactive session)
taxi_train.columns.to_list()
['key',
'fare_amount',
'pickup_datetime',
'pickup_longitude',
'pickup_latitude',
'dropoff_longitude',
'dropoff_latitude',
'passenger_count']
import matplotlib.pyplot as plt

# Draw the fare_amount column as a 30-bin, half-transparent histogram,
# then render the figure
fare_values = taxi_train['fare_amount']
fare_values.hist(bins=30, alpha=0.5)
plt.show()
from sklearn.linear_model import LinearRegression

# Select features once, before fitting, so that training and prediction are
# guaranteed to use the same columns in the same order
features = ['pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude',
            'passenger_count']

# Create a LinearRegression object
lr = LinearRegression()

# Fit the model on the train data
lr.fit(X=taxi_train[features], y=taxi_train['fare_amount'])

# Make predictions on the test data
# NOTE(review): taxi_test is not loaded anywhere in the visible code -- it has
# to be read into a DataFrame containing the feature columns before this line.
taxi_test['fare_amount'] = lr.predict(taxi_test[features])
# Load the sample submission file into a DataFrame
sample_submission_path = 'taxi_sample_submission.csv'
taxi_sample_sub = pd.read_csv(sample_submission_path)
# Look at its first row only
taxi_sample_sub.head(n=1)
key fare_amount
0 2015-01-27 13:08:24.0000002 11.35
# Keep only the two columns that make up the submission
submission_columns = ['key', 'fare_amount']
taxi_submission = taxi_test[submission_columns]
# Write the submission to disk as CSV, leaving out the DataFrame index
taxi_submission.to_csv('first_sub.csv', index=False)
Winning a Kaggle Competition in Python