Intermediate Regression with statsmodels in Python
Maarten Van den Broeck
Content Developer at DataCamp
# Show the distinct values found in the "species" column of `fish`.
print(fish["species"].unique())
array(['Bream', 'Roach', 'Perch', 'Pike'], dtype=object)
# Build one subset of `fish` per species by keeping only the rows whose
# "species" value equals that species' name.
bream = fish.loc[fish["species"] == "Bream"]
perch = fish.loc[fish["species"] == "Perch"]
pike = fish.loc[fish["species"] == "Pike"]
roach = fish.loc[fish["species"] == "Roach"]
# Regress mass on length using only the bream rows, then display the
# fitted coefficients (the intercept and the length_cm term).
mdl_bream = ols(
    "mass_g ~ length_cm",
    data=bream,
).fit()
print(mdl_bream.params)
Intercept -1035.3476
length_cm 54.5500
# Regress mass on length using only the perch rows, then display the
# fitted coefficients (the intercept and the length_cm term).
mdl_perch = ols(
    "mass_g ~ length_cm",
    data=perch,
).fit()
print(mdl_perch.params)
Intercept -619.1751
length_cm 38.9115
# Regress mass on length using only the pike rows, then display the
# fitted coefficients (the intercept and the length_cm term).
mdl_pike = ols(
    "mass_g ~ length_cm",
    data=pike,
).fit()
print(mdl_pike.params)
Intercept -1540.8243
length_cm 53.1949
# Regress mass on length using only the roach rows, then display the
# fitted coefficients (the intercept and the length_cm term).
mdl_roach = ols(
    "mass_g ~ length_cm",
    data=roach,
).fit()
print(mdl_roach.params)
Intercept -329.3762
length_cm 23.3193
# Grid of lengths to predict at: 5 through 60 in steps of 5 (the stop
# value 61 is exclusive, so 60 is the last entry).
explanatory_data = pd.DataFrame(data={"length_cm": np.arange(5, 61, 5)})
print(explanatory_data)
length_cm
0 5
1 10
2 15
3 20
4 25
5 30
6 35
7 40
8 45
9 50
10 55
11 60
# For each species-specific model, predict mass at every length in
# `explanatory_data`, and tag the resulting rows with the species name.
# Each result has the columns length_cm, mass_g, species (in that order).
prediction_data_bream = explanatory_data.assign(
    mass_g=mdl_bream.predict(explanatory_data),
    species="Bream",
)

prediction_data_perch = explanatory_data.assign(
    mass_g=mdl_perch.predict(explanatory_data),
    species="Perch",
)

prediction_data_pike = explanatory_data.assign(
    mass_g=mdl_pike.predict(explanatory_data),
    species="Pike",
)

prediction_data_roach = explanatory_data.assign(
    mass_g=mdl_roach.predict(explanatory_data),
    species="Roach",
)
# Stack the per-species predictions into one DataFrame. The pieces are
# concatenated as-is, so each keeps its own row labels and the index values
# repeat from one species to the next.
prediction_data = pd.concat(
    [
        prediction_data_bream,
        prediction_data_roach,
        prediction_data_perch,
        prediction_data_pike,
    ]
)
# Print the combined table explicitly so the result is actually displayed,
# matching the other snippets, which print what they show.
print(prediction_data)
length_cm mass_g species
0 5 -762.597660 Bream
1 10 -489.847756 Bream
2 15 -217.097851 Bream
3 20 55.652054 Bream
4 25 328.401958 Bream
5 30 601.151863 Bream
...
3 20 -476.926955 Pike
4 25 -210.952626 Pike
5 30 55.021703 Pike
6 35 320.996032 Pike
7 40 586.970362 Pike
8 45 852.944691 Pike
9 50 1118.919020 Pike
10 55 1384.893349 Pike
11 60 1650.867679 Pike
# Plot mass against length from `fish`, grouped by species via `hue`,
# with confidence intervals turned off (ci=None).
sns.lmplot(
    data=fish,
    x="length_cm",
    y="mass_g",
    hue="species",
    ci=None,
)
plt.show()
# Draw the fish data grouped by species, with confidence intervals
# turned off for the lmplot.
sns.lmplot(
    x="length_cm",
    y="mass_g",
    data=fish,
    hue="species",
    ci=None,
)

# Overlay the predictions from the separately fitted per-species models.
# `ci` is deliberately not passed: scatterplot draws no confidence interval,
# older seaborn documented the argument as non-functional, and newer releases
# no longer accept it (the stray keyword is forwarded to matplotlib and
# raises). The legend is suppressed so species are not listed twice.
sns.scatterplot(
    x="length_cm",
    y="mass_g",
    data=prediction_data,
    hue="species",
    legend=False,
)
plt.show()
# Fit a single model to all species at once, with mass explained by
# length plus species, then report its adjusted R-squared.
mdl_fish = ols("mass_g ~ length_cm + species", data=fish).fit()
print(mdl_fish.rsquared_adj)
0.917
# Adjusted R-squared of each species-specific model. The line after each
# print is the value captured from that call.
# Bream-only model:
print(mdl_bream.rsquared_adj)
0.874
# Perch-only model:
print(mdl_perch.rsquared_adj)
0.917
# Pike-only model:
print(mdl_pike.rsquared_adj)
0.941
# Roach-only model:
print(mdl_roach.rsquared_adj)
0.815
# Residual standard error of each model, computed as the square root of the
# fit's residual mean squared error (`mse_resid`). The line after each print
# is the value captured from that call.
# Model fitted on all species together:
print(np.sqrt(mdl_fish.mse_resid))
103
# Bream-only model:
print(np.sqrt(mdl_bream.mse_resid))
74.2
# Perch-only model:
print(np.sqrt(mdl_perch.mse_resid))
100
# Pike-only model:
print(np.sqrt(mdl_pike.mse_resid))
120
# Roach-only model:
print(np.sqrt(mdl_roach.mse_resid))
38.2
Intermediate Regression with statsmodels in Python