Dealing with Missing Data in Python
Suraj Donthi
Deep Learning & Computer Vision Consultant
from sklearn.impute import SimpleImputer
# Work on a deep copy so the imputation never modifies the original
# `diabetes` frame.
diabetes_mean = diabetes.copy(deep=True)
# Imputer configured to fill missing values with each column's mean.
mean_imputer = SimpleImputer(strategy='mean')
from sklearn.impute import SimpleImputer
# Mean imputation: missing values are replaced with the mean of their column.
# Work on a deep copy so the original `diabetes` frame is left untouched.
diabetes_mean = diabetes.copy(deep=True)
mean_imputer = SimpleImputer(strategy='mean')
# fit_transform learns the per-column fill values and returns the imputed
# data as an array; assigning through .iloc[:, :] writes it back into the
# copy while keeping the frame's existing index and column labels.
diabetes_mean.iloc[:, :] = mean_imputer.fit_transform(diabetes_mean)
# Median imputation: missing values are replaced with the median of their
# column. Work on a deep copy so the original `diabetes` frame is untouched.
diabetes_median = diabetes.copy(deep=True)
median_imputer = SimpleImputer(strategy='median')
# Write the imputed array back in place so the index and column labels of
# the copy are preserved.
diabetes_median.iloc[:, :] = median_imputer.fit_transform(diabetes_median)
# Mode imputation: missing values are replaced with the most frequent value
# of their column. Work on a deep copy so the original `diabetes` frame is
# untouched.
diabetes_mode = diabetes.copy(deep=True)
mode_imputer = SimpleImputer(strategy='most_frequent')
# Write the imputed array back in place so the index and column labels of
# the copy are preserved.
diabetes_mode.iloc[:, :] = mode_imputer.fit_transform(diabetes_mode)
# Constant imputation: every missing value is replaced with one fixed
# placeholder (0 here) instead of a statistic computed from the column.
# Work on a deep copy so the original `diabetes` frame is left untouched.
diabetes_constant = diabetes.copy(deep=True)
constant_imputer = SimpleImputer(strategy='constant', fill_value=0)
# fit_transform returns the imputed data as an array; assigning through
# .iloc[:, :] writes it back into the copy while keeping the frame's
# existing index and column labels.
diabetes_constant.iloc[:, :] = constant_imputer.fit_transform(diabetes_constant)
# Flag the rows where either plotted column was missing in the ORIGINAL
# data, so imputed points can be coloured differently from observed ones.
# `|` is the element-wise OR for boolean Series; arithmetic `+` on booleans
# gives the same mask but can make pandas emit a fallback warning when
# numexpr is in use.
nullity = diabetes['Serum_Insulin'].isnull() | diabetes['Glucose'].isnull()
# Scatter plot of the mean-imputed frame, coloured by the nullity mask.
diabetes_mean.plot(x='Serum_Insulin', y='Glucose', kind='scatter', alpha=0.5,
                   c=nullity, cmap='rainbow', title='Mean Imputation')
# 2x2 grid of axes: one scatter plot per imputation strategy, so the four
# results can be compared side by side.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
# Rows where either plotted column was missing in the ORIGINAL data; used to
# colour imputed points differently. `|` is the element-wise OR for boolean
# Series.
nullity = diabetes['Serum_Insulin'].isnull() | diabetes['Glucose'].isnull()
# Plot title -> imputed DataFrame. Dict insertion order decides which axes
# each strategy is drawn on.
imputations = {
    'Mean Imputation': diabetes_mean,
    'Median Imputation': diabetes_median,
    'Most Frequent Imputation': diabetes_mode,
    'Constant Imputation': diabetes_constant,
}
# axes.flatten() turns the 2x2 array of axes into a flat sequence so it can
# be paired one-to-one with the dictionary keys.
for ax, df_key in zip(axes.flatten(), imputations):
    imputations[df_key].plot(
        x='Serum_Insulin', y='Glucose', kind='scatter', alpha=0.5,
        c=nullity, cmap='rainbow', ax=ax, colorbar=False, title=df_key,
    )
You learned to
Dealing with Missing Data in Python