Dealing with Missing Data in Python
Suraj Donthi
Deep Learning & Computer Vision Consultant
Color | Color_Red | Color_Green | Color_Blue |
---|---|---|---|
Red | 1 | 0 | 0 |
Green | 0 | 1 | 0 |
Blue | 0 | 0 | 1 |
Red | 1 | 0 | 0 |
Blue | 0 | 0 | 1 |
Blue | 0 | 0 | 1 |
Color | Value |
---|---|
Red | 0 |
Green | 1 |
Blue | 2 |
Red | 0 |
Blue | 2 |
Blue | 2 |
# Load the user-profile data from 'userprofile.csv' into a DataFrame and
# preview its first rows.
users = pd.read_csv('userprofile.csv')
users.head()
   smoker     drink_level  dress_preference  ambience        hijos      activity  budget
0   False      abstemious          informal    family  independent       student  medium
1   False      abstemious          informal    family  independent       student     low
2   False  social drinker            formal    family  independent       student     low
3   False      abstemious          informal    family  independent  professional  medium
4   False      abstemious     no preference    family  independent       student  medium
from sklearn.preprocessing import OrdinalEncoder
# Ordinal-encode the 'ambience' column while leaving missing entries missing,
# so they can be imputed later.

# Create Ordinal Encoder
ambience_ord_enc = OrdinalEncoder()

# Select non-null values in ambience (nulls are excluded from fitting)
ambience = users['ambience']
ambience_not_null = ambience[ambience.notnull()]

# The encoder works on 2-D input, so turn the values into a single column
reshaped_vals = ambience_not_null.values.reshape(-1, 1)

# Encode the non-null values of ambience
encoded_vals = ambience_ord_enc.fit_transform(reshaped_vals)

# Replace the non-null entries of the ambience column with ordinal values;
# squeeze drops the extra column axis so the values line up with the rows
# selected by the mask
users.loc[ambience.notnull(), 'ambience'] = np.squeeze(encoded_vals)
# Ordinal-encode every column of `users`, keeping one fitted encoder per
# column so the encoding can be reversed after imputation.

# Create dictionary for Ordinal encoders
ordinal_enc_dict = {}

# Loop over columns to encode
for col_name in users:
    # Create ordinal encoder for the column
    ordinal_enc_dict[col_name] = OrdinalEncoder()
    col = users[col_name]

    # Select the non-null values in the column (nulls stay untouched)
    col_not_null = col[col.notnull()]

    # The encoder works on 2-D input, so turn the values into a single column
    reshaped_vals = col_not_null.values.reshape(-1, 1)

    # Encode the non-null values of the column
    encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)

    # Replace the non-null values in the column with ordinal values
    users.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
# Impute the missing (ordinal-encoded) values with KNN, then map every column
# back to its original categories using the per-column encoders.

# Work on a deep copy so the encoded `users` DataFrame is left as it is
users_KNN_imputed = users.copy(deep=True)

# Create KNN imputer
# NOTE(review): `KNN` is not imported in the visible code; presumably it is
# fancyimpute's KNN -- confirm.
KNN_imputer = KNN()

# Impute all columns at once and round the result, since the imputed values
# have to be whole numbers to be decoded back into categories
users_KNN_imputed.iloc[:, :] = np.round(KNN_imputer.fit_transform(users))

# Loop over the columns and convert the ordinal values back to categories
for col_name in users_KNN_imputed:
    # Reshape the values to 2-dimensions (one column), which is the input
    # shape the encoder's inverse_transform works on
    reshaped = users_KNN_imputed[col_name].values.reshape(-1, 1)
    users_KNN_imputed[col_name] = \
        ordinal_enc_dict[col_name].inverse_transform(reshaped)
Steps to impute categorical values
Dealing with Missing Data in Python