Dealing with Missing Data in Python
Suraj Donthi
Deep Learning & Computer Vision Consultant
from numpy.random import rand
# Boolean mask of missing BMI entries and the number of them
BMI_null = diabetes['BMI'].isnull()
num_nulls = BMI_null.sum()
# Generate random values, one per missing entry
dummy_values = rand(num_nulls)
from numpy.random import rand
# Mask of the rows where BMI is missing, and how many such rows exist
BMI_null = diabetes['BMI'].isna()
num_nulls = BMI_null.sum()
# Draw one random value in [0, 1) per missing entry
dummy_values = rand(num_nulls)
# Move the draws down by 2 so they lie between -2 and -1
dummy_values -= 2
from numpy.random import rand
# Boolean mask of missing BMI entries and the number of them
BMI_null = diabetes['BMI'].isnull()
num_nulls = BMI_null.sum()
# Generate random values
dummy_values = rand(num_nulls)
# Shift to -2 & -1
dummy_values = dummy_values - 2
# Scale to 0.075 of Column Range
# (the column is read from the dataframe; no standalone `BMI` name exists)
BMI_range = diabetes['BMI'].max() - diabetes['BMI'].min()
dummy_values = dummy_values * 0.075 * BMI_range
from numpy.random import rand
# Boolean mask of missing BMI entries and the number of them
BMI_null = diabetes['BMI'].isnull()
num_nulls = BMI_null.sum()
# Generate random values
dummy_values = rand(num_nulls)
# Shift to -2 & -1
dummy_values = dummy_values - 2
# Scale to 0.075 of Column Range
# (the column is read from the dataframe; no standalone `BMI` name exists)
BMI_range = diabetes['BMI'].max() - diabetes['BMI'].min()
dummy_values = dummy_values * 0.075 * BMI_range
# Shift to Column Minimum
# All steps combined in one expression; the parentheses let it span two lines.
dummy_values = ((rand(num_nulls) - 2)
                * 0.075 * BMI_range + diabetes['BMI'].min())
from numpy.random import rand
def fill_dummy_values(df, scaling_factor=0.075):
    """Return a copy of ``df`` with missing values replaced by dummy values.

    For each column that contains nulls, every null is replaced by a random
    value computed as ``(rand - 2) * scaling_factor * range + min`` of that
    column, which places the dummy values in a band below the column
    minimum (for a positive ``scaling_factor`` and a non-zero range).

    Args:
        df: DataFrame to process; it is not modified. Columns that contain
            nulls must support ``max() - min()``.
        scaling_factor: Fraction of each column's range used to scale the
            dummy values. Defaults to 0.075.

    Returns:
        A deep copy of ``df`` with the nulls of each processed column
        replaced by dummy values.
    """
    # Work on a deep copy so the caller's dataframe is left untouched.
    df_dummy = df.copy(deep=True)
    # Iterate over each column label
    for col_name in df_dummy:
        # Get column, column missing values and range
        col = df_dummy[col_name]
        col_null = col.isnull()
        num_nulls = col_null.sum()
        if num_nulls == 0:
            # Nothing to fill in this column.
            continue
        col_min = col.min()
        col_range = col.max() - col_min
        # Shift and scale dummy values: rand() - 2 lies between -2 and -1.
        dummy_values = (rand(num_nulls) - 2) * scaling_factor * col_range + col_min
        # Write through .loc on the dataframe itself; assigning into the
        # extracted Series is chained assignment and may not reach df_dummy.
        df_dummy.loc[col_null, col_name] = dummy_values
    return df_dummy
# Create dummy dataframe (dummy values scaled to 0.075 of each column's range)
diabetes_dummy = fill_dummy_values(diabetes, scaling_factor=0.075)
# Get missing values of both columns for coloring
nullity = diabetes.Serum_Insulin.isnull() | diabetes.BMI.isnull()
# Generate scatter plot
diabetes_dummy.plot(x='Serum_Insulin', y='BMI', kind='scatter', alpha=0.5,
                    c=nullity, cmap='rainbow')
Note: For boolean pandas Series, the element-wise sum (+) is equivalent to the logical OR (|).
Dealing with Missing Data in Python