Handling Missing Data with Imputations in R
Michal Oleszak
Machine Learning Engineer
The model for each variable depends on the type of this variable:
Impute `Height` and `Weight` in `nhanes` with a linear model:
library(simputation)
# Impute Height and Weight in a single call, each with a linear model
# built from the formula's right-hand side.
nhanes_imp <- impute_lm(dat = nhanes, formula = Height + Weight ~ .)
Check if they were indeed imputed:
# Count the values that are still missing in each column.
colSums(is.na(nhanes_imp))
Age Gender Weight Height Diabetes TotChol Pulse PhysActive
0 0 32 30 1 85 32 26
Initialize missing values with `hotdeck` and save missing locations:
# Starting point for the iterations: fill the gaps with hot-deck imputation.
nhanes_imp <- hotdeck(nhanes)
# Keep the indicator columns that mark which Height / Weight entries were
# originally missing, so only those positions are re-imputed later.
missing_height <- nhanes_imp[["Height_imp"]]
missing_weight <- nhanes_imp[["Weight_imp"]]
Iterate over `Height` and `Weight` 5 times, imputing them at originally missing locations:
# Five rounds of alternating re-imputation: each variable is blanked at its
# originally missing positions and refilled from a linear model that uses
# the current values of the other one.
for (iter in seq_len(5)) {
  nhanes_imp[["Height"]][missing_height] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Height ~ Age + Gender + Weight)

  nhanes_imp[["Weight"]][missing_weight] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Weight ~ Age + Gender + Height)
}
# Alternate between re-imputing Height and Weight for five rounds.
# Each impute_lm() call must receive nhanes_imp (the working copy), not the
# raw nhanes data: otherwise the NA reset just above is a dead store, the
# previous step's imputations are discarded, and nothing carries over from
# one iteration to the next.
for (i in seq_len(5)) {
  # Blank Height at the originally missing locations and refill it.
  nhanes_imp$Height[missing_height] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Height ~ Age + Gender + Weight)
  # Same for Weight, now using the freshly imputed Height as a predictor.
  nhanes_imp$Weight[missing_weight] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Weight ~ Age + Gender + Height)
}
# Containers for the per-iteration change in the imputed values.
diff_height <- c()
diff_weight <- c()
# Alternate between re-imputing Height and Weight for five rounds.
# impute_lm() is fed nhanes_imp (the working copy) rather than the raw
# nhanes data, so that each step builds on the previous imputations instead
# of overwriting them.
for (i in seq_len(5)) {
  # Blank Height at the originally missing locations and refill it.
  nhanes_imp$Height[missing_height] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Height ~ Age + Gender + Weight)
  # Same for Weight, using the freshly imputed Height as a predictor.
  nhanes_imp$Weight[missing_weight] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Weight ~ Age + Gender + Height)
}
# Containers for the per-iteration change in the imputed values.
diff_height <- c()
diff_weight <- c()
# Alternate between re-imputing Height and Weight for five rounds.
# impute_lm() is fed nhanes_imp (the working copy) rather than the raw
# nhanes data, so that each step builds on the previous imputations instead
# of overwriting them.
for (i in seq_len(5)) {
  # Snapshot the data before this round so the change can be measured.
  prev_iter <- nhanes_imp
  # Blank Height at the originally missing locations and refill it.
  nhanes_imp$Height[missing_height] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Height ~ Age + Gender + Weight)
  # Same for Weight, using the freshly imputed Height as a predictor.
  nhanes_imp$Weight[missing_weight] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Weight ~ Age + Gender + Height)
}
# Per-iteration change in the imputed values, one entry appended per round.
diff_height <- c()
diff_weight <- c()
# Alternate between re-imputing Height and Weight for five rounds.
# impute_lm() is fed nhanes_imp (the working copy) rather than the raw
# nhanes data; with the raw data every round would produce the same result
# and the tracked differences would carry no information.
for (i in seq_len(5)) {
  # Snapshot the data before this round so the change can be measured.
  prev_iter <- nhanes_imp
  # Blank Height at the originally missing locations and refill it.
  nhanes_imp$Height[missing_height] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Height ~ Age + Gender + Weight)
  # Same for Weight, using the freshly imputed Height as a predictor.
  nhanes_imp$Weight[missing_weight] <- NA
  nhanes_imp <- impute_lm(nhanes_imp, Weight ~ Age + Gender + Height)
  # mapc() is defined outside this excerpt; presumably it summarises the
  # change between the two vectors as a single number -- TODO confirm.
  diff_height <- c(diff_height, mapc(prev_iter$Height, nhanes_imp$Height))
  diff_weight <- c(diff_weight, mapc(prev_iter$Weight, nhanes_imp$Weight))
}
Handling Missing Data with Imputations in R