Dealing With Missing Data in R
Nicholas Tierney
Statistician
Using impute_lm from the simputation package
# Print the example data; y has missing values in rows 3 and 5.
df
# A tibble: 5 x 3
y x1 x2
<dbl> <dbl> <dbl>
1 2.67 2.43 3.27
2 3.87 3.55 1.45
3 NA 2.90 1.49
4 5.21 2.72 1.84
5 NA 4.29 1.15
# Record which values are missing, label each row, then impute y
# from a linear model on x1 and x2.
df %>%
  # only_miss = TRUE: add shadow columns only for variables that contain NAs
  bind_shadow(only_miss = TRUE) %>%
  # adds the any_missing column ("Missing" / "Not Missing")
  add_label_shadow() %>%
  impute_lm(y ~ x1 + x2)
# A tibble: 5 x 5
y x1 x2 y_NA any_missing
<dbl> <dbl> <dbl> <fct> <chr>
1 2.67 2.43 3.27 !NA Not Missing
2 3.87 3.55 1.45 !NA Not Missing
3 5.54 2.90 1.49 NA Missing
4 5.21 2.72 1.84 !NA Not Missing
5 2.56 4.29 1.15 NA Missing
# Impute Solar.R and then Ozone with linear models on Wind, Temp and Month,
# keeping the shadow columns and the any_missing row label.
aq_imp_lm <- airquality %>%
  bind_shadow() %>%
  add_label_shadow() %>%
  impute_lm(Solar.R ~ Wind + Temp + Month) %>%
  impute_lm(Ozone ~ Wind + Temp + Month)

# Print the imputed data.
aq_imp_lm
# A tibble: 153 x 13
Ozone Solar.R Wind Temp Month Day Ozone_NA Solar.R_NA
* <dbl> <dbl> <dbl> <int> <int> <int> <fct> <fct>
1 41 190 7.4 67 5 1 !NA !NA
2 36 118 8 72 5 2 !NA !NA
3 12 149 12.6 74 5 3 !NA !NA
4 18 313 11.5 62 5 4 !NA !NA
5 -9.04 138. 14.3 56 5 5 NA NA
6 28 178. 14.9 66 5 6 !NA NA
# ... with 147 more rows, and 5 more variables: Wind_NA <fct>,
# Temp_NA <fct>, Month_NA <fct>, Day_NA <fct>,
# any_missing <chr>
# Rebuild the imputed airquality data with shadow columns and a row label.
# NOTE(review): the earlier definition of aq_imp_lm uses add_label_shadow()
# here instead of add_label_missings() -- confirm which one is intended.
aq_imp_lm <- airquality %>%
  bind_shadow() %>%
  add_label_missings() %>%
  impute_lm(Solar.R ~ Wind + Temp + Month) %>%
  impute_lm(Ozone ~ Wind + Temp + Month)

# Scatter plot of the two imputed variables, coloured by the any_missing label.
ggplot(
  data = aq_imp_lm,
  mapping = aes(x = Solar.R, y = Ozone, color = any_missing)
) +
  geom_point()
# "Small" imputation model: Ozone and Solar.R are each imputed from
# Wind and Temp only; rows are labelled after imputation.
aq_imp_small <- airquality %>%
  bind_shadow() %>%
  impute_lm(Ozone ~ Wind + Temp) %>%
  impute_lm(Solar.R ~ Wind + Temp) %>%
  add_label_shadow()
# "Large" imputation model: same steps as the small model, with Month and
# Day added as predictors for both Ozone and Solar.R.
aq_imp_large <- airquality %>%
  bind_shadow() %>%
  impute_lm(Ozone ~ Wind + Temp + Month + Day) %>%
  impute_lm(Solar.R ~ Wind + Temp + Month + Day) %>%
  add_label_shadow()
# Stack the two imputed data sets; the imp_model column records which
# model ("small" or "large") each row came from.
bound_models <- bind_rows(
  small = aq_imp_small,
  large = aq_imp_large,
  .id = "imp_model"
)

# Print the combined data.
bound_models
imp_model Ozone Solar.R Wind Temp Month Day
1: small 41.00000 190.0000 7.4 67 5 1
2: small 36.00000 118.0000 8.0 72 5 2
3: small 12.00000 149.0000 12.6 74 5 3
...
304: large 14.00000 191.0000 14.3 75 9 28
305: large 18.00000 131.0000 8.0 76 9 29
306: large 20.00000 223.0000 11.5 68 9 30
# Scatter plot of Ozone against Solar.R, coloured by the any_missing label,
# with one panel per imputation model.
ggplot(
  data = bound_models,
  mapping = aes(x = Ozone, y = Solar.R, color = any_missing)
) +
  geom_point() +
  facet_wrap(~ imp_model)
# Reshape to long form: Ozone and Solar.R are stacked into a "variable" /
# "value" pair, while any_missing and imp_model are kept as identifiers.
bound_models_gather <- bound_models %>%
  select(Ozone, Solar.R, any_missing, imp_model) %>%
  gather(
    key = "variable",
    value = "value",
    -any_missing,
    -imp_model
  )

# Print the long-form data.
bound_models_gather
any_missing imp_model variable value
1: Not Missing small Ozone 41.00000
2: Not Missing small Ozone 36.00000
3: Not Missing small Ozone 12.00000
4: Not Missing small Ozone 18.00000
5: Missing small Ozone -11.67673
...
608: Not Missing large Solar.R 193.00000
609: Missing large Solar.R 145.00000
610: Not Missing large Solar.R 191.00000
611: Not Missing large Solar.R 131.00000
612: Not Missing large Solar.R 223.00000
# Box plots of the values under each imputation model, one panel per
# gathered variable (Ozone, Solar.R).
# The facet column is `variable`: gather() was called with key = "variable",
# so bound_models_gather has no column named `key`.
ggplot(
  bound_models_gather,
  aes(x = imp_model, y = value)
) +
  geom_boxplot() +
  facet_wrap(~ variable)
# Same comparison restricted to rows labelled "Missing" in any_missing,
# so the box plots show only those rows.
# The facet column is `variable`: gather() was called with key = "variable",
# so bound_models_gather has no column named `key`.
bound_models_gather %>%
  filter(any_missing == "Missing") %>%
  ggplot(aes(x = imp_model, y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable)
Dealing With Missing Data in R