Handling missing values

Data Manipulation in Julia

Katerina Zahradova

Instructor

Finding missing values

# Check to see if there are missing values
describe(penguins, :nmissing)
7×2 DataFrame
Row variable           nmissing
    Symbol             Int64
_________________________________
1   species            0
2   island             5
3   culmen_length_mm   0
4   culmen_depth_mm    0
5   flipper_length_mm  0
6   body_mass_g        23
7   sex                0
Data Manipulation in Julia

ismissing()

# Find rows with missing values
penguins[ismissing.(penguins.island),:]
5×7 DataFrame
Row species   island    culmen_length_mm  culmen_depth_mm  ... 
    String15  String15  Float64           Float64          ...
___________________________________________________________________
1   Adelie    missing   39.5              17.4             ...
2   Adelie    missing   40.3              18.0             ...
3   Chinstrap missing   46.7              18.3             ...
4   Gentoo    missing   49.3              13.6             ...
5   Gentoo    missing   43.9              17.8             ...
Data Manipulation in Julia

ismissing()

# Find rows with missing values
penguins[ismissing.(penguins.island), [:species, :island, :sex]]
5×3 DataFrame
Row species   island    sex
    String15  String15  String7
________________________________
1   Adelie    missing   MALE
2   Adelie    missing   FEMALE
3   Chinstrap missing   MALE
4   Gentoo    missing   MALE
5   Gentoo    missing   FEMALE
Data Manipulation in Julia

dropmissing()

# Drop all missing values
dropmissing!(penguins)

describe(penguins, :nmissing)
7×2 DataFrame
Row variable           nmissing
    Symbol             Int64
1   species            0
2   island             0
3   culmen_length_mm   0
4   culmen_depth_mm    0
5   flipper_length_mm  0
6   body_mass_g        0
7   sex                0
# Drop missing values in island column
dropmissing!(penguins, :island)

describe(penguins, :nmissing)
7×2 DataFrame
Row variable           nmissing
    Symbol             Int64
1   species            0
2   island             0
3   culmen_length_mm   0
4   culmen_depth_mm    0
5   flipper_length_mm  0
6   body_mass_g        23
7   sex                0
Data Manipulation in Julia

replace()

# Replace missing by a value
replace!(penguins.body_mass_g, missing => 0)
# Replace missing by mean
replace!(penguins.body_mass_g, missing => mean(skipmissing(penguins.body_mass_g)))
Data Manipulation in Julia

Replacing with grouped summary statistics

Penguin comparison

1 Image courtesy www.bas.ac.uk/about/antarctica/wildlife/penguins/
Data Manipulation in Julia

Replacing using groupby()

# Iterate over groups and replace by rounded mean for each group
for group in groupby(penguins, :species)
    group[ismissing.(group.body_mass_g), :body_mass_g] .= round(mean(skipmissing(group.body_mass_g)))
end

# Check missing values
describe(penguins, :nmissing)
7×2 DataFrame
Row variable           nmissing
    Symbol             Int64
1   species            0
...
6   body_mass_g        0
7   sex                0
Data Manipulation in Julia

Replacing using multiple columns

# Iterate over more groups and replace by rounded mean for each group
for group in groupby(penguins, [:species, :sex])
    group[ismissing.(group.body_mass_g), :body_mass_g] .= round(mean(skipmissing(group.body_mass_g)))
end
Data Manipulation in Julia

Insufficient data

# What happens if there are no records in the group
for group in groupby(penguins, [:species, :sex, :flipper_length_mm, :culmen_length_mm])
    group[ismissing.(group.body_mass_g), :body_mass_g] .= round(median(skipmissing(group.body_mass_g)))
end
ArgumentError: median of an empty array is undefined, Any[]
Data Manipulation in Julia

Cheat sheet - find and drop missing values

  • ismissing(var): returns true if var is missing, false otherwise
  • ismissing.(df.col): returns a vector of true/false values
  • df[ismissing.(df.col),:]: returns those rows of df where the value in col is missing
  • dropmissing(df): drops all rows that contain missing
  • dropmissing!(df, :col): drops all rows that contain missing in col; rewrites df
Data Manipulation in Julia

Cheat sheet - replace missing values

  • replace!(df.col, missing => mean(skipmissing(df.col))): replaces missing values in col with the mean of col (calculated by skipping those missing values)
  • To replace missing in individual groups
    for group in groupby(df, :col)
      group[ismissing.(group.col),:col] .= value  # or by mean of the group
    end
    
Data Manipulation in Julia

Let's practice!

Data Manipulation in Julia

Preparing Video For Download...