Data Manipulation in Julia
Katerina Zahradova
Instructor
# Check to see if there are missing values
describe(penguins, :nmissing)
7×2 DataFrame
Row variable nmissing
Symbol Int64
_________________________________
1 species 0
2 island 5
3 culmen_length_mm 0
4 culmen_depth_mm 0
5 flipper_length_mm 0
6 body_mass_g 23
7 sex 0
# Find rows with missing values
penguins[ismissing.(penguins.island),:]
5×7 DataFrame
Row species island culmen_length_mm culmen_depth_mm ...
String15 String15 Float64 Float64 ...
___________________________________________________________________
1 Adelie missing 39.5 17.4 ...
2 Adelie missing 40.3 18.0 ...
3 Chinstrap missing 46.7 18.3 ...
4 Gentoo missing 49.3 13.6 ...
5 Gentoo missing 43.9 17.8 ...
# Find rows with missing values, showing only selected columns
penguins[ismissing.(penguins.island), [:species, :island, :sex]]
5×3 DataFrame
Row species island sex
String15 String15 String7
________________________________
1 Adelie missing MALE
2 Adelie missing FEMALE
3 Chinstrap missing MALE
4 Gentoo missing MALE
5 Gentoo missing FEMALE
# Drop all missing values
dropmissing!(penguins)
describe(penguins, :nmissing)
7×2 DataFrame
Row variable nmissing
Symbol Int64
1 species 0
2 island 0
3 culmen_length_mm 0
4 culmen_depth_mm 0
5 flipper_length_mm 0
6 body_mass_g 0
7 sex 0
# Drop missing values in island column
dropmissing!(penguins, :island)
describe(penguins, :nmissing)
7×2 DataFrame
Row variable nmissing
Symbol Int64
1 species 0
2 island 0
3 culmen_length_mm 0
4 culmen_depth_mm 0
5 flipper_length_mm 0
6 body_mass_g 23
7 sex 0
# Replace missing by a value
replace!(penguins.body_mass_g, missing => 0)
# Replace missing by mean
replace!(penguins.body_mass_g, missing => mean(skipmissing(penguins.body_mass_g)))
# Iterate over groups and replace by rounded mean for each group
for group in groupby(penguins, :species)
group[ismissing.(group.body_mass_g), :body_mass_g] .= round(mean(skipmissing(group.body_mass_g)))
end
# Check missing values
describe(penguins, :nmissing)
7×2 DataFrame
Row variable nmissing
Symbol Int64
1 species 0
...
6 body_mass_g 0
7 sex 0
# Iterate over more groups and replace by rounded mean for each group
for group in groupby(penguins, [:species, :sex])
group[ismissing.(group.body_mass_g), :body_mass_g] .= round(mean(skipmissing(group.body_mass_g)))
end
# What happens if there are no records in the group
for group in groupby(penguins, [:species, :sex, :flipper_length_mm, :culmen_length_mm])
group[ismissing.(group.body_mass_g), :body_mass_g] .= round(mean(skipmissing(group.body_mass_g)))
end
ArgumentError: median of an empty array is undefined, Any[]
ismissing(var): returns true if var = missing, false otherwise
ismissing.(df.col): returns a vector of true/false values
df[ismissing.(df.col), :]: returns those rows of df where the value in col is missing
dropmissing(df): drops all rows that contain missing
dropmissing!(df, :col): drops all rows that contain missing in col; rewrites df
replace!(df.col, missing => mean(skipmissing(df.col))): replaces missing values in col with the mean of col (calculated by skipping those missing values)
Replacing missing in individual groups:
for group in groupby(df, :col)
    group[ismissing.(group.col), :col] .= value # or by mean of the group
end
Data Manipulation in Julia