Cleaning Data in R
Maggie Matsui
Content Developer @ DataCamp
Inconsistency within a category
Too many categories
animals
# A tibble: 68 x 9
animal_name hair eggs fins legs tail type
<chr> <fct> <fct> <fct> <int> <fct> <fct>
1 mole 1 0 0 4 1 mammal
2 chicken 0 1 0 2 1 bird
3 capybara 1 0 0 2 1 Mammal
4 tuna 0 1 1 0 1 fish
5 ostrich 0 1 0 2 1 bird
# ... with 63 more rows
animals %>%
count(type)
"mammal"
" mammal "
"MAMMAL"
"Mammal "
type n
1 " mammal " 1
2 "amphibian" 2
3 "bird" 20
4 "bug" 1
5 "fish" 2
6 "invertebrate" 1
7 "mammal" 38
8 "MAMMAL" 1
9 "Mammal " 1
10 "reptile" 1
library(stringr)
animals %>%
mutate(type_lower = str_to_lower(type))
animal_name hair eggs fins legs tail type type_lower
<fct> <int> <int> <int> <int> <int> <fct> <chr>
1 mole 1 0 0 4 1 "mammal" "mammal"
2 chicken 0 1 0 2 1 "bird" "bird"
3 capybara 1 0 0 2 1 "Mammal " "mammal "
4 tuna 0 1 1 0 1 "fish" "fish"
5 ostrich 0 1 0 2 1 "bird" "bird"
animals %>%
mutate(type_lower = str_to_lower(type)) %>%
count(type_lower)
type_lower n type_lower n
<chr> <int> <chr> <int>
1 " mammal " 1 6 "invertebrate" 1
2 "amphibian" 2 7 "mammal" 39
3 "bird" 20 8 "mammal " 1
4 "bug" 1 9 "reptile" 1
5 "fish" 2
"MAMMAL"
$\rightarrow$ "mammal"
animals %>%
mutate(type_upper = str_to_upper(type)) %>%
count(type_upper)
type_upper n type_upper n
<chr> <int> <chr> <int>
1 " MAMMAL " 1 6 "INVERTEBRATE" 1
2 "AMPHIBIAN" 2 7 "MAMMAL" 39
3 "BIRD" 20 8 "MAMMAL " 1
4 "BUG" 1 9 "REPTILE" 1
5 "FISH" 2
animals %>%
mutate(type_trimmed = str_trim(type_lower))
animal_name hair eggs fins legs tail type_lower type_trimmed
<fct> <int> <int> <int> <int> <int> <chr> <chr>
1 mole 1 0 0 4 1 "mammal" mammal
2 chicken 0 1 0 2 1 "bird" bird
3 capybara 1 0 0 2 1 "mammal " mammal
4 tuna 0 1 1 0 1 "fish" fish
5 ostrich 0 1 0 2 1 "bird" bird
animals %>%
mutate(type_trimmed = str_trim(type_lower)) %>%
count(type_trimmed)
type_trimmed n type_trimmed n
<chr> <int> <chr> <int>
1 amphibian 2 6 mammal 41
2 bird 20 7 reptile 1
3 bug 1
4 fish 2
5 invertebrate 1
animals %>%
count(type_trimmed, sort = TRUE)
type_trimmed n
1 mammal 41
2 bird 20
3 amphibian 2
4 fish 2
5 bug 1
6 invertebrate 1
7 reptile 1
other_categories = c("amphibian", "fish", "bug", "invertebrate", "reptile")
library(forcats)
animals %>%
mutate(type_collapsed = fct_collapse(type_trimmed, other = other_categories))
animal_name hair eggs fins legs tail type_trimmed type_collapsed
<fct> <int> <int> <int> <int> <int> <chr> <fct>
1 mole 1 0 0 4 1 mammal mammal
2 chicken 0 1 0 2 1 bird bird
3 capybara 1 0 0 2 1 mammal mammal
4 tuna 0 1 1 0 1 fish other
5 ostrich 0 1 0 2 1 bird bird
animals %>%
count(type_collapsed)
type_collapsed n
<fct> <int>
1 other 7
2 bird 20
3 mammal 41
animals %>%
group_by(type_collapsed) %>%
summarize(avg_legs = mean(legs))
type_collapsed avg_legs
<fct> <dbl>
1 other 3.71
2 bird 2
3 mammal 3.37
Cleaning Data in R