Sampling in R
Richie Cotton
Data Evangelist at DataCamp
# Countries of origin used to subset the coffee ratings data.
top_counted_countries <- c(
  "Mexico",
  "Colombia",
  "Guatemala",
  "Brazil",
  "Taiwan",
  "United States (Hawaii)"
)
# Keep only the rows whose country_of_origin is one of the selected countries.
coffee_ratings_top <- coffee_ratings %>%
  filter(country_of_origin %in% top_counted_countries)
# Simple random sample: randomly draw one third of all rows.
coffee_ratings_srs <- coffee_ratings_top %>%
  slice_sample(prop = 1 / 3)
# Stratified sample: randomly draw one third of the rows within each
# country_of_origin group, then remove the grouping from the result.
coffee_ratings_strat <- coffee_ratings_top %>%
  group_by(country_of_origin) %>%
  slice_sample(prop = 1 / 3) %>%
  ungroup()
# Cluster sample, stage 1: randomly choose 2 of the countries.
top_countries_samp <- sample(top_counted_countries, size = 2)

# Cluster sample, stage 2: keep only rows from the chosen countries and
# randomly draw the same number of rows from each of them.
coffee_ratings_clust <- coffee_ratings_top %>%
  filter(country_of_origin %in% top_countries_samp) %>%
  group_by(country_of_origin) %>%
  # nrow() / 6 is generally not a whole number; round it down so that
  # slice_sample() receives a valid row count (a fractional n is rejected
  # by current dplyr versions).
  slice_sample(n = floor(nrow(coffee_ratings_top) / 6)) %>%
  ungroup()
# Mean of total_cup_points over all rows of coffee_ratings_top.
coffee_ratings_top %>%
  summarize(mean_points = mean(total_cup_points))
81.9
# Mean of total_cup_points in the simple random sample.
coffee_ratings_srs %>%
  summarize(mean_points = mean(total_cup_points))
82.0
# Mean of total_cup_points in the stratified sample.
coffee_ratings_strat %>%
  summarize(mean_points = mean(total_cup_points))
81.8
# Mean of total_cup_points in the cluster sample.
coffee_ratings_clust %>%
  summarize(mean_points = mean(total_cup_points))
81.2
Population
# Mean of total_cup_points for each country_of_origin in coffee_ratings_top.
coffee_ratings_top %>%
  group_by(country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))
# A tibble: 6 x 2
country_of_origin mean_points
<chr> <dbl>
1 Brazil 82.4
2 Colombia 83.1
3 Guatemala 81.8
4 Mexico 80.9
5 Taiwan 82.0
6 United States (Hawaii) 81.8
Simple random sample
# Mean of total_cup_points for each country_of_origin in the simple
# random sample.
coffee_ratings_srs %>%
  group_by(country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))
# A tibble: 6 x 2
country_of_origin mean_points
<chr> <dbl>
1 Brazil 82.3
2 Colombia 83.1
3 Guatemala 81.5
4 Mexico 81.1
5 Taiwan 82.8
6 United States (Hawaii) 82.7
Population
# Mean of total_cup_points for each country_of_origin in coffee_ratings_top.
coffee_ratings_top %>%
  group_by(country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))
# A tibble: 6 x 2
country_of_origin mean_points
<chr> <dbl>
1 Brazil 82.4
2 Colombia 83.1
3 Guatemala 81.8
4 Mexico 80.9
5 Taiwan 82.0
6 United States (Hawaii) 81.8
Stratified sample
# Mean of total_cup_points for each country_of_origin in the stratified
# sample.
coffee_ratings_strat %>%
  group_by(country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))
# A tibble: 6 x 2
country_of_origin mean_points
<chr> <dbl>
1 Brazil 82.4
2 Colombia 82.9
3 Guatemala 81.7
4 Mexico 80.7
5 Taiwan 82.3
6 United States (Hawaii) 81.2
Population
# Mean of total_cup_points for each country_of_origin in coffee_ratings_top.
coffee_ratings_top %>%
  group_by(country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))
# A tibble: 6 x 2
country_of_origin mean_points
<chr> <dbl>
1 Brazil 82.4
2 Colombia 83.1
3 Guatemala 81.8
4 Mexico 80.9
5 Taiwan 82.0
6 United States (Hawaii) 81.8
Cluster sample
# Mean of total_cup_points for each country_of_origin in the cluster
# sample (only the sampled countries appear).
coffee_ratings_clust %>%
  group_by(country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))
# A tibble: 2 x 2
country_of_origin mean_points
<chr> <dbl>
1 Mexico 80.8
2 Taiwan 82.0
Sampling in R