Comparing sampling methods

Sampling in R

Richie Cotton

Data Evangelist at DataCamp

Review of sampling techniques

Setup

# Countries of origin kept for the sampling comparison.
top_counted_countries <- c(
  "Mexico",
  "Colombia",
  "Guatemala",
  "Brazil",
  "Taiwan",
  "United States (Hawaii)"
)
# Keep only the ratings whose origin is one of those countries.
coffee_ratings_top <- filter(
  coffee_ratings,
  country_of_origin %in% top_counted_countries
)

Simple random sampling

# Simple random sample: draw one third of all rows, ignoring country.
coffee_ratings_srs <- slice_sample(coffee_ratings_top, prop = 1 / 3)

Stratified sampling

# Stratified sample: draw one third of the rows within each country,
# then drop the grouping so later steps see a plain table.
coffee_ratings_strat <- group_by(coffee_ratings_top, country_of_origin) %>%
  slice_sample(prop = 1 / 3) %>%
  ungroup()

Cluster sampling

# Cluster sample: first pick two countries at random, then sample rows
# only from those two countries.
top_countries_samp <- sample(top_counted_countries, size = 2)
coffee_ratings_clust <- coffee_ratings_top %>%
  filter(country_of_origin %in% top_countries_samp) %>%
  group_by(country_of_origin) %>%
  # Ask for one sixth of the population rows per cluster, so the two
  # clusters together target about one third of the data. floor() keeps
  # `n` a whole number; recent dplyr versions reject a fractional `n`.
  slice_sample(n = floor(nrow(coffee_ratings_top) / 6)) %>%
  ungroup()

Calculating mean cup points

Population

# Mean total cup points over the whole filtered population.
summarize(coffee_ratings_top, mean_points = mean(total_cup_points))

81.9

Simple random sample

# Mean total cup points in the simple random sample.
summarize(coffee_ratings_srs, mean_points = mean(total_cup_points))

82.0

Stratified sample

# Mean total cup points in the stratified sample.
summarize(coffee_ratings_strat, mean_points = mean(total_cup_points))

81.8

Cluster sample

# Mean total cup points in the cluster sample.
summarize(coffee_ratings_clust, mean_points = mean(total_cup_points))

81.2

Mean cup points by country: simple random

Population

# Per-country mean of total cup points in the population.
group_by(coffee_ratings_top, country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))

# A tibble: 6 x 2
  country_of_origin      mean_points
  <chr>                        <dbl>
1 Brazil                        82.4
2 Colombia                      83.1
3 Guatemala                     81.8
4 Mexico                        80.9
5 Taiwan                        82.0
6 United States (Hawaii)        81.8

Simple random sample

# Per-country mean of total cup points in the simple random sample.
group_by(coffee_ratings_srs, country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))

# A tibble: 6 x 2
  country_of_origin      mean_points
  <chr>                        <dbl>
1 Brazil                        82.3
2 Colombia                      83.1
3 Guatemala                     81.5
4 Mexico                        81.1
5 Taiwan                        82.8
6 United States (Hawaii)        82.7

Mean cup points by country: stratified

Population

# Per-country mean of total cup points in the population.
group_by(coffee_ratings_top, country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))

# A tibble: 6 x 2
  country_of_origin      mean_points
  <chr>                        <dbl>
1 Brazil                        82.4
2 Colombia                      83.1
3 Guatemala                     81.8
4 Mexico                        80.9
5 Taiwan                        82.0
6 United States (Hawaii)        81.8

Stratified sample

# Per-country mean of total cup points in the stratified sample.
group_by(coffee_ratings_strat, country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))

# A tibble: 6 x 2
  country_of_origin      mean_points
  <chr>                        <dbl>
1 Brazil                        82.4
2 Colombia                      82.9
3 Guatemala                     81.7
4 Mexico                        80.7
5 Taiwan                        82.3
6 United States (Hawaii)        81.2

Mean cup points by country: cluster

Population

# Per-country mean of total cup points in the population.
group_by(coffee_ratings_top, country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))

# A tibble: 6 x 2
  country_of_origin      mean_points
  <chr>                        <dbl>
1 Brazil                        82.4
2 Colombia                      83.1
3 Guatemala                     81.8
4 Mexico                        80.9
5 Taiwan                        82.0
6 United States (Hawaii)        81.8

Cluster sample

# Per-country mean of total cup points in the cluster sample; only the
# sampled countries appear in the result.
group_by(coffee_ratings_clust, country_of_origin) %>%
  summarize(mean_points = mean(total_cup_points))

# A tibble: 2 x 2
  country_of_origin mean_points
  <chr>                   <dbl>
1 Mexico                   80.8
2 Taiwan                   82.0

Let's practice!

Sampling in R