Inference for Categorical Data in R
Andrew Bray
Assistant Professor of Statistics at Reed College
Conclusion: the true proportion of Americans that are happy is between 0.705 and 0.841.
What do we mean by confident?
# Keep only the respondents surveyed in 2016.
ds1 <- gss %>%
  filter(year == 2016)

# Point estimate: share of the sample whose `happy` value is "HAPPY".
p_hat <- ds1 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: spread (sd) of the proportion across 500 bootstrap
# resamples of the 2016 sample.
SE <- ds1 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
p_hat + c(-2, 2) * SE
0.7073114 0.8393553
# Keep only the respondents surveyed in 2014.
ds2 <- filter(gss, year == 2014)

# Point estimate from the 2014 sample. (The original computed this from
# `ds1`, so `ds2` was never used and the 2016 estimate was recomputed.)
p_hat <- ds2 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples
# of the 2014 sample.
SE <- ds2 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.8348831 0.9384503
# Keep only the respondents surveyed in 2012.
ds3 <- filter(gss, year == 2012)

# Point estimate from the 2012 sample. (The original computed this from
# `ds1`, so `ds3` was never used and the 2016 estimate was recomputed.)
p_hat <- ds3 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples
# of the 2012 sample.
SE <- ds3 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.7626359 0.8906974
# The three assignments below were fused onto one line, which R cannot
# parse; each is now its own statement.

# Keep only the respondents surveyed in 2012.
ds3 <- filter(gss, year == 2012)

# Point estimate: share of the sample whose `happy` value is "HAPPY".
p_hat <- ds3 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples.
SE <- ds3 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.7626359 0.8906974
# The three assignments below were fused onto one line, which R cannot
# parse; each is now its own statement.

# Keep only the respondents surveyed in 2012.
ds3 <- filter(gss, year == 2012)

# Point estimate: share of the sample whose `happy` value is "HAPPY".
p_hat <- ds3 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples.
SE <- ds3 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.7626359 0.8906974
# The three assignments below were fused onto one line, which R cannot
# parse; each is now its own statement.

# Keep only the respondents surveyed in 2012.
ds3 <- filter(gss, year == 2012)

# Point estimate: share of the sample whose `happy` value is "HAPPY".
p_hat <- ds3 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples.
SE <- ds3 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.7626359 0.8906974
# The three assignments below were fused onto one line, which R cannot
# parse; each is now its own statement.

# Keep only the respondents surveyed in 2012.
ds3 <- filter(gss, year == 2012)

# Point estimate: share of the sample whose `happy` value is "HAPPY".
p_hat <- ds3 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples.
SE <- ds3 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.7626359 0.8906974
# The three assignments below were fused onto one line, which R cannot
# parse; each is now its own statement.

# Keep only the respondents surveyed in 2012.
ds3 <- filter(gss, year == 2012)

# Point estimate: share of the sample whose `happy` value is "HAPPY".
p_hat <- ds3 %>%
  summarize(mean(happy == "HAPPY")) %>%
  pull()

# Standard error: sd of the proportion across 500 bootstrap resamples.
SE <- ds3 %>%
  specify(response = happy, success = "HAPPY") %>%
  generate(reps = 500, type = "bootstrap") %>%
  calculate(stat = "prop") %>%
  summarize(sd(stat)) %>%
  pull()

# Interval: point estimate plus/minus two standard errors.
c(p_hat - 2 * SE, p_hat + 2 * SE)
0.7626359 0.8906974
Interpretation: “We’re 95% confident that the true proportion of Americans that are happy is between 0.705 and 0.841.”
The width of the interval is affected by:
n (the sample size)
p (the proportion being estimated)
Inference for Categorical Data in R