Analyzing US Census Data in Python
Lee Hachadoorian
Asst. Professor of Instruction, Temple University
Source: United States Census Bureau
import requests
# Build base URL
HOST = "https://api.census.gov/data"
year = "2012"
dataset = "acs/acs5"
base_url = "/".join([HOST, year, dataset])
# Specify requested variables
# B01001_001E = Total population (estimate)
# B03002_003E = Nonhispanic White population (estimate)
# B03002_004E = Nonhispanic Black population (estimate)
get_vars = ["NAME", "B01001_001E", "B03002_003E", "B03002_004E"]
# Create dictionary of predicates
predicates = {}
# The API expects the variable list as one comma-separated string
predicates["get"] = ",".join(get_vars)
# Requested geography (the trailing "*" asks for every area of this type)
predicates["for"] = \
    "metropolitan statistical area/micropolitan statistical area:*"
# Send the request; the predicates become the URL query string
r = requests.get(base_url, params=predicates)
# Show the first five rows of the response (row 0 holds the column headers)
print(r.json()[:5])
[['NAME', 'B01001_001E', 'B03002_003E', 'B03002_004E', 'metropolitan statistical area/micropolitan statistical area'],
['Adjuntas, PR Micro Area', '19458', '140', '0', '10260'],
['Aguadilla-Isabela-San Sebastián, PR Metro Area', '305538', '5602', '231', '10380'],
['Coamo, PR Micro Area', '71596', '228', '53', '17620'],
['Fajardo, PR Metro Area', '70633', '543', '195', '21940']]
# Create user-friendly column names
col_names = ["name", "pop", "white", "black", "msa"]
# Load JSON response into DataFrame, skipping row 0 (the API's own header
# row), which col_names replaces
msa = pd.DataFrame(columns=col_names, data=r.json()[1:])
# Cast count columns to int data type (the response delivers them as strings).
# Selecting several columns requires a list inside the indexing brackets.
msa[["pop", "white", "black"]] = msa[["pop", "white", "black"]].astype(int)
state county tract white black
0 01 001 020100 1601 217
1 01 001 020200 844 1214
2 01 001 020300 2538 647
3 01 001 020400 4030 191
4 01 001 020500 8438 1418
msa msa_name county_name state_name state county
0 10100 Aberdeen, SD Brown County South Dakota 46 013
1 10100 Aberdeen, SD Edmunds County South Dakota 46 045
2 10140 Aberdeen, WA Grays Harbor County Washington 53 027
3 10180 Abilene, TX Callahan County Texas 48 059
4 10180 Abilene, TX Jones County Texas 48 253
import pandas as pd
# Join DataFrames on matching columns
# Placeholder: the literal `...` stands in for arguments not yet filled in
tracts_with_msa_id = pd.merge(...)
import pandas as pd
# Join DataFrames on matching columns
# Placeholder: the two DataFrames are named, but `...` still stands in for
# the remaining arguments
tracts_with_msa_id = pd.merge(tracts, msa_def, ...)
import pandas as pd
# Attach MSA information to each tract by matching on state and county,
# naming the key columns separately for the left and right DataFrames
tracts_with_msa_id = pd.merge(
    tracts,
    msa_def,
    left_on=["state", "county"],
    right_on=["state", "county"],
)
# Shorthand for the same join when both DataFrames share the key column names
tracts_with_msa_id = pd.merge(tracts, msa_def, on=["state", "county"])
# DataFrame with state names
st.head()
state_name
state
01 Alabama
02 Alaska
04 Arizona
05 Arkansas
06 California
# Join tracts and st DataFrames: the "state" column of tracts is matched
# against the index of st, so right_index is used instead of right_on
tracts_st = pd.merge(tracts, st, left_on="state", right_index=True)
tracts_st.head()
state county tract white black state_name
0 01 001 020100 1601 217 Alabama
1 01 001 020200 844 1214 Alabama
2 01 001 020300 2538 647 Alabama
3 01 001 020400 4030 191 Alabama
Analyzing US Census Data in Python