Exploratory Data Analysis in SQL
Christina Maimone
Data Scientist
SELECT * FROM naics;
id | category | businesses
--------+---------------------------------------------+------------
111110 | Agriculture: Soybean Farming | 4788
111130 | Agriculture | Dry Pea and Bean Farming | 3606
111140 | Agriculture: Wheat Farming | 6393
111150 | Agriculture - Corn Farming | 26469
111160 | Agriculture: Rice Farming | 949
111199 | Agriculture - All Other Grain Farming | 15035
111211 | Agriculture | Potato Farming | 617
611110 | Education - Elementary and Secondary | 187859
611210 | Education | Junior Colleges | 3961
611310 | Education: Colleges and Universities | 29148
-- Case for each of :, -, and |
SELECT CASE WHEN category LIKE '%: %' THEN split_part(category, ': ', 1)
WHEN category LIKE '% - %' THEN split_part(category, ' - ', 1)
ELSE split_part(category, ' | ', 1)
END AS major_category, -- alias the result
sum(businesses) -- also select number of businesses FROM naics GROUP BY major_category; -- Group by categories created above
major_category | sum
----------------+--------
Education | 220968
Agriculture | 57857
Original values: fruit
table
customer | fav_fruit
----------+-----------
349 | apple
874 | Apple
703 | apple
667 | bannana
622 | banana
387 | BANANA
300 | APPLES
313 | apple
499 | banana
418 | apple
841 | BANANA
300 | APPLE
754 | apple
Standardized values: recode
table
original | standardized
----------+--------------
APPLES | apple
apple | apple
Apple | apple
bannana | banana
apple | apple
banana | banana
banana | banana
APPLE | apple
apple | apple
BANANA | banana
CREATE TEMP TABLE recode AS
SELECT DISTINCT fav_fruit AS original, -- original, messy values
fav_fruit AS standardized -- new standardized values
FROM fruit;
SELECT *
FROM recode;
original | standardized
----------+--------------
APPLES | APPLES
apple | apple
Apple | Apple
bannana | bannana
apple | apple
banana | banana
banana | banana
APPLE | APPLE
apple | apple
BANANA | BANANA
(10 rows)
UPDATE table_name
SET column_name = new_value
WHERE condition;
-- All rows: lower case, remove white space on ends
UPDATE recode
SET standardized=trim(lower(original));
-- Specific rows: correct a misspelling
UPDATE recode
SET standardized='banana'
WHERE standardized LIKE '%nn%';
-- All rows: remove any s
UPDATE recode
SET standardized=rtrim(standardized, 's');
SELECT *
FROM recode;
original | standardized
----------+--------------
APPLES | apple
apple | apple
Apple | apple
apple | apple
banana | banana
banana | banana
APPLE | apple
apple | apple
BANANA | banana
bannana | banana
(10 rows)
Original only
SELECT fav_fruit, count(*)
FROM fruit
GROUP BY fav_fruit;
fav_fruit | count
-----------+-------
APPLES | 1
apple | 1
apple | 3
banana | 1
BANANA | 2
apple | 1
APPLE | 1
bannana | 1
banana | 1
Apple | 1
(10 rows)
With recoded values
SELECT standardized,
count(*)
FROM fruit
LEFT JOIN recode
ON fav_fruit=original
GROUP BY standardized;
standardized | count
--------------+-------
apple | 8
banana | 5
(2 rows)
CREATE TEMP TABLE
with original valuesUPDATE
to create standardized valuesJOIN
original data to standardized dataExploratory Data Analysis in SQL