Exploratory Data Analysis in SQL
Christina Maimone
Data Scientist
SELECT unanswered_count, count(*)
FROM stackoverflow
WHERE tag='amazon-ebs'
GROUP BY unanswered_count
ORDER BY unanswered_count;
unanswered_count | count
------------------+-------
37 | 12
38 | 40
...
43 | 10
44 | 8
45 | 17
46 | 4
47 | 1
...
54 | 131
55 | 34
56 | 1
(20 rows)
SELECT trunc(42.1256, 2);
42.12
SELECT trunc(12345, -3);
12000
SELECT trunc(unanswered_count, -1) AS trunc_ua,
count(*)
FROM stackoverflow
WHERE tag='amazon-ebs'
GROUP BY trunc_ua -- column alias
ORDER BY trunc_ua; -- column alias
trunc_ua | count
----------+-------
30 | 74
40 | 194
50 | 480
(3 rows)
SELECT generate_series(start, end, step);
SELECT generate_series(1, 10, 2);
generate_series
-----------------
1
3
5
7
9
(5 rows)
SELECT generate_series(0, 1, .1);
generate_series
-----------------
0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
(11 rows)
lower | upper | count
-------+-------+-------
30 | 35 | 0
35 | 40 | 74
40 | 45 | 155
45 | 50 | 39
50 | 55 | 445
55 | 60 | 35
60 | 65 | 0
(7 rows)
-- Create bins
WITH bins AS (
SELECT generate_series(30,60,5) AS lower,
generate_series(35,65,5) AS upper),
;
-- Create bins
WITH bins AS (
SELECT generate_series(30,60,5) AS lower,
generate_series(35,65,5) AS upper),
-- Subset data to tag of interest
ebs AS (
SELECT unanswered_count
FROM stackoverflow
WHERE tag='amazon-ebs')
;
-- Create bins
WITH bins AS (
SELECT generate_series(30,60,5) AS lower,
generate_series(35,65,5) AS upper),
-- Subset data to tag of interest
ebs AS (
SELECT unanswered_count
FROM stackoverflow
WHERE tag='amazon-ebs')
-- Count values in each bin
SELECT lower, upper, count(unanswered_count)
-- left join keeps all bins
FROM bins
LEFT JOIN ebs
ON unanswered_count >= lower
AND unanswered_count < upper
;
-- Create bins
WITH bins AS (
SELECT generate_series(30,60,5) AS lower,
generate_series(35,65,5) AS upper),
-- Subset data to tag of interest
ebs AS (
SELECT unanswered_count
FROM stackoverflow
WHERE tag='amazon-ebs')
-- Count values in each bin
SELECT lower, upper, count(unanswered_count)
-- left join keeps all bins
FROM bins
LEFT JOIN ebs
ON unanswered_count >= lower
AND unanswered_count < upper
-- Group by bin bounds to create the groups
GROUP BY lower, upper
ORDER BY lower;
lower | upper | count
-------+-------+-------
30 | 35 | 0
35 | 40 | 74
40 | 45 | 155
45 | 50 | 39
50 | 55 | 445
55 | 60 | 35
60 | 65 | 0
(7 rows)
Exploratory Data Analysis in SQL