Exploring distributions

Exploratory Data Analysis in SQL

Christina Maimone

Data Scientist

Count values

SELECT unanswered_count, count(*)
  FROM stackoverflow
 WHERE tag='amazon-ebs'
 GROUP BY unanswered_count
 ORDER BY unanswered_count;
 unanswered_count | count 
------------------+-------
               37 |    12
               38 |    40
...
               43 |    10
               44 |     8
               45 |    17
               46 |     4
               47 |     1
...
               54 |   131
               55 |    34
               56 |     1
(20 rows)
Exploratory Data Analysis in SQL

Truncate

SELECT trunc(42.1256, 2);
42.12
SELECT trunc(12345, -3);
12000
Exploratory Data Analysis in SQL

Truncating and grouping

SELECT trunc(unanswered_count, -1) AS trunc_ua, 
       count(*)
  FROM stackoverflow
 WHERE tag='amazon-ebs'
 GROUP BY trunc_ua   -- column alias
 ORDER BY trunc_ua;  -- column alias
 trunc_ua | count 
----------+-------
       30 |    74
       40 |   194
       50 |   480
(3 rows)
Exploratory Data Analysis in SQL

Generate series

SELECT generate_series(start, end, step);
Exploratory Data Analysis in SQL

Generate series

SELECT generate_series(1, 10, 2);
 generate_series 
-----------------
               1
               3
               5
               7
               9
(5 rows)
SELECT generate_series(0, 1, .1);
 generate_series 
-----------------
               0
             0.1
             0.2
             0.3
             0.4
             0.5
             0.6
             0.7
             0.8
             0.9
             1.0
(11 rows)
Exploratory Data Analysis in SQL

Create bins: output

 lower | upper | count 
-------+-------+-------
    30 |    35 |     0
    35 |    40 |    74
    40 |    45 |   155
    45 |    50 |    39
    50 |    55 |   445
    55 |    60 |    35
    60 |    65 |     0
(7 rows)
Exploratory Data Analysis in SQL

Create bins: query

-- Create bins
WITH bins AS (
      SELECT generate_series(30,60,5) AS lower,
             generate_series(35,65,5) AS upper), 














               ; 
Exploratory Data Analysis in SQL

Create bins: query

-- Create bins
WITH bins AS (
      SELECT generate_series(30,60,5) AS lower,
             generate_series(35,65,5) AS upper), 
     -- Subset data to tag of interest
     ebs AS (
      SELECT unanswered_count
        FROM stackoverflow
       WHERE tag='amazon-ebs')









               ; 
Exploratory Data Analysis in SQL

Create bins: query

-- Create bins
WITH bins AS (
      SELECT generate_series(30,60,5) AS lower,
             generate_series(35,65,5) AS upper), 
     -- Subset data to tag of interest
     ebs AS (
      SELECT unanswered_count
        FROM stackoverflow
       WHERE tag='amazon-ebs')
-- Count values in each bin
SELECT lower, upper, count(unanswered_count) 
  -- left join keeps all bins
  FROM bins
       LEFT JOIN ebs
              ON unanswered_count >= lower
             AND unanswered_count < upper


               ; 
Exploratory Data Analysis in SQL

Create bins: query

-- Create bins
WITH bins AS (
      SELECT generate_series(30,60,5) AS lower,
             generate_series(35,65,5) AS upper), 
     -- Subset data to tag of interest
     ebs AS (
      SELECT unanswered_count
        FROM stackoverflow
       WHERE tag='amazon-ebs')
-- Count values in each bin
SELECT lower, upper, count(unanswered_count) 
  -- left join keeps all bins
  FROM bins
       LEFT JOIN ebs
              ON unanswered_count >= lower
             AND unanswered_count < upper
 -- Group by bin bounds to create the groups
 GROUP BY lower, upper
 ORDER BY lower;
Exploratory Data Analysis in SQL

Create bins: output

 lower | upper | count 
-------+-------+-------
    30 |    35 |     0
    35 |    40 |    74
    40 |    45 |   155
    45 |    50 |    39
    50 |    55 |   445
    55 |    60 |    35
    60 |    65 |     0
(7 rows)
Exploratory Data Analysis in SQL

Time to explore some distributions!

Exploratory Data Analysis in SQL

Preparing Video For Download...