Exploratory Data Analysis in SQL
Christina Maimone
Data Scientist
SELECT generate_series(from, to, interval);
SELECT generate_series('2018-01-01',
'2018-01-15',
'2 days'::interval);
generate_series
---------------------
2018-01-01 00:00:00
2018-01-03 00:00:00
2018-01-05 00:00:00
2018-01-07 00:00:00
2018-01-09 00:00:00
2018-01-11 00:00:00
2018-01-13 00:00:00
2018-01-15 00:00:00
(8 rows)
SELECT generate_series('2018-01-01',
'2018-01-02',
'5 hours'::interval);
generate_series
----------------------
2018-01-01 00:00:00
2018-01-01 05:00:00
2018-01-01 10:00:00
2018-01-01 15:00:00
2018-01-01 20:00:00
(5 rows)
SELECT generate_series('2018-01-31',
'2018-12-31',
'1 month'::interval);
generate_series
---------------------
2018-01-31 00:00:00
2018-02-28 00:00:00
2018-03-28 00:00:00
2018-04-28 00:00:00
2018-05-28 00:00:00
2018-06-28 00:00:00
2018-07-28 00:00:00
2018-08-28 00:00:00
2018-09-28 00:00:00
2018-10-28 00:00:00
2018-11-28 00:00:00
2018-12-28 00:00:00
(12 rows)
-- Subtract 1 day to get end of month
SELECT generate_series('2018-02-01', -- start 1 month late
'2019-01-01',
'1 month'::interval) - '1 day'::interval;
generate_series
---------------------
2018-01-31 00:00:00
2018-02-28 00:00:00
2018-03-31 00:00:00
2018-04-30 00:00:00
2018-05-31 00:00:00
2018-06-30 00:00:00
2018-07-31 00:00:00
2018-08-31 00:00:00
2018-09-30 00:00:00
2018-10-31 00:00:00
2018-11-30 00:00:00
2018-12-31 00:00:00
(12 rows)
SELECT * FROM sales;
date | amount
---------------------+--------
2018-04-23 09:13:14 | 12
2018-04-23 13:57:53 | 41
2018-04-23 12:05:44 | 23
2018-04-23 09:07:33 | 31
2018-04-23 10:31:40 | 5
2018-04-23 09:35:16 | 18
2018-04-23 12:17:43 | 19
2018-04-23 12:57:49 | 32
2018-04-23 10:12:35 | 13
2018-04-23 13:21:30 | 6
(10 rows)
SELECT date_trunc('hour', date)
AS hour,
count(*)
FROM sales
GROUP BY hour
ORDER BY hour;
hour | count
---------------------+-------
2018-04-23 09:00:00 | 3
2018-04-23 10:00:00 | 2
2018-04-23 12:00:00 | 3
2018-04-23 13:00:00 | 2
(4 rows)
-- Create the series as a table called hour_series WITH hour_series AS ( SELECT generate_series('2018-04-23 09:00:00', -- 9am '2018-04-23 14:00:00', -- 2pm '1 hour'::interval) AS hours)
-- Hours from series, count date (NOT *) to count non-NULL SELECT hours, count(date) -- Join series to sales data FROM hour_series LEFT JOIN sales ON hours=date_trunc('hour', date) GROUP BY hours ORDER BY hours;
hours | count
------------------------+-------
2018-04-23 09:00:00-05 | 3
2018-04-23 10:00:00-05 | 2
2018-04-23 11:00:00-05 | 0
2018-04-23 12:00:00-05 | 3
2018-04-23 13:00:00-05 | 2
2018-04-23 14:00:00-05 | 0
(6 rows)
-- Create bins WITH bins AS ( SELECT generate_series('2018-04-23 09:00:00', '2018-04-23 15:00:00', '3 hours'::interval) AS lower, generate_series('2018-04-23 12:00:00', '2018-04-23 18:00:00', '3 hours'::interval) AS upper)
-- Count values in each bin SELECT lower, upper, count(date) -- left join keeps all bins FROM bins LEFT JOIN sales ON date >= lower AND date < upper -- Group by bin bounds to create the groups GROUP BY lower, upper ORDER BY lower;
lower | upper | count
---------------------+---------------------+-------
2018-04-23 09:00:00 | 2018-04-23 12:00:00 | 5
2018-04-23 12:00:00 | 2018-04-23 15:00:00 | 5
2018-04-23 15:00:00 | 2018-04-23 18:00:00 | 0
(3 rows)
Exploratory Data Analysis in SQL