Cleaning Data in PostgreSQL Databases
Darryl Reeves, Ph.D.
Industry Assistant Professor, New York University
camis | name | inspection_type | ...
---------+-----------------------+----------------------------------------------+-----
... | ... | ... | ...
50084922 | JUICE POINT | Cycle Inspection / Re-inspection | ...
50075375 | ATOMIC WINGS | Administrative Miscellaneous / Re-inspection | ...
50048685 | KENNEDY FRIED CHICKEN | Cycle Inspection / Re-inspection | ...
50058910 | HUNGER PANG | Pre-permit (Operational) / Re-inspection | ...
50047834 | SUBWAY | Smoke-Free Air Act / Re-inspection | ...
... | ... | ... | ...
Value delimiter: ' / '
sub_inspection_type | count
-----------------------------+-------
Reopening Inspection | 56
Re-inspection | 1333
Initial Inspection | 3488
Second Compliance Inspection | 2
Compliance Inspection | 27
camis | name | inspection_type | ...
---------+-----------------------+----------------------------------------------+-----
... | ... | ... | ...
50084922 | JUICE POINT | Cycle Inspection / Re-inspection | ...
50075375 | ATOMIC WINGS | Administrative Miscellaneous / Re-inspection | ...
50048685 | KENNEDY FRIED CHICKEN | Cycle Inspection / Re-inspection | ...
50058910 | HUNGER PANG | Pre-permit (Operational) / Re-inspection | ...
50047834 | SUBWAY | Smoke-Free Air Act / Re-inspection | ...
... | ... | ... | ...
camis | name | main_inspection_type | sub_inspection_type | ...
---------+-----------------------+------------------------------+---------------------+-----
... | ... | ... | ... | ...
50084922 | JUICE POINT | Cycle Inspection | Re-inspection | ...
50075375 | ATOMIC WINGS | Administrative Miscellaneous | Re-inspection | ...
50048685 | KENNEDY FRIED CHICKEN | Cycle Inspection | Re-inspection | ...
50058910 | HUNGER PANG | Pre-permit (Operational) | Re-inspection | ...
50047834 | SUBWAY | Smoke-Free Air Act | Re-inspection | ...
... | ... | ... | ... | ...
SPLIT_PART(source_string, delimiter_string, part_number)
SELECT
SPLIT_PART('Cycle Inspection / Re-inspection', ' / ', 1);
Cycle Inspection
SELECT
SPLIT_PART('Cycle Inspection / Re-inspection', ' / ', 2);
Re-inspection
SELECT
camis,
name,
SPLIT_PART(inspection_type, ' / ', 1) AS main_inspection_type,
SPLIT_PART(inspection_type, ' / ', 2) AS sub_inspection_type
FROM
restaurant_inspection;
camis | name | main_inspection_type | sub_inspection_type | ...
---------+-----------------------+------------------------------+---------------------+-----
... | ... | ... | ... | ...
50084922 | JUICE POINT | Cycle Inspection | Re-inspection | ...
50075375 | ATOMIC WINGS | Administrative Miscellaneous | Re-inspection | ...
50048685 | KENNEDY FRIED CHICKEN | Cycle Inspection | Re-inspection | ...
50058910 | HUNGER PANG | Pre-permit (Operational) | Re-inspection | ...
50047834 | SUBWAY | Smoke-Free Air Act | Re-inspection | ...
... | ... | ... | ... | ...
camis | name | cuisine_description | ...
---------+-----------------------+---------------------+-----
... | ... | ... | ...
50066768 | FIRST LAMB SHABU | Chinese | ...
41450971 | GIOVANNI'S RESTAURANT | Pizza/Italian | ...
41628459 | KFC | Chicken | ...
50043003 | BANGIA | Korean | ...
41418978 | BAGEL EXPRESS III | Bagels/Pretzels | ...
... | ... | ... | ...
camis | name | cuisine_description | ...
-------+-----------------------+---------------------+-----
... | ... | ... | ...
50066768 | FIRST LAMB SHABU | Chinese | ...
41450971 | GIOVANNI'S RESTAURANT | Pizza | ...
41450971 | GIOVANNI'S RESTAURANT | Italian | ...
41628459 | KFC | Chicken | ...
50043003 | BANGIA | Korean | ...
41418978 | BAGEL EXPRESS III | Bagels | ...
41418978 | BAGEL EXPRESS III | Pretzels | ...
... | ... | ... | ...
REGEXP_SPLIT_TO_TABLE(source, pattern)
SELECT REGEXP_SPLIT_TO_TABLE('Pizza/Italian', '/');
Pizza
Italian
SELECT
camis,
name,
REGEXP_SPLIT_TO_TABLE(cuisine_description, '/') AS cuisine_description,
...
FROM
restaurant_inspection;
camis | name | cuisine_description | ...
---------+-----------------------+---------------------+-----
... | ... | ... | ...
50066768 | FIRST LAMB SHABU | Chinese | ...
41450971 | GIOVANNI'S RESTAURANT | Pizza | ...
41450971 | GIOVANNI'S RESTAURANT | Italian | ...
41628459 | KFC | Chicken | ...
50043003 | BANGIA | Korean | ...
41418978 | BAGEL EXPRESS III | Bagels | ...
41418978 | BAGEL EXPRESS III | Pretzels | ...
... | ... | ... | ...
cuisine_num | camis | name | cuisine_description | ...
------------+----------+-----------------------+---------------------+-----
... | ... | ... | ... | ...
1 | 41418978 | BAGEL EXPRESS III | Bagels | ...
2 | 41418978 | BAGEL EXPRESS III | Pretzels | ...
1 | 41450971 | GIOVANNI'S RESTAURANT | Pizza | ...
2 | 41450971 | GIOVANNI'S RESTAURANT | Italian | ...
1 | 41628459 | KFC | Chicken | ...
1 | 50043003 | BANGIA | Korean | ...
1 | 50066768 | FIRST LAMB SHABU | Chinese | ...
... | ... | ... | ... | ...
ROW_NUMBER() OVER()
PARTITION BY col1, col2, ...
ORDER BY colA, colB, ...
SELECT
ROW_NUMBER() OVER (
PARTITION BY
-- group columns for numbering
camis,
name
ORDER BY
-- set ordering of results
camis,
name
) AS cuisine_num,
*
FROM (
SELECT
camis,
name,
REGEXP_SPLIT_TO_TABLE(cuisine_description, '/')
AS cuisine_description
FROM
restaurant_inspection;
cuisine_num | camis | name | cuisine_description | ...
------------+----------+-----------------------+---------------------+-----
... | ... | ... | ... | ...
1 | 41418978 | BAGEL EXPRESS III | Bagels | ...
2 | 41418978 | BAGEL EXPRESS III | Pretzels | ...
1 | 41450971 | GIOVANNI'S RESTAURANT | Pizza | ...
2 | 41450971 | GIOVANNI'S RESTAURANT | Italian | ...
1 | 41628459 | KFC | Chicken | ...
1 | 50043003 | BANGIA | Korean | ...
1 | 50066768 | FIRST LAMB SHABU | Chinese | ...
... | ... | ... | ... | ...
Cleaning Data in PostgreSQL Databases