Cleaning Data in PostgreSQL Databases
Darryl Reeves, Ph.D.
Industry Assistant Professor, New York University
camis | name | score | inspection_type | ...
---------+---------------------------------+-------+---------------------------------------+-----
... | ... | ... | ... | ...
41659848 | LA BRISA DEL CIBAO | 20 | Cycle Inspection / Initial Inspection | ...
40961447 | MESON SEVILLA RESTAURANT | 50 | Cycle Inspection / Initial Inspection | ...
50063071 | WA BAR | 15 | Cycle Inspection / Initial Inspection | ...
50095871 | ALPHONSO'S PIZZERIA & TRATTORIA | 10 | Cycle Inspection / Initial Inspection | ...
41104041 | THE SPARROW TAVERN | 13 | Cycle Inspection / Initial Inspection | ...
... | ... | ... | ... | ...
SELECT
column_name,
data_type
FROM
information_schema.columns
WHERE
table_name = 'restaurant_inspection';
column_name | data_type
-----------------+-----------
camis | bigint
name | text
boro | text
building | text
street | text
zip_code | smallint
... | ...
SELECT
column_name,
data_type
FROM
information_schema.columns
WHERE
table_name = 'restaurant_inspection' AND
column_name = 'camis';
column_name | data_type
------------+-----------
camis | bigint
CASE
clauseA
, B
, and C
A
= 3, B
= 2, C
= 1SELECT
boro,
AVG(grade_points)
FROM (
SELECT
*,
CASE
WHEN grade = 'A' then 3
WHEN grade = 'B' then 2
WHEN grade = 'C' then 1
END AS grade_points
FROM
restaurant_inspection
) sub
GROUP BY boro;
SELECT
boro,
AVG(grade_points)
FROM (
SELECT
*,
CASE
WHEN grade = 'A' then 3
WHEN grade = 'B' then 2
WHEN grade = 'C' then 1
END AS grade_points
FROM
restaurant_inspection
) sub
GROUP BY boro;
boro | avg
--------------+--------------------
Brooklyn | 2.7641196013289037
Bronx | 2.7685589519650655
Manhattan | 2.7678381256656017
Queens | 2.7803571428571429
Staten Island | 2.8068181818181818
camis | diff
---------+------
... | ...
50080214 | 87
50059239 | 74
50086316 | 74
41637438 | 71
41667902 | 64
50067622 | 61
50017111 | 60
50017056 | 60
50045240 | 59
50002403 | 59
... | ...
SELECT
camis,
MAX(score) - MIN(score) AS diff
FROM
restaurant_inspection
WHERE
score IS NOT NULL
GROUP BY
camis
ORDER BY
diff DESC;
CAST( value AS type)
SELECT
camis,
MAX(CAST(score AS int)) - MIN(CAST(score AS int)) AS diff
FROM
restaurant_inspection
WHERE
score IS NOT NULL
GROUP BY
camis
ORDER BY
diff DESC
value::type
SELECT
camis,
MAX(score::int) - MIN(score::int) AS diff
FROM
restaurant_inspection
WHERE
score IS NOT NULL
GROUP BY
camis
ORDER BY
diff DESC
Cleaning Data in PostgreSQL Databases