Cleaning Data in SQL Server Databases
Miriam Antona
Software Engineer
| registration_code | airport_code | carrier_code | canceled | on_time | delayed | diverted |
|-------------------|--------------|--------------|----------|---------|---------|----------|
| ... | ... | ... | ... | ... | ... | ... |
| 000000119 | JFK | AA | 74 | 819 | 233 | 13 |
| 000000130 | JFK | HA | NULL | NULL | NULL | NULL |
| 000000131 | JFK | HA | NULL | NULL | NULL | NULL |
| 000000132 | MSP | YV | 0 | 6 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| registration_code | airport_code | carrier_code | canceled | on_time | delayed | diverted |
|-------------------|--------------|--------------|----------|---------|---------|----------|
| ... | ... | ... | ... | ... | ... | ... |
| 000000119 | JFK | AA | 74 | 819 | 233 | 13 |
| 000000130 | JFK | HA | | | | |
| 000000131 | JFK | HA | | | | |
| 000000132 | MSP | YV | 0 | 6 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| airport_code | airport_name | airport_city | airport_state |
|--------------|-----------------------------------------|-------------------|---------------|
| ... | ... | ... | ... |
| DFW | Dallas/Fort Worth International | Dallas/Fort Worth | Texas |
| BOS | Logan International | Boston | Massachusetts |
| SEA | Seattle/Tacoma International | NULL | NULL |
| PHL | Philadelphia International | Philadelphia | NULL |
| SLC | Salt Lake City International | NULL | Utah |
| MCO | Orlando International | Orlando | Florida |
| TPA | Tampa International | Tampa | Fl |
| FLL | Fort Lauderdale-Hollywood International | Fort Lauderdale | FL |
| ... | ... | ... | ... |
SELECT * FROM airports
WHERE airport_state IS NOT NULL
| airport_code | airport_name | airport_city | airport_state |
|--------------|-----------------------------------------|-------------------|---------------|
| ... | ... | ... | ... |
| DFW | Dallas/Fort Worth International | Dallas/Fort Worth | Texas |
| BOS | Logan International | Boston | Massachusetts |
| SLC | Salt Lake City International | NULL | Utah |
| MCO | Orlando International | Orlando | Florida |
| TPA | Tampa International | Tampa | Fl |
| FLL | Fort Lauderdale-Hollywood International | Fort Lauderdale | FL |
| ... | ... | ... | ... |
SELECT * FROM airports
WHERE airport_state IS NULL
| airport_code | airport_name | airport_city | airport_state |
|--------------|------------------------------|--------------|---------------|
| SEA | Seattle/Tacoma International | NULL | NULL |
| PHL | Philadelphia International | Philadelphia | NULL |
SELECT * FROM airports
| airport_code | airport_name | airport_city | airport_state |
|--------------|-----------------------------------------|-------------------|---------------|
| ... | ... | ... | ... |
| DFW | Dallas/Fort Worth International | Dallas/Fort Worth | Texas |
| BOS | Logan International | Boston | Massachusetts |
| SEA | Seattle/Tacoma International | NULL | |
| PHL | Philadelphia International | Philadelphia | |
| SLC | Salt Lake City International | NULL | Utah |
| MCO | Orlando International | Orlando | Florida |
| TPA | Tampa International | Tampa | Fl |
| FLL | Fort Lauderdale-Hollywood International | Fort Lauderdale | FL |
| ... | ... | ... | ... |
SELECT * FROM airports
WHERE airport_state <> ''
| airport_code | airport_name | airport_city | airport_state |
|--------------|-----------------------------------------|-------------------|---------------|
| ... | ... | ... | ... |
| DFW | Dallas/Fort Worth International | Dallas/Fort Worth | Texas |
| BOS | Logan International | Boston | Massachusetts |
| SLC | Salt Lake City International | NULL | Utah |
| MCO | Orlando International | Orlando | Florida |
| TPA | Tampa International | Tampa | Fl |
| FLL | Fort Lauderdale-Hollywood International | Fort Lauderdale | FL |
| ... | ... | ... | ... |
ISNULL ( check_expression , replacement_value )
SELECT
airport_code,
airport_name,
airport_city,
ISNULL(airport_state, 'Unknown') AS airport_state
FROM airports
| airport_code | airport_name | airport_city | airport_state |
|--------------|-----------------------------------------|-----------------|---------------|
| ... | ... | ... | ... |
| SEA | Seattle/Tacoma International | NULL | Unknown |
| PHL | Philadelphia International | Philadelphia | Unknown |
| SLC | Salt Lake City International | NULL | Utah |
| MCO | Orlando International | Orlando | Florida |
| TPA | Tampa International | Tampa | Fl |
| FLL | Fort Lauderdale-Hollywood International | Fort Lauderdale | FL |
| ... | ... | ... | ... |
| registration_code | airport_code | carrier_code | canceled |
|-------------------|--------------|--------------|----------|
| ... | ... | ... | ... |
| 000000128 | JFK | B6 | 181 |
| 000000129 | JFK | EV | 18 |
| 000000130 | JFK | HA | NULL |
| 000000131 | JFK | HA | NULL |
| 000000132 | MSP | YV | 0 |
| 000000133 | MSP | AA | 15 |
| ... | ... | ... | ... |
NULL
with the average.SELECT registration_code, airport_code, carrier_code,
ISNULL(canceled, (SELECT AVG(canceled) FROM flight_statistics)) AS canceled_fixed
FROM flight_statistics
GROUP BY registration_code, airport_code, carrier_code, canceled
| registration_code | airport_code | carrier_code | canceled |
|-------------------|--------------|--------------|----------|
| ... | ... | ... | ... |
| 000000128 | JFK | B6 | 181 |
| 000000129 | JFK | EV | 18 |
| 000000130 | JFK | HA | 65 |
| 000000131 | JFK | HA | 65 |
| 000000132 | MSP | YV | 0 |
| 000000133 | MSP | AA | 15 |
| ... | ... | ... | ... |
COALESCE ( arg1, arg2, arg3, ... )
SELECT
airport_code,
airport_city,
airport_state,
COALESCE (airport_state, airport_city, 'Unknown') AS airport_state_fixed
FROM airports
| airport_code | airport_city | airport_state | airport_state_fixed |
|--------------|--------------|---------------|---------------------|
| ... | ... | ... | ... |
| SLC | NULL | Utah | Utah |
| PHL | Philadelphia | NULL | Philadelphia |
| SEA | NULL | NULL | Unknown |
| ... | ... | ... | ... |
Cleaning Data in SQL Server Databases