Cleaning Data in SQL Server Databases
Miriam Antona
Software Engineer
| airport_state |
|---------------|
| Caalifornia |
| California |
| Californiaa |
| Illinois |
| Ilynois |
| Tejas |
| Texas |
SOUNDEX
DIFFERENCE
SOUNDEX ( character_expression )
SELECT SOUNDEX('Illinois') AS soundex_code1;
SELECT SOUNDEX('Ilynois') AS soundex_code2;
SELECT SOUNDEX('California') AS soundex_code3;
| soundex_code1 | | soundex_code2 | | soundex_code3 |
|---------------| |---------------| |---------------|
| I452 | | I452 | | I416 |
"Illinois" -> I
"Illinois" -> Ill0n00s
Letters | Represented by |
---|---|
b, f, p, v | 1 |
c, g, j, k, q, s, x, z | 2 |
d, t | 3 |
l | 4 |
m, n | 5 |
r | 6 |
"Ill0n00s" -> I4405002
"I4405002" -> I40502
"I40502" -> I452
"I452" (don't apply)
SELECT SOUNDEX('Arizona') AS soundex_code1;
SELECT SOUNDEX('Arkansas') AS soundex_code2;
| soundex_code1 | | soundex_code2 |
|---------------| |---------------|
| A625 | | A625 |
SELECT DISTINCT A1.airport_state
FROM airports A1
INNER JOIN airports A2
ON SOUNDEX(A1.airport_state) = SOUNDEX(A2.airport_state)
AND A1.airport_state <> A2.airport_state
| airport_state |
|---------------|
| Caalifornia |
| California |
| Californiaa |
| Illinois |
| Ilynois |
| New Jersey |
| New York |
| Tejas |
| Texas |
SELECT DISTINCT A1.airport_state
FROM airports A1
INNER JOIN airports A2
ON SOUNDEX(REPLACE(A1.airport_state, ' ', '')) = SOUNDEX(REPLACE(A2.airport_state, ' ', ''))
AND A1.airport_state <> A2.airport_state
"New York" -> "NewYork"
| airport_state |
|---------------|
| Caalifornia |
| California |
| Californiaa |
| Illinois |
| Ilynois |
| Tejas |
| Texas |
DIFFERENCE ( character_expression , character_expression )
SELECT DIFFERENCE('Illinois', 'Ilynois') AS dif_1;
| dif1 |
|------|
| 4 |
SELECT DIFFERENCE('Illinois', 'California') AS dif_2;
| dif2 |
|------|
| 1 |
SELECT DISTINCT A1.airport_state, A2.airport_state
FROM airports A1
INNER JOIN airports A2
ON DIFFERENCE(REPLACE(A1.airport_state, ' ', ''), REPLACE(A2.airport_state, ' ', '')) = 4
AND A1.airport_state <> A2.airport_state
| airport_state | airport_state |
|---------------|---------------|
| Caalifornia | California |
| Caalifornia | Californiaa |
| California | Caalifornia |
| California | Californiaa |
| Californiaa | Caalifornia |
| Californiaa | California |
| Illinois | Ilynois |
| Ilynois | Illinois |
| Massachusetts | Michigan |
| Tejas | Texas |
| Texas | Tejas |
Cleaning Data in SQL Server Databases