Membersihkan Data di Python
Adel Nehme
VP of AI Curriculum, DataCamp

Dapat berupa NA, nan, 0, . ...
Kesalahan teknis
Kesalahan manusia
import pandas as pd
airquality = pd.read_csv('airquality.csv')
print(airquality)
Date Temperature CO2
987 20/04/2004 16.8 0.0
2119 07/06/2004 18.7 0.8
2451 20/06/2004 -40.0 NaN
1984 01/06/2004 19.6 1.8
8299 19/02/2005 11.2 1.2
... ... ... ...
import pandas as pd
airquality = pd.read_csv('airquality.csv')
print(airquality)
Date Temperature CO2
987 20/04/2004 16.8 0.0
2119 07/06/2004 18.7 0.8
2451 20/06/2004 -40.0 NaN <--
1984 01/06/2004 19.6 1.8
8299 19/02/2005 11.2 1.2
... ... ... ...
# Kembalikan nilai hilang
airquality.isna()
Date Temperature CO2
987 False False False
2119 False False False
2451 False False True
1984 False False False
8299 False False False
# Ringkasan missingness
airquality.isna().sum()
Date 0
Temperature 0
CO2 366
dtype: int64
Paket untuk memvisualisasikan dan memahami data hilang
import missingno as msno import matplotlib.pyplot as plt# Visualisasikan missingness msno.matrix(airquality) plt.show()

# Pisahkan nilai hilang dan lengkap
airquality_missing = airquality[airquality['CO2'].isna()]
airquality_complete = airquality[~airquality['CO2'].isna()]
# Deskripsikan DataFrame lengkap
complete.describe()
Temperature CO2
count 8991.000000 8991.000000
mean 18.317829 1.739584
std 8.832116 1.537580
min -1.900000 0.000000
... ... ...
max 44.600000 11.900000
# Deskripsikan DataFrame yang hilang
missing.describe()
Temperature CO2
count 366.000000 0.0
mean -39.655738 NaN
std 5.988716 NaN
min -49.000000 NaN
... ... ...
max -30.000000 NaN
# Deskripsikan DataFrame lengkap
complete.describe()
Temperature CO2
count 8991.000000 8991.000000
mean 18.317829 1.739584
std 8.832116 1.537580
min -1.900000 0.000000
... ... ...
max 44.600000 11.900000
# Deskripsikan DataFrame yang hilang
missing.describe()
Temperature CO2
count 366.000000 0.0
mean -39.655738 NaN <--
std 5.988716 NaN
min -49.000000 NaN <--
... ... ...
max -30.000000 NaN <--
sorted_airquality = airquality.sort_values(by = 'Temperature')
msno.matrix(sorted_airquality)
plt.show()

sorted_airquality = airquality.sort_values(by = 'Temperature')
msno.matrix(sorted_airquality)
plt.show()





Pendekatan sederhana:
Pendekatan lebih kompleks:
airquality.head()
Date Temperature CO2
0 05/03/2005 8.5 2.5
1 23/08/2004 21.8 0.0
2 18/02/2005 6.3 1.0
3 08/02/2005 -31.0 NaN
4 13/03/2005 19.9 0.1
# Hapus nilai hilang
airquality_dropped = airquality.dropna(subset = ['CO2'])
airquality_dropped.head()
Date Temperature CO2
0 05/03/2005 8.5 2.5
1 23/08/2004 21.8 0.0
2 18/02/2005 6.3 1.0
4 13/03/2005 19.9 0.1
5 02/04/2005 17.0 0.8
co2_mean = airquality['CO2'].mean()
airquality_imputed = airquality.fillna({'CO2': co2_mean})
airquality_imputed.head()
Date Temperature CO2
0 05/03/2005 8.5 2.500000
1 23/08/2004 21.8 0.000000
2 18/02/2005 6.3 1.000000
3 08/02/2005 -31.0 1.739584
4 13/03/2005 19.9 0.100000
Membersihkan Data di Python