Pemrograman Paralel dengan Dask di Python
James Fulton
Climate Informatics Researcher


# One CSV of tracks per year, from 2005 through 2020 inclusive.
files = [f'{year}_tracks.csv' for year in range(2005, 2021)]
name duration_ms release_date ...
0 Aldrig (feat. Carmon) 247869 2019-01-01 ...
2 2019 - The Year to Build 288105 2019-01-01 ...
3 Na zawsze 186812 2019-01-01 ...
4 Humo en la Trampa 258354 2019-01-01 ...
5 Au Au 176000 2019-01-01 ...
... ... ... ... ...
import pandas as pd
maximums = []
for file in files:
    # Load each file
    df = pd.read_csv(file)
    # Find the maximum track duration in this file
    max_length = df['duration_ms'].max()
    # Store the per-file maximum
    maximums.append(max_length)
# Find the maximum across all per-file maximums
absolute_maximum = max(maximums)
import pandas as pd
# NOTE: assumes `delayed` (from dask) was imported earlier.
maximums = []
for file in files:
    # Load each file
    df = delayed(pd.read_csv)(file)  # <------- delay the loading
    # Find the maximum track duration in this file
    max_length = df['duration_ms'].max()
    # Store the per-file maximum
    maximums.append(max_length)
# Find the maximum across all per-file maximums
absolute_maximum = max(maximums)
import pandas as pd
# NOTE: assumes `delayed` (from dask) was imported earlier.
maximums = []
for file in files:
    # Load each file
    df = delayed(pd.read_csv)(file)  # <------- delay the loading
    # Find the maximum track duration in this file
    max_length = df['duration_ms'].max()
    # Store the per-file maximum
    maximums.append(max_length)
# Find the maximum across all per-file maximums
absolute_maximum = delayed(max)(maximums)  # <------- delay the max() function
import pandas as pd
# NOTE: assumes `delayed` (from dask) was imported earlier.
maximums = []
for file in files:
    df = delayed(pd.read_csv)(file)
    # Use the .max() method
    max_length = df['duration_ms'].max()
    maximums.append(max_length)
absolute_maximum = delayed(max)(maximums)
# Prints the delayed object for the last file, not a computed number
print(max_length)
Delayed('max-0602855d-3ee6-4c43-a4d2-...')
print(df.shape)
print(df.shape.compute())
Delayed('getattr-bc1e8838ab...')
(11907, 12)
import pandas as pd
# NOTE: assumes `delayed` (from dask) was imported earlier.
maximums = []
for file in files:
    df = delayed(pd.read_csv)(file)
    # Use a method that does not exist
    max_length = df['duration_ms'].fake()
    maximums.append(max_length)
absolute_maximum = delayed(max)(maximums)
print(max_length)
Delayed('fake-6c026036-5daf-4b2-...')
Kesalahan baru muncul saat .compute() dipanggil:
print(max_length.compute())
...
AttributeError: 'Series' object has no
attribute 'fake'
import pandas as pd
# NOTE: assumes `delayed` (from dask) was imported earlier.
maximums = []
for file in files:
    df = delayed(pd.read_csv)(file)
    max_length = df['duration_ms'].max()
    # Add the delayed object to the list
    maximums.append(max_length)
# Run the delayed max on the list of delayed objects
absolute_maximum = delayed(max)(maximums)
maximums adalah daftar objek tertunda
print(maximums)
[Delayed('max-80b...'),
Delayed('max-fa15d...'),
...]
import pandas as pd
# NOTE: assumes `dask` and `delayed` (from dask) were imported earlier.
maximums = []
for file in files:
    df = delayed(pd.read_csv)(file)
    max_length = df['duration_ms'].max()
    # Add the delayed object to the list
    maximums.append(max_length)
# Compute all the per-file maximums
all_maximums = dask.compute(maximums)
print(all_maximums)
([2539418, 4368000, ...
... 4511716, 4864333],)
import pandas as pd
# NOTE: assumes `dask` and `delayed` (from dask) were imported earlier.
maximums = []
for file in files:
    df = delayed(pd.read_csv)(file)
    max_length = df['duration_ms'].max()
    maximums.append(max_length)
# Compute all the per-file maximums; dask.compute returns a tuple,
# so take its first element to get the list itself
all_maximums = dask.compute(maximums)[0]
print(all_maximums)
[2539418, 4368000, ...
... 4511716, 4864333]
def get_max_track(df):
    """Return the maximum of the ``duration_ms`` column of *df*.

    *df* may be any object that supports ``df['duration_ms'].max()``,
    such as a pandas DataFrame.
    """
    return df['duration_ms'].max()
# NOTE: assumes `pd` (pandas) and `delayed` (from dask) were imported earlier.
# Start from an empty list so results from earlier runs are not mixed in
maximums = []
for file in files:
    df = delayed(pd.read_csv)(file)
    # Use the function to find the maximum
    max_length = get_max_track(df)
    maximums.append(max_length)
absolute_maximum = delayed(max)(maximums)
absolute_maximum.visualize()

Pemrograman Paralel dengan Dask di Python