Parallel Programming with Dask in Python
James Fulton
Climate Informatics Researcher
example_0.json
{"name": "Beth", "employment": [{"role": "manager", "start_date": ...}, ...], ...}
{"name": "Omar", "employment": [{"role": "analyst", "start_date": ...}, ...], ...}
{"name": "Fang", "employment": [{"role": "engineer", "start_date": ...}, ...], ...}
...
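The examples below operate on a Dask bag of these dictionaries, called dict_bag. A minimal sketch of how such a bag might be built, assuming the records are stored one JSON object per line in files named example_0.json, example_1.json, and so on:

import json
import dask.bag as db

# Read each file's contents as lines of text, then parse each JSON string into a dictionary
text_bag = db.read_text('example_*.json')
dict_bag = text_bag.map(json.loads)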
def add_number_of_jobs(employee_dict):
    # Add a count of previous jobs based on the employment history
    employee_dict['number_of_previous_jobs'] = len(employee_dict['employment'])
    return employee_dict
dict_bag = dict_bag.map(add_number_of_jobs)
def delete_dictionary_entry(dictionary, key_to_drop):
    # Remove the given key from the dictionary
    del dictionary[key_to_drop]
    return dictionary
dict_bag = dict_bag.map(delete_dictionary_entry, key_to_drop='employment')
def filter_dictionary(dictionary, keys_to_keep):
    # Build a new dictionary containing only the requested keys
    new_dict = {}
    for k in keys_to_keep:
        new_dict[k] = dictionary[k]
    return new_dict
dict_bag = dict_bag.map(
    filter_dictionary,
    keys_to_keep=['name', 'number_of_previous_jobs']
)
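Each of these .map() calls is lazy: Dask records the transformation but does not run it yet. The three steps could equally be written as a single chain; a sketch of that equivalent form:

# Equivalent chained form of the three map steps above (sketch)
dict_bag = (
    dict_bag
    .map(add_number_of_jobs)
    .map(delete_dictionary_entry, key_to_drop='employment')
    .map(filter_dictionary, keys_to_keep=['name', 'number_of_previous_jobs'])
)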
print(dict_bag.take(1))
({'name': 'Beth',
  'number_of_previous_jobs': 3},)
converted_bag_df = dict_bag.to_dataframe()
print(converted_bag_df)
name number_of_previous_jobs
npartitions=3
object float64
... ...
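Printing the converted DataFrame only shows its structure, the number of partitions and the column dtypes, because, like the bag, it is lazy. A short sketch of how the actual values could be materialized:

# compute() runs the pipeline and returns a regular pandas DataFrame
result_df = converted_bag_df.compute()
print(result_df.head())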