Parallel Programming with Dask in Python
James Fulton
Climate Informatics Researcher
# Unstructured text data
string_list = [
"Really good service ...",
"This is the second time we've stayed ...",
"Great older hotel. My husband took ...",
...]
# Semi-structured dictionary data
dict_list = [
{"name": "Beth", "employment": [{"role": "manager", "start_date": ...}, ...]},
{"name": "Omar", "employment": [{"role": "analyst", "start_date": ...}, ...]},
{"name": "Fang", "employment": [{"role": "engineer", "start_date": ...}, ...]},
...]
import dask.bag as db
# Create Dask bag from list
bag_example = db.from_sequence(string_list, npartitions=5)
print(bag_example)
dask.bag<from_sequence, npartitions=5>
# Print single element from bag
print(bag_example.take(1))
('Really good service ...',)
import dask.bag as db
# Create Dask bag from list
bag_example = db.from_sequence(string_list, npartitions=5)
print(bag_example)
dask.bag<from_sequence, npartitions=5>
# Print two elements from bag
print(bag_example.take(2))
('Really good service ...', "This is the second time we've stayed ...")
number_of_elements = bag_example.count()
print(number_of_elements)
<dask.bag.core.Item at ...>
print(number_of_elements.compute())
20491
import glob
filenames = glob.glob('data/*.txt')
print(filenames)
["data/file_0.txt", "data/file_1.txt", "data/file_2.txt"]
text_data_bag = db.read_text(filenames)
text_data_bag = db.read_text('data/*.txt')
print(text_data_bag)
dask.bag<bag-from-delayed, npartitions=3>
text_data_bag = db.read_text('data/*.txt')
print(text_data_bag.take(1))
('Really good service ...',)
# Convert the text to lower case
print(text_data_bag.str.lower().take(1))
('really good service ...',)
# Change 'good' to 'great' in all places
print(text_data_bag.str.replace('good', 'great').take(1))
('Really great service ...',)
# How many times does 'great' appear in the first 3 elements of the bag?
print(text_data_bag.str.count('great').take(3))
(0, 1, 5)
Parallel Programming with Dask in Python