Introduction to Dask bags

Parallel Programming with Dask in Python

James Fulton

Climate Informatics Researcher

What is unstructured data?

# Unstructured text data
string_list = [
    "Really good service ...",
    "This is the second time we've stayed ...", 
    "Great older hotel. My husband took ...",
    ...]

# Semi-structured dictionary data
dict_list = [
    {"name": "Beth", "employment": [{"role": "manager", "start_date": ...}, ...]},
    {"name": "Omar", "employment": [{"role": "analyst", "start_date": ...}, ...]},
    {"name": "Fang", "employment": [{"role": "engineer", "start_date": ...}, ...]},
    ...]

Dask bags

import dask.bag as db


# Create Dask bag from list
bag_example = db.from_sequence(string_list, npartitions=5)

print(bag_example)

dask.bag<from_sequence, npartitions=5>

# Print single element from bag
print(bag_example.take(1))

('Really good service ...',)

Dask bags

import dask.bag as db

# Create Dask bag from list
bag_example = db.from_sequence(string_list, npartitions=5)
print(bag_example)

dask.bag<from_sequence, npartitions=5>

# Print two elements from bag
print(bag_example.take(2))

('Really good service ...', "This is the second time we've stayed ...")

Number of elements

number_of_elements = bag_example.count()
print(number_of_elements)

<dask.bag.core.Item at ...>

print(number_of_elements.compute())

Loading in text data

import glob

filenames = glob.glob('data/*.txt')

print(filenames)

["data/file_0.txt", "data/file_1.txt", "data/file_2.txt"]

text_data_bag = db.read_text(filenames)

text_data_bag = db.read_text('data/*.txt')

print(text_data_bag)

dask.bag<bag-from-delayed, npartitions=3>

String operations

text_data_bag = db.read_text('data/*.txt')

print(text_data_bag.take(1))

('Really good service ...',)

# Convert the text to lower case
print(text_data_bag.str.lower().take(1))

('really good service ...',)

String operations

# Change 'good' to 'great' in all places
print(text_data_bag.str.replace('good', 'great').take(1))

('Really great service ...',)

# How many times does 'great' appear in each of the first 3 elements of the bag?
print(text_data_bag.str.count('great').take(3))

(0, 1, 5)

Let's practice!

Parallel Programming with Dask in Python