Reducing memory pressure

Scaling and Optimizing Data Pipelines with Polars

Liam Brannigan

Data Scientist & Polars Contributor

Events dataset

events = pl.read_parquet("chicago_events.parquet")
Scaling and Optimizing Data Pipelines with Polars

Events dataset

events.select("event_title","tags","visitors","profile","price").head(5)
shape: (5, 5)
| event_title        | tags            | visitors | profile | price |
| ---                | ---             | ---      | ---     | ---   |
| str                | list[str]       | i64      | i64     | f64   |
|--------------------|-----------------|----------|---------|-------|
| Folk Festival      | ["crafts",...]  | 2600     | 3       | 25.0  |
| Fireworks Night    | ["family",...]  | 32000    | 4       | 0.0   |
| Greektown Market   | ["food",...]    | 1900     | 3       | 0.0   |
| Rail History Day   | ["history",...] | 880      | 1       | 7.5   |
| Grant Park Concert | ["family",...]  | 50000    | 5       | 35.0  |
Scaling and Optimizing Data Pipelines with Polars

Estimating memory use

events.estimated_size()
179400000
Scaling and Optimizing Data Pipelines with Polars

Estimating memory use

events.estimated_size("mb")
171.1
  • Pass "gb" instead of "mb" for gigabytes
  • Pass "tb" for terabytes
Scaling and Optimizing Data Pipelines with Polars

Encoding repeated strings

events_cat = events.with_columns(
    pl.col("area").cast(pl.Categorical),




)
Scaling and Optimizing Data Pipelines with Polars

Encoding repeated strings

events_cat = events.with_columns(
    pl.col("area").cast(pl.Categorical),
    pl.col("tags").cast(pl.List(pl.Categorical))



)
Scaling and Optimizing Data Pipelines with Polars

Encoding repeated strings

events_cat = events.with_columns(
    pl.col("area").cast(pl.Categorical),
    pl.col("tags").cast(pl.List(pl.Categorical)), 
    pl.col("venue_context").struct.with_fields(

    )
)
Scaling and Optimizing Data Pipelines with Polars

Encoding repeated strings

events_cat = events.with_columns(
    pl.col("area").cast(pl.Categorical),
    pl.col("tags").cast(pl.List(pl.Categorical)), 
    pl.col("venue_context").struct.with_fields(
        pl.field(["venue_type","venue_space"]).cast(pl.Categorical)
    )
)
Scaling and Optimizing Data Pipelines with Polars

Encoding repeated strings

events_cat.select("event_title","area","tags","venue_context").head(3)
shape: (3, 4)
| event_title      | area      | tags                      | venue_context          |
| ---              | ---       | ---                       | ---                    |
| str              | cat       | list[cat]                 | struct[2]              |
|------------------|-----------|---------------------------|------------------------|
| Chef Showcase    | West Loop | ["food", "chef_demo"]     | {"Food Hall", ...}     |
| Fireworks Night  | Downtown  | ["nightlife", "family"]   | {"Pier", "Mixed"}      |
| Rail History Day | Pullman   | ["history", "family",...] | {"Historic Site", ...} |
Scaling and Optimizing Data Pipelines with Polars

Encoding repeated strings

events.estimated_size("mb")
171.1
events_cat.estimated_size("mb")
146.4
Scaling and Optimizing Data Pipelines with Polars

Numeric event data

events.select("event_title","visitors","profile","price").head()
shape: (5, 4)
| event_title        | visitors | profile | price |
| ---                | ---      | ---     | ---   |
| str                | i64      | i64     | f64   |
|--------------------|----------|---------|-------|
| Folk Festival      | 2600     | 3       | 25.0  |
| Fireworks Night    | 32000    | 4       | 0.0   |
| Greektown Market   | 1900     | 3       | 0.0   |
| Rail History Day   | 880      | 1       | 7.5   |
| Grant Park Concert | 50000    | 5       | 35.0  |
Scaling and Optimizing Data Pipelines with Polars

Integer range

events.select(
    pl.col("visitors").max(),
    pl.col("visitors").upper_bound().alias("upper_bound"),
    pl.col("visitors").lower_bound().alias("lower_bound"),
)
shape: (1, 3)
| visitors | upper_bound          | lower_bound          |
| ---      | ---                  | ---                  |
| i64      | i64                  | i64                  |
|----------|----------------------|----------------------|
| 1000000  | 9223372036854775807  | -9223372036854775808 |
Scaling and Optimizing Data Pipelines with Polars

Integer range

events.select(
    pl.col("visitors").max(),
    pl.col("visitors").cast(pl.Int32).upper_bound().alias("int32"),
    pl.col("visitors").cast(pl.Int16).upper_bound().alias("int16"), 
    pl.col("visitors").cast(pl.Int8).upper_bound().alias("int8")
)
shape: (1, 4)
| visitors | int32      | int16 | int8 |
| ---      | ---        | ---   | ---  |
| i64      | i32        | i16   | i8   |
|----------|------------|-------|------|
| 1000000  | 2147483647 | 32767 | 127  |
Scaling and Optimizing Data Pipelines with Polars

Float precision

events.select(pl.col("event_title","price")).head()
shape: (5, 2)
| event_title        | price |
| ---                | ---   |
| str                | f64   |
|--------------------|-------|
| Folk Festival      | 25.00 |
| Fireworks Night    | 0.00  |
| Greektown Market   | 0.00  |
| Rail History Day   | 7.50  |
| Grant Park Concert | 35.00 |
Scaling and Optimizing Data Pipelines with Polars

Downcasting numeric columns

events.select(
    pl.col("visitors").cast(pl.Int32),
    pl.col("profile").cast(pl.Int8),
    pl.col("price").cast(pl.Float32)
).head(3)
shape: (3, 3)
| visitors | profile | price |
| ---      | ---     | ---   |
| i32      | i8      | f32   |
|----------|---------|-------|
| 2600     | 3       | 25.0  |
| 32000    | 4       | 0.0   |
| 1900     | 3       | 0.0   |
Scaling and Optimizing Data Pipelines with Polars

Measuring the memory change

events.with_columns(
    pl.col("area").cast(pl.Categorical),
    pl.col("tags").cast(pl.List(pl.Categorical)), 
    pl.col("venue_context").struct.with_fields(
        pl.field(["venue_type","venue_space"]).cast(pl.Categorical)
    ),
    pl.col("visitors").cast(pl.Int32),
    pl.col("profile").cast(pl.Int8),
    pl.col("price").cast(pl.Float32)
).estimated_size("mb")
131.1
Scaling and Optimizing Data Pipelines with Polars

Let's practice!

Scaling and Optimizing Data Pipelines with Polars

Preparing Video For Download...