Scaling and Optimizing Data Pipelines with Polars
Liam Brannigan
Data Scientist & Polars Contributor
events = pl.read_parquet("chicago_events.parquet")
events.select("event_title","tags","visitors","profile","price").head(5)
shape: (5, 5)
| event_title | tags | visitors | profile | price |
| --- | --- | --- | --- | --- |
| str | list[str] | i64 | i64 | f64 |
|--------------------|-----------------|----------|---------|-------|
| Folk Festival | ["crafts",...] | 2600 | 3 | 25.0 |
| Fireworks Night | ["family",...] | 32000 | 4 | 0.0 |
| Greektown Market | ["food",...] | 1900 | 3 | 0.0 |
| Rail History Day | ["history",...] | 880 | 1 | 7.5 |
| Grant Park Concert | ["family",...] | 50000 | 5 | 35.0 |
events.estimated_size()
179400000
events.estimated_size("mb")
171.1
gb for gigabytes
tb for terabytes
events_cat = events.with_columns(
pl.col("area").cast(pl.Categorical),
)
events_cat = events.with_columns(
pl.col("area").cast(pl.Categorical),
pl.col("tags").cast(pl.List(pl.Categorical))
)
events_cat = events.with_columns(
pl.col("area").cast(pl.Categorical),
pl.col("tags").cast(pl.List(pl.Categorical)),
pl.col("venue_context").struct.with_fields(
)
)
events_cat = events.with_columns(
pl.col("area").cast(pl.Categorical),
pl.col("tags").cast(pl.List(pl.Categorical)),
pl.col("venue_context").struct.with_fields(
pl.field(["venue_type","venue_space"]).cast(pl.Categorical)
)
)
events_cat.select("event_title","area","tags","venue_context").head(3)
shape: (3, 4)
| event_title | area | tags | venue_context |
| --- | --- | --- | --- |
| str | cat | list[cat] | struct[2] |
|------------------|-----------|---------------------------|------------------------|
| Chef Showcase | West Loop | ["food", "chef_demo"] | {"Food Hall", ...} |
| Fireworks Night | Downtown | ["nightlife", "family"] | {"Pier", "Mixed"} |
| Rail History Day | Pullman | ["history", "family",...] | {"Historic Site", ...} |
events.estimated_size("mb")
171.1
events_cat.estimated_size("mb")
146.4
events.select("event_title","visitors","profile","price").head()
shape: (5, 4)
| event_title | visitors | profile | price |
| --- | --- | --- | --- |
| str | i64 | i64 | f64 |
|--------------------|----------|---------|-------|
| Folk Festival | 2600 | 3 | 25.0 |
| Fireworks Night | 32000 | 4 | 0.0 |
| Greektown Market | 1900 | 3 | 0.0 |
| Rail History Day | 880 | 1 | 7.5 |
| Grant Park Concert | 50000 | 5 | 35.0 |
events.select(
pl.col("visitors").max(),
pl.col("visitors").upper_bound().alias("upper_bound"),
pl.col("visitors").lower_bound().alias("lower_bound"),
)
shape: (1, 3)
| visitors | upper_bound | lower_bound |
| --- | --- | --- |
| i64 | i64 | i64 |
|----------|----------------------|----------------------|
| 1000000 | 9223372036854775807 | -9223372036854775808 |
events.select(
pl.col("visitors").max(),
pl.col("visitors").cast(pl.Int32).upper_bound().alias("int32"),
pl.col("visitors").cast(pl.Int16).upper_bound().alias("int16"),
pl.col("visitors").cast(pl.Int8).upper_bound().alias("int8")
)
shape: (1, 4)
| visitors | int32 | int16 | int8 |
| --- | --- | --- | --- |
| i64 | i32 | i16 | i8 |
|----------|------------|-------|------|
| 1000000 | 2147483647 | 32767 | 127 |
events.select(pl.col("event_title","price")).head()
shape: (5, 2)
| event_title | price |
| --- | --- |
| str | f64 |
|--------------------|-------|
| Folk Festival | 25.00 |
| Fireworks Night | 0.00 |
| Greektown Market | 0.00 |
| Rail History Day | 7.50 |
| Grant Park Concert | 35.00 |
events.select(
pl.col("visitors").cast(pl.Int32),
pl.col("profile").cast(pl.Int8),
pl.col("price").cast(pl.Float32)
).head(3)
shape: (3, 3)
| visitors | profile | price |
| --- | --- | --- |
| i32 | i8 | f32 |
|----------|---------|-------|
| 2600 | 3 | 25.0 |
| 32000 | 4 | 0.0 |
| 1900 | 3 | 0.0 |
events.with_columns(
pl.col("area").cast(pl.Categorical),
pl.col("tags").cast(pl.List(pl.Categorical)),
pl.col("venue_context").struct.with_fields(
pl.field(["venue_type","venue_space"]).cast(pl.Categorical)
),
pl.col("visitors").cast(pl.Int32),
pl.col("profile").cast(pl.Int8),
pl.col("price").cast(pl.Float32)
).estimated_size("mb")
131.1
Scaling and Optimizing Data Pipelines with Polars