Scaling and Optimizing Data Pipelines with Polars
Liam Brannigan
Data Scientist & Polars Contributor



(
pl.read_csv("311_Service_Requests.csv", try_parse_dates=True)
)
(
pl.read_csv("311_Service_Requests.csv", try_parse_dates=True)
.write_parquet("311_Service_Requests.parquet")
)
pl.read_parquet_schema(
"311_Service_Requests.parquet"
)
Schema({
"CREATED_DATE": Datetime,
"TYPE": String,
"STATUS": String,
"DEPARTMENT": String,
"WARD": Int64,
...
})
$$
$$

new_request = {"TYPE": "Pothole", "STATUS": "Open", ...}

(
pl.scan_parquet("311_Service_Requests.parquet")
.filter(pl.col("CREATED_DATE") < pl.datetime(2020, 1, 1))
)


(
pl.scan_parquet(
"311_Service_Requests.parquet",
)
)
(
pl.scan_parquet(
"311_Service_Requests.parquet",
parallel="row_groups",
)
)
Other parallel options: "columns", "prefiltered", "none"
department_counts.write_parquet(
"department_counts.parquet",
compression_level=3,
)
department_counts.write_parquet(
"department_counts.parquet",
compression_level=3,
row_group_size=25_000,
)
Scaling and Optimizing Data Pipelines with Polars