ETL and ELT in Python
Jake Roach
Data Engineer
import pandas as pd
# Read in the CSV file to a DataFrame
data_frame = pd.read_csv("raw_data.csv")
# Output the first few rows
data_frame.head()
name num_firms total_income
0 Advertising 58 3892.41
1 Apparel 39 5422.69
...
49 Trucking 35 17324.36
read_csv()
delimiter
, header
, engine
$$
.head()
n
number of a DataFrame name num_firms total_income
0 Advertising 58 3892.41
1 Apparel 39 5422.69
...
49 Trucking 35 17324.36
name num_firms
1 Apparel 39
37 Apparel 61
# First, by rows
data_frame.loc[data_frame["name"] == "Apparel", :]
# Then, by columns
data_frame.loc[:, ["name", "num_firms"]]
.loc
:
means "all"# Write a DataFrame to a .csv file
data_frame.to_csv("cleaned_data.csv")
.to_csv()
path
, creates DataFrame from file stored at that path
$$
Other options, like:
.to_json()
, .to_excel()
, .to_sql()
data_warehouse.execute( # Use Python clients or other tools to run SQL queries
"""
CREATE TABLE total_sales AS
SELECT
ds,
SUM(sales)
FROM raw_sales_data
GROUP BY ds;
"""
)
.execute()
to run SQL queries# Define extract(), transform(), and load() functions
...
def transform(data_frame, value):
return data_frame.loc[data_frame["name"] == value, ["name", "num_firms"]]
# First, extract data from a .csv
extracted_data = extract(file_name="raw_data.csv")
# Then, transform the `extracted_data`
transformed_data = transform(data_frame=extracted_data, value="Apparel")
# Finally, load the `transformed_data`
load(data_frame=transformed_data, file_name="cleaned_data.csv")
ETL and ELT in Python