Introduction to Testing in Python
Alexander Levin
Data Scientist
Data: salaries in data science.
Each row describes one data science worker: their salary, job title, and other attributes.
Pipeline: to get the mean salary:
import pandas as pd
# Fixture that supplies the salary data to the tests
@pytest.fixture
def read_df():
    """Load 'ds_salaries.csv' into a pandas DataFrame and return it."""
    salaries = pd.read_csv('ds_salaries.csv')
    return salaries
# Keep only the rows whose employment type is 'FT'
def filter_df(df):
    """Return the rows of ``df`` where 'employment_type' equals 'FT'."""
    # Boolean mask: True for every row to keep
    is_ft = df['employment_type'].eq('FT')
    return df.loc[is_ft]
# Average of the 'salary_in_usd' column
def get_mean(df):
    """Return the mean of the 'salary_in_usd' column of ``df``."""
    salaries = df['salary_in_usd']
    return salaries.mean()
Test cases:
Code:
def test_read_df(read_df):
    """Check that the ``read_df`` fixture yields a non-empty DataFrame."""
    # The loaded object must be a pandas DataFrame
    loaded_is_frame = isinstance(read_df, pd.DataFrame)
    assert loaded_is_frame
    # At least one row must have been read
    n_rows = len(read_df)
    assert n_rows > 0
Example of checking that Python can create files.
def test_write():
    """Check that Python can create a file on disk.

    Writes a short text to 'temp.txt' in the current working directory,
    asserts that the file exists, and removes it afterwards.
    """
    # Local import: `os` is not imported in the visible part of this file
    import os

    try:
        # Opening a file in writing mode
        with open('temp.txt', 'w') as wfile:
            # Writing the text to the file
            wfile.write('Testing stuff is awesome')
        # Checking the file exists
        assert os.path.exists('temp.txt')
    finally:
        # Clean up after ourselves even if the write or the assertion fails
        if os.path.exists('temp.txt'):
            os.remove('temp.txt')
Test cases:
get_mean() function returns a number
Code:
def test_units(read_df):
    """Unit checks for ``filter_df`` and ``get_mean``."""
    filtered = filter_df(read_df)
    # Compare as a plain list: `unique()` returns an array, and asserting on
    # `array == list` raises ValueError (ambiguous truth value) as soon as
    # more than one distinct value is present, instead of failing cleanly.
    assert list(filtered['employment_type'].unique()) == ['FT']
    # The mean must come back as a float
    assert isinstance(get_mean(filtered), float)
Test cases:
Code:
def test_feature(read_df):
    """Sanity-check the mean computed by the filter-then-mean pipeline."""
    # Run the pipeline once and reuse the result in both checks
    subset = filter_df(read_df)
    mean_salary = get_mean(subset)
    # Test case: the mean must be strictly positive
    assert mean_salary > 0
    # Test case: the mean cannot exceed the largest salary in the unfiltered data
    top_salary = read_df['salary_in_usd'].max()
    assert mean_salary <= top_salary
Test cases:
Code:
def test_performance(benchmark, read_df):
    """Run the filter-then-mean pipeline through the ``benchmark`` callable."""
    # Function to measure
    def run_pipeline():
        subset = filter_df(read_df)
        return get_mean(subset)

    # Same as decorating with @benchmark: the function is handed to `benchmark`
    benchmark(run_pipeline)
import pytest
## Integration Tests
def test_read_df(read_df):
    """Integration test: the ``read_df`` fixture yields a non-empty DataFrame."""
    # Type check on the loaded object
    frame_ok = isinstance(read_df, pd.DataFrame)
    assert frame_ok
    # The data must contain at least one row
    row_count = len(read_df)
    assert row_count > 0
def test_write():
    """Integration test: a file can be created in the working directory.

    Writes '12345' to 'temp.txt', asserts that the file exists, and removes
    it afterwards.
    """
    # Local import: `os` is not imported in the visible part of this file
    import os

    try:
        with open('temp.txt', 'w') as wfile:
            wfile.write('12345')
        assert os.path.exists('temp.txt')
    finally:
        # Remove the file even if the write or the assertion fails
        if os.path.exists('temp.txt'):
            os.remove('temp.txt')
## Unit Tests
def test_units(read_df):
    """Unit tests for ``filter_df`` and ``get_mean``."""
    filtered = filter_df(read_df)
    # `unique()` returns an array; asserting on `array == list` raises
    # ValueError (ambiguous truth value) when several distinct values are
    # present, so convert to a list to get a clean pass/fail.
    assert list(filtered['employment_type'].unique()) == ['FT']
    # The mean must be returned as a float
    assert isinstance(get_mean(filtered), float)
## Feature Tests
def test_feature(read_df):
    """Feature test: the pipeline's mean lies in a plausible range."""
    # Filter the data and compute the mean once
    kept_rows = filter_df(read_df)
    pipeline_mean = get_mean(kept_rows)
    # Test case: mean is strictly positive
    assert pipeline_mean > 0
    # Test case: mean does not exceed the maximum of the unfiltered data
    overall_max = read_df['salary_in_usd'].max()
    assert pipeline_mean <= overall_max
## Performance Tests
def test_performance(benchmark, read_df):
    """Performance test: pass the whole pipeline to the ``benchmark`` callable."""
    # Function to measure
    def measured_pipeline():
        kept_rows = filter_df(read_df)
        return get_mean(kept_rows)

    # Direct call, equivalent to applying `benchmark` as a decorator
    benchmark(measured_pipeline)
Introduction to Testing in Python