Customer Analytics and A/B Testing in Python
Ryan Grossman
Data Scientist, EDO
(conv_rate - mean) / sd
# Purchase rate grouped by user and test group
results.head(n=10)
uid group purchase
11128497.0 V 0.000000
11145206.0 V 0.050000
11163353.0 C 0.150000
11215368.0 C 0.000000
11248473.0 C 0.157895
11258429.0 V 0.086957
11271484.0 C 0.071429
11298958.0 V 0.157895
11325422.0 C 0.045455
11340821.0 C 0.040000
# Break out our user groups: rows whose `group` column is 'V' and rows
# whose `group` column is 'C' (labelled 'Test' and 'Control' in the plot).
var = results[results.group == 'V']
con = results[results.group == 'C']

# Plot our conversion rate data for each group. Both histograms share one
# axes; alpha < 1 keeps the overlapping bars of each group visible.
plt.hist(var['purchase'], color='yellow', alpha=0.8, bins=50, label='Test')
plt.hist(con['purchase'], color='blue', alpha=0.8, bins=50, label='Control')
plt.legend(loc='upper right')
plt.axvline(): Draw a vertical line of the specified color
# Draw annotation lines at the mean values
# for each group
# One vertical line per group, each at that group's own mean purchase rate.
# (Using the mean of the whole `results.purchase` column for both calls
# would draw the two lines on top of each other.)
# NOTE(review): the colour-to-group mapping follows the order the groups are
# introduced elsewhere ('V' first, then 'C') -- confirm the intended colours.
plt.axvline(x=np.mean(results.purchase[results.group == 'V']),
            color='red')
plt.axvline(x=np.mean(results.purchase[results.group == 'C']),
            color='green')
plt.show()
# Use our mean values to calculate the variance of each conversion-rate
# estimate, p * (1 - p) / n.
# NOTE(review): 58583 and 56350 are presumably the control and test group
# sizes -- confirm against the data.
mean_con = 0.090965
mean_test = 0.102005
var_con = (mean_con * (1 - mean_con)) / 58583
var_test = (mean_test * (1 - mean_test)) / 56350

# Generate a range of values across the
# distribution from +/- 3 sd around the mean
# (sd is the square root of the variance; 100 evenly spaced points each).
con_line = np.linspace(-3 * var_con**0.5 + mean_con,
                       3 * var_con**0.5 + mean_con, 100)
test_line = np.linspace(-3 * var_test**0.5 + mean_test,
                        3 * var_test**0.5 + mean_test, 100)
from scipy.stats import norm

# Plot the probabilities across the distribution of conversion rates:
# one Normal density curve per group, centred on the group's mean with
# standard deviation sqrt(variance), evaluated over that group's range.
plt.plot(con_line, norm.pdf(con_line, mean_con, var_con**0.5))
plt.plot(test_line, norm.pdf(test_line, mean_test, var_test**0.5))
plt.show()
# norm.pdf(): Converts values to probability densities from the Normal
# distribution.

# Difference in conversion rates between the groups, and the variance of
# that difference (sum of the two variances). Uses mean_con / var_con, the
# names the control-group values are assigned to.
# NOTE: `var` here is a number; it rebinds any earlier `var` variable.
lift = mean_test - mean_con
var = var_test + var_con

# Plot our difference in conversion rates
# as a distribution: +/- 3 sd around the lift, 100 evenly spaced points.
diff_line = np.linspace(-3 * var**0.5 + lift,
                        3 * var**0.5 + lift, 100)
plt.plot(diff_line, norm.pdf(diff_line, lift, var**0.5))
plt.show()
# Find values over our confidence interval: points from 0.007624 up to
# (but not including) 0.01445 in steps of 1/10000.
section = np.arange(0.007624, 0.01445, 1/10000)

# Fill in between those boundaries (area under the density curve of the
# difference, restricted to `section`).
plt.fill_between(section, norm.pdf(section, lift, var**0.5))

# Plot the difference with the confidence int.
plt.plot(diff_line, norm.pdf(diff_line, lift, var**0.5))
plt.show()
np.arange(): Generate points in an interval
plt.fill_between(): Fill in an interval
Customer Analytics and A/B Testing in Python