Machine Learning for Marketing Analytics in R
Verena Pflieger
Data Scientist at INWT Statistics
# Variances of all variables before any data preparation.
# The printed values differ by many orders of magnitude, so the variables
# are not yet on a comparable scale.
lapply(dataCustomers, var)
$nOrders $salesOrdered
[1] 264.3989 [1] 202384132
$nItemsOrdered $salesSold
[1] 506.5496 [1] 9345112
$nItemsSold $returnRatio
[1] 56.35125 [1] 0.0836261
...
# Standardize every variable (mean 0, variance 1); scale() returns a matrix,
# so convert the result back to a data frame
dataCustomers <- dataCustomers %>% scale() %>% as.data.frame()
# Check variances of all variables
lapply(dataCustomers, var)
$nOrders $salesOrdered
[1] 1 [1] 1
$nItemsOrdered $salesSold
[1] 1 [1] 1
$nItemsSold $returnRatio
[1] 1 [1] 1
...
# Run the PCA on the already standardized data. No scaling is requested from
# prcomp() here (the result shows scale = FALSE) because every variable was
# given unit variance by scale() beforehand.
pcaCust <- prcomp(dataCustomers)
# Inspect the structure of the result: sdev, rotation, center, scale, x
str(pcaCust, give.attr = FALSE)
List of 5
$ sdev : num [1:16] 2.1 1.84 1.3 1.2 1.12 ...
$ rotation: num [1:16, 1:16] -0.439 -0.44 -0.33 -0.384 -0.352 ...
$ center : Named num [1:16] -4.66e-17 1.90e-17 -1.24e-18 ...
$ scale : logi FALSE
$ x : num [1:989, 1:16] -11.06 -1.67 0.53 -3.39 -3.81 ...
# Standard deviation of each principal component, rounded to two decimals
round(pcaCust$sdev, 2)
[1] 2.10 1.84 1.30 1.20 1.12 1.07 0.80 0.78 0.72 0.61 0.48 0.37 0.26
[14] 0.21 0.17 0.13
# Variances of the components (eigenvalues) = squared standard deviations
round(pcaCust$sdev^2, 2)
[1] 4.39 3.38 1.68 1.45 1.26 1.15 0.65 0.61 0.52 0.38 0.23 0.14 0.07
[14] 0.04 0.03 0.02
# Proportion of explained variance: each eigenvalue divided by the total
# variance. Dividing by sum(sdev^2) instead of the number of components gives
# the same result for standardized data and stays correct for unscaled data.
(pcaCust$sdev^2 / sum(pcaCust$sdev^2)) %>% round(2)
[1] 0.27 0.21 0.10 0.09 0.08 0.07 0.04 0.04 0.03 0.02 0.01 0.01 0.00
[14] 0.00 0.00 0.00
# Loadings: the weights (eigenvector entries) with which each original
# variable enters a component, shown for the first six components.
# NOTE: these are not correlations as such; for standardized variables the
# correlation with a component is the loading times that component's sdev.
round(pcaCust$rotation[, 1:6], 2)
# Value on 1st component for 1st customer: the weighted sum of the customer's
# variable values, using the loadings of the 1st component as weights.
# No further centering is needed because the data were centered by scale().
sum(dataCustomers[1,] * pcaCust$rotation[,1])
-11.05858
# Component values (scores) of the first five customers on the first six
# components; entry [1, 1] matches the manually computed value above
pcaCust$x[1:5, 1:6]
PC1 PC2 PC3 PC4 PC5 PC6
[1,] -11.0585802 3.5750683 -4.1371495 0.28864769 -0.1045802 0.698612248
[2,] -1.6734771 -1.6630208 0.9498452 0.14091195 -1.2760898 -0.006310673
[3,] 0.5303018 -0.4672193 -0.1918865 1.77466781 0.4623840 -0.037466682
[4,] -3.3903118 -0.1274839 4.2217216 0.03710948 -0.1840454 0.164680941
[5,] -3.8069613 5.3971530 -1.2241316 -0.38341585 0.9721412 -2.142731490
Machine Learning for Marketing Analytics in R