Market Basket Analysis in R
Christopher Bruffaerts
Statistician
Extracting frequent itemsets of minimum size 2
# Mine the frequent itemsets of Groceries: support of at least 0.01
# and at least 2 items per itemset
itemsets_freq2 <- apriori(
  data = Groceries,
  parameter = list(
    target = "frequent",
    minlen = 2,
    supp = 0.01
  )
)
Sorting and inspecting frequent itemsets
# Print the 6 itemsets with the highest support
# (descending order and n = 6 are the defaults, spelled out here)
inspect(
  head(
    sort(itemsets_freq2, by = "support", decreasing = TRUE),
    n = 6L
  )
)
items support count
[1] {other vegetables,whole milk} 0.07483477 736
[2] {whole milk,rolls/buns} 0.05663447 557
[3] {whole milk,yogurt} 0.05602440 551
[4] {root vegetables,whole milk} 0.04890696 481
[5] {root vegetables,other vegetables} 0.04738180 466
[6] {other vegetables,yogurt} 0.04341637 427
# Mine association rules from Groceries: support of at least 0.001,
# confidence of at least 0.5, and a minimum rule length of 2
rules <- apriori(
  data = Groceries,
  parameter = list(
    target = "rules",
    minlen = 2,
    supp = 0.001,
    conf = 0.5
  )
)

# Print the 6 rules with the highest confidence
# (descending order and n = 6 are the defaults, spelled out here)
inspect(
  head(
    sort(rules, by = "confidence", decreasing = TRUE),
    n = 6L
  )
)
lhs rhs support confidence lift count
[1] {rice,sugar} => {whole milk} 0.001220132 1 3.913649 12
[2] {canned fish,hygiene articles} => {whole milk} 0.001118454 1 3.913649 11
[3] {root vegetables,butter,rice} => {whole milk} 0.001016777 1 3.913649 10
[4] {root vegetables,whipped/sour cream,flour} => {whole milk} 0.001728521 1 3.913649 17
[5] {butter,soft cheese,domestic eggs} => {whole milk} 0.001016777 1 3.913649 10
[6] {citrus fruit,root vegetables,soft cheese} => {other vegetables} 0.001016777 1 5.168156 10
Looping over different confidence values
# Set of confidence levels
confidenceLevels <- seq(from = 0.1, to = 0.9, by = 0.1)

# Number of rules found by the Apriori algorithm at each confidence level,
# with a support level of 0.5%.
# vapply() builds the whole integer vector in one pass instead of growing it
# from NULL inside a `1:length()` loop, and it returns integer(0) (rather than
# iterating over c(1, 0)) if confidenceLevels is ever empty.
rules_sup0005 <- vapply(
  confidenceLevels,
  function(conf_level) {
    length(
      apriori(
        Groceries,
        parameter = list(
          supp = 0.005,
          conf = conf_level,
          target = "rules"
        )
      )
    )
  },
  integer(1)
)
library(ggplot2)
# Number of rules found with a support level of 0.5%
# qplot() is deprecated as of ggplot2 3.4.0, so the same point-and-line plot
# is built with ggplot() from a small data frame of the two vectors.
ggplot(
  data.frame(
    confidence_level = confidenceLevels,
    n_rules = rules_sup0005
  ),
  aes(x = confidence_level, y = n_rules)
) +
  geom_point() +
  geom_line() +
  labs(x = "Confidence level", y = "Number of rules found") +
  theme_bw()
# Subsetting rules: keep those with confidence above 0.95 that contain
# "soft cheese" or "whole milk", then print them
inspect(
  subset(
    rules,
    subset = confidence > 0.95 &
      items %in% c("soft cheese", "whole milk")
  )
)
lhs rhs support confidence lift count
[1] {rice,sugar} => {whole milk} 0.001220132 1 3.913649 12
[2] {canned fish,hygiene articles} => {whole milk} 0.001118454 1 3.913649 11
[3] {root vegetables,butter,rice} => {whole milk} 0.001016777 1 3.913649 10
... (further matching rules not shown)
Flexibility of subsetting
# %ain% keeps only rules containing all of the listed items
inspect(
  subset(
    rules,
    subset = confidence > 0.95 &
      items %ain% c("soft cheese", "whole milk")
  )
)

# Rules with "whole milk" on the right-hand side, lift above 3
# and confidence above 0.95
inspect(
  subset(
    rules,
    subset = rhs %in% "whole milk" &
      confidence > 0.95 &
      lift > 3
  )
)
Market Basket Analysis in R