Mining association rules

Market Basket Analysis in R

Christopher Bruffaerts

Statistician

Frequent itemsets with the apriori algorithm

Extracting frequent itemsets of min size 2

# Mine frequent itemsets (not rules) from Groceries: each itemset must hold
# at least 2 items and reach a minimum support of 1%
itemsets_freq2 <- apriori(
  data = Groceries,
  parameter = list(
    supp = 0.01,
    minlen = 2,
    target = "frequent"
  )
)

Sorting and inspecting frequent itemsets

# Print the six itemsets with the highest support
inspect(
  head(
    sort(itemsets_freq2, by = "support", decreasing = TRUE),
    n = 6L
  )
)
   items                              support    count
[1] {other vegetables,whole milk}      0.07483477 736  
[2] {whole milk,rolls/buns}            0.05663447 557  
[3] {whole milk,yogurt}                0.05602440 551  
[4] {root vegetables,whole milk}       0.04890696 481  
[5] {root vegetables,other vegetables} 0.04738180 466  
[6] {other vegetables,yogurt}          0.04341637 427
Market Basket Analysis in R

Rules with the apriori algorithm

# Mine association rules from Groceries with at least 2 items per rule,
# a minimum support of 0.1% and a minimum confidence of 50%
rules <- apriori(
  data = Groceries,
  parameter = list(
    supp = 0.001,
    conf = 0.5,
    minlen = 2,
    target = "rules"
  )
)

# Print the six rules with the highest confidence
inspect(
  head(
    sort(rules, by = "confidence", decreasing = TRUE),
    n = 6L
  )
)
   lhs                                           rhs                support     confidence lift     count
[1] {rice,sugar}                               => {whole milk}       0.001220132 1          3.913649 12   
[2] {canned fish,hygiene articles}             => {whole milk}       0.001118454 1          3.913649 11   
[3] {root vegetables,butter,rice}              => {whole milk}       0.001016777 1          3.913649 10   
[4] {root vegetables,whipped/sour cream,flour} => {whole milk}       0.001728521 1          3.913649 17   
[5] {butter,soft cheese,domestic eggs}         => {whole milk}       0.001016777 1          3.913649 10   
[6] {citrus fruit,root vegetables,soft cheese} => {other vegetables} 0.001016777 1          5.168156 10
Market Basket Analysis in R

Choosing parameters in arules

Looping over different confidence values

# Candidate minimum-confidence thresholds: 0.1, 0.2, ..., 0.9
confidenceLevels <- seq(from = 0.1, to = 0.9, by = 0.1)

# Preallocate one rule count per confidence level instead of growing a
# vector that starts out as NULL
rules_sup0005 <- integer(length(confidenceLevels))

# Run apriori at a fixed minimum support of 0.5% for each confidence level
# and record how many rules it returns. seq_along() keeps the loop from
# running when confidenceLevels is empty (1:length(x) would yield c(1, 0)).
# NOTE(review): minlen is left at its default here, unlike the earlier calls
# that set minlen = 2 -- confirm that this is intended.
for (i in seq_along(confidenceLevels)) {
  rules_sup0005[i] <- length(
    apriori(
      Groceries,
      parameter = list(
        supp = 0.005,
        conf = confidenceLevels[i],
        target = "rules"
      )
    )
  )
}

library(ggplot2)

# Number of rules found with a support level of 0.5%, by confidence level.
# Built with ggplot() because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(
  data.frame(confidence = confidenceLevels, n_rules = rules_sup0005),
  aes(x = confidence, y = n_rules)
) +
  geom_point() +
  geom_line() +
  labs(x = "Confidence level", y = "Number of rules found") +
  theme_bw()

[Figure: nb_rules_groceries — number of rules found at each confidence level]

Market Basket Analysis in R

Subsetting rules

# Keep only the rules that involve "soft cheese" or "whole milk" (at least
# one of the two) and whose confidence exceeds 0.95, then print them
inspect(
  subset(
    rules,
    subset = items %in% c("soft cheese", "whole milk") & confidence > 0.95
  )
)
     lhs                                                             rhs                support     confidence lift     count
[1]  {rice,sugar}                                                 => {whole milk}       0.001220132 1          3.913649 12   
[2]  {canned fish,hygiene articles}                               => {whole milk}       0.001118454 1          3.913649 11   
[3]  {root vegetables,butter,rice}                                => {whole milk}       0.001016777 1          3.913649 10

Flexibility of subsetting

# %ain% requires ALL listed items to appear in the rule: keep rules that
# contain both "soft cheese" and "whole milk" with confidence above 0.95
inspect(
  subset(
    rules,
    subset = items %ain% c("soft cheese", "whole milk") & confidence > 0.95
  )
)

# Filter on the consequent and on several quality measures at once: rules
# that predict "whole milk" with lift above 3 and confidence above 0.95
inspect(
  subset(
    rules,
    subset = rhs %in% "whole milk" & lift > 3 & confidence > 0.95
  )
)
Market Basket Analysis in R

Let's mine the movie dataset!

Market Basket Analysis in R

Preparing Video For Download...