基本概念:
1、规则的支持度Support
S(A=>B)=P(A∩B)
2、规则的置信度Confidence
Conf(A=>B)=(S(A=>B))/(S(A))=P(B|A)
3、规则的提升度Lift
Lift(A=>B)=Conf(A=>B)/S(B) =(P(B|A))/P(B)
4、规则的部署能力
D=S(A)-S(A=>B)
一、超市购物篮关联规则分析
> library(arules)
> data("Groceries")
> summary(Groceries)
transactions as itemMatrix in sparse format with
9835 rows (elements/itemsets/transactions) and
169 columns (items) and a density of 0.02609146
most frequent items:
whole milk other vegetables rolls/buns soda yogurt (Other)
2513 1903 1809 1715 1372 34055
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46 29 14 14 9 11 4 6 1
26 27 28 29 32
1 1 1 3 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 3.000 4.409 6.000 32.000
includes extended item information - examples:
labels level2 level1
1 frankfurter sausage meat and sausage
2 sausage sausage meat and sausage
3 liver loaf sausage meat and sausage
> itemFrequencyPlot(Groceries,topN=15,horiz=T,main='支持度最高的15个项目',col='grey')
> apriori(Groceries) #函数默认参数:最小支持度support=0.1,最小置信度confidence=0.8,规则内最大项目数maxlen=10
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.8 0.1 1 none FALSE TRUE 5 0.1 1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 983
set item appearances …[0 item(s)] done [0.00s].
set transactions …[169 item(s), 9835 transaction(s)] done [0.00s].
sorting and recoding items … [8 item(s)] done [0.00s].
creating transaction tree … done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing … [0 rule(s)] done [0.00s].
creating S4 object … done [0.00s].
set of 0 rules
默认参数下没有结果,这里更改参数如下,
> g<-apriori(Groceries,parameter=list(support=0.001,confidence=0.5))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.5 0.1 1 none FALSE TRUE 5 0.001 1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 9
set item appearances …[0 item(s)] done [0.00s].
set transactions …[169 item(s), 9835 transaction(s)] done [0.00s].
sorting and recoding items … [157 item(s)] done [0.00s].
creating transaction tree … done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.02s].
writing … [5668 rule(s)] done [0.00s].
creating S4 object … done [0.00s].
> g
set of 5668 rules
>
> inspect(head(sort(g,by='lift'),10)) #根据lift排序显示前10条规则
lhs rhs support confidence
[1] {Instant food products,soda} => {hamburger meat} 0.001220132 0.6315789
[2] {soda,popcorn} => {salty snack} 0.001220132 0.6315789
[3] {flour,baking powder} => {sugar} 0.001016777 0.5555556
[4] {ham,processed cheese} => {white bread} 0.001931876 0.6333333
[5] {whole milk,Instant food products} => {hamburger meat} 0.001525165 0.5000000
[6] {other vegetables,curd,yogurt,whipped/sour cream} => {cream cheese } 0.001016777 0.5882353
[7] {processed cheese,domestic eggs} => {white bread} 0.001118454 0.5238095
[8] {tropical fruit,other vegetables,yogurt,white bread} => {butter} 0.001016777 0.6666667
[9] {hamburger meat,yogurt,whipped/sour cream} => {butter} 0.001016777 0.6250000
[10] {tropical fruit,other vegetables,whole milk,yogurt,domestic eggs} => {butter} 0.001016777 0.6250000
lift count
[1] 18.99565 12
[2] 16.69779 12
[3] 16.40807 10
[4] 15.04549 19
[5] 15.03823 15
[6] 14.83409 10
[7] 12.44364 11
[8] 12.03058 10
[9] 11.27867 10
[10] 11.27867 10
>
可视化
> library(arulesViz)
> plot(g,measure=c('support','confidence'),shading='order',control=list(main='支持度和置信度散点图')) #measure参数指定x、y坐标对应的度量;shading参数指定散点颜色深浅所依据的度量,可以设为order(规则长度)或者lift(规则提升度)
> plot(g,measure=c('support','confidence'),shading='lift',control=list(main='支持度和置信度散点图'))
考虑到置信度接近1的比较多,可以从中提取高于0.8的规则。
> g.sub<-subset(g,subset=confidence>0.8)
> g
set of 5668 rules
> plot(g.sub,measure=c('support','confidence'),shading='lift',control=list(main='支持度和置信度散点图'))
> plot(g.sub,method='matrix',measure=c("lift","confidence"))
继续可视化,用不同的method显示不同的效果,如下:
> plot(g.sub,method='paracoord')
> plot(g.sub,method='matrix3D',measure='lift',control=list(reorder=TRUE))
> plot(g.sub,method='grouped',control=list(k=10))
二、泰坦尼克号生存规则分析
读入数据
> data(Titanic)
> T.df<-expand.table(Titanic) #转化成数据框(expand.table来自epitools包,需先执行library(epitools))
> class(T.df)
[1] "data.frame"
> head(T.df)
Class Sex Age Survived
1 1st Male Child Yes
2 1st Male Child Yes
3 1st Male Child Yes
4 1st Male Child Yes
5 1st Male Child Yes
6 1st Male Adult No
> T.tr<-as(T.df,'transactions') #转化成transaction形式数据
> inspect(head(T.tr))
items transactionID
[1] {Class=1st,Sex=Male,Age=Child,Survived=Yes} 1
[2] {Class=1st,Sex=Male,Age=Child,Survived=Yes} 2
[3] {Class=1st,Sex=Male,Age=Child,Survived=Yes} 3
[4] {Class=1st,Sex=Male,Age=Child,Survived=Yes} 4
[5] {Class=1st,Sex=Male,Age=Child,Survived=Yes} 5
[6] {Class=1st,Sex=Male,Age=Adult,Survived=No} 6
> T.im<-as(T.tr,'itemMatrix') #或转化成矩阵形式
> inspect(head(T.im))
items
[1] {Class=1st,Sex=Male,Age=Child,Survived=Yes}
[2] {Class=1st,Sex=Male,Age=Child,Survived=Yes}
[3] {Class=1st,Sex=Male,Age=Child,Survived=Yes}
[4] {Class=1st,Sex=Male,Age=Child,Survived=Yes}
[5] {Class=1st,Sex=Male,Age=Child,Survived=Yes}
[6] {Class=1st,Sex=Male,Age=Adult,Survived=No}
> tr<-read.transactions(file='文件名',format='single',cols=c(TIDcol,ITEMcol),sep=',') #读入交易数据格式的csv文件
执行分析
> T.ar<-apriori(T.tr)
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.8 0.1 1 none FALSE TRUE 5 0.1 1 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 220
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[10 item(s), 2201 transaction(s)] done [0.00s].
sorting and recoding items ... [9 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [27 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
> inspect(head(T.ar))
lhs rhs support confidence lift count
[1] {} => {Age=Adult} 0.9504771 0.9504771 1.0000000 2092
[2] {Class=2nd} => {Age=Adult} 0.1185825 0.9157895 0.9635051 261
[3] {Class=1st} => {Age=Adult} 0.1449341 0.9815385 1.0326798 319
[4] {Sex=Female} => {Age=Adult} 0.1930940 0.9042553 0.9513700 425
[5] {Class=3rd} => {Age=Adult} 0.2848705 0.8881020 0.9343750 627
[6] {Survived=Yes} => {Age=Adult} 0.2971377 0.9198312 0.9677574 654
> inspect(subset(T.ar,subset=rhs %in% 'Survived=Yes' | rhs %in% 'Survived=No')) #我们只关心结论项(rhs)为Survived的规则,因此只提取这部分规则
lhs rhs support confidence lift count
[1] {Class=3rd,Sex=Male} => {Survived=No} 0.1917310 0.8274510 1.222295 422
[2] {Class=3rd,Sex=Male,Age=Adult} => {Survived=No} 0.1758292 0.8376623 1.237379 387
默认设置下得到的规则较少,因此重新设置参数,再次提取关联规则:
> T.ar2<-apriori(T.tr,parameter=list(support=0.005,confidence=0.8,minlen=2),appearance=list(rhs=c('Survived=Yes','Survived=No'),default='lhs'))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.8 0.1 1 none FALSE TRUE 5 0.005 2 10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 11
set item appearances ...[2 item(s)] done [0.00s].
set transactions ...[10 item(s), 2201 transaction(s)] done [0.00s].
sorting and recoding items ... [10 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [12 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
> T.ar2
set of 12 rules
> inspect(T.ar2)
lhs rhs support confidence lift count
[1] {Class=2nd,Age=Child} => {Survived=Yes} 0.010904134 1.0000000 3.095640 24
[2] {Class=2nd,Sex=Female} => {Survived=Yes} 0.042253521 0.8773585 2.715986 93
[3] {Class=2nd,Sex=Male} => {Survived=No} 0.069968196 0.8603352 1.270871 154
[4] {Class=1st,Sex=Female} => {Survived=Yes} 0.064061790 0.9724138 3.010243 141
[5] {Class=Crew,Sex=Female} => {Survived=Yes} 0.009086779 0.8695652 2.691861 20
[6] {Class=3rd,Sex=Male} => {Survived=No} 0.191731031 0.8274510 1.222295 422
[7] {Class=2nd,Sex=Female,Age=Child} => {Survived=Yes} 0.005906406 1.0000000 3.095640 13
[8] {Class=2nd,Sex=Female,Age=Adult} => {Survived=Yes} 0.036347115 0.8602151 2.662916 80
[9] {Class=2nd,Sex=Male,Age=Adult} => {Survived=No} 0.069968196 0.9166667 1.354083 154
[10] {Class=1st,Sex=Female,Age=Adult} => {Survived=Yes} 0.063607451 0.9722222 3.009650 140
[11] {Class=Crew,Sex=Female,Age=Adult} => {Survived=Yes} 0.009086779 0.8695652 2.691861 20
[12] {Class=3rd,Sex=Male,Age=Adult} => {Survived=No} 0.175829169 0.8376623 1.237379 387
> plot(T.ar2,method='grouped',control = list(k=6))
可视化
> plot(T.ar2,method='grouped',control = list(k=6))
> plot(T.ar2,method='paracoord')
将结论生成csv文件,
> write.csv(as(T.ar2,'data.frame'),'titanicar2.csv')
用excel打开如下所示: