# Level 2: Implement the Apriori algorithm by hand (第2关:动手实现Apriori算法)
def createC1(dataset):
    """Build the candidate 1-itemsets from a collection of transactions.

    :param dataset: iterable of transactions, each an iterable of hashable items
    :return: set containing one single-item ``frozenset`` per distinct item
    """
    # frozenset rather than set: each candidate must be hashable to be
    # stored inside the enclosing set.
    return {frozenset([item]) for transaction in dataset for item in transaction}
def scanD(D, ck, minsupport):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    :param D: sized collection of transactions; ``len(D)`` is the support
        denominator
    :param ck: iterable of candidate itemsets (objects with ``issubset``)
    :param minsupport: minimum support, as a fraction of ``len(D)``
    :return: ``(frequent, supportData)`` where ``frequent`` lists the
        candidates whose support is at least ``minsupport`` (most recently
        discovered first) and ``supportData`` maps each candidate that
        occurred at least once to its support
    """
    occurrences = {}
    for transaction in D:
        for candidate in ck:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = len(D)
    supportData = {}
    frequent = []
    for candidate, count in occurrences.items():
        support = count / total
        # Support is recorded for every counted candidate, frequent or not.
        supportData[candidate] = support
        if support >= minsupport:
            frequent.append(candidate)
    # Reversing once yields the same order as inserting each hit at index 0.
    frequent.reverse()
    return frequent, supportData
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in sorted
    order, are identical. Comparing only that prefix means each candidate is
    produced from exactly one pair, so the result contains no duplicates.

    :param Lk: list of frozensets, each holding ``k - 1`` items; items must be
        mutually orderable once ``k - 2`` reaches 2 or more
    :param k: size of the itemsets to generate
    :return: list of candidate frozensets of size ``k``
    """
    # Sort before slicing: frozenset iteration order is arbitrary, so the raw
    # order cannot be used to compare prefixes. Computed once per itemset
    # instead of once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    size = len(Lk)
    for i, first in enumerate(Lk):
        for j in range(i + 1, size):
            if prefixes[i] == prefixes[j]:
                retList.append(first | Lk[j])
    return retList
def apriori(dataSet, minSupport):
    """Find all frequent itemsets of a dataset, level by level.

    :param dataSet: re-iterable collection of transactions (it is traversed
        twice: once to collect items, once to build the transaction sets)
    :param minSupport: minimum support, as a fraction of the transactions
    :return: ``(L, supportData)`` where ``L[i]`` lists the frequent itemsets
        of size ``i + 1`` (the last entry is always an empty list) and
        ``supportData`` accumulates the support values reported by ``scanD``
    """
    candidates = createC1(dataSet)
    transactions = [set(transaction) for transaction in dataSet]
    frequent, supportData = scanD(transactions, candidates, minSupport)

    levels = [frequent]
    size = 2
    # Keep growing until a level yields no frequent itemsets.
    while levels[-1]:
        candidates = aprioriGen(levels[-1], size)
        frequent, level_support = scanD(transactions, candidates, minSupport)
        supportData.update(level_support)
        levels.append(frequent)
        size += 1
    return levels, supportData
# Level 3: Mine association rules from the frequent itemsets (第3关:从频繁项集中挖掘关联规则)
from utils import apriori, aprioriGen
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate candidate consequents of one frequent itemset.

    For each consequent ``c`` in ``H`` the rule ``(freqSet - c) -> c`` is
    scored with confidence ``support(freqSet) / support(freqSet - c)``.

    :param freqSet: the frequent itemset (frozenset) the rules are drawn from
    :param H: iterable of candidate consequents (frozensets)
    :param supportData: mapping from itemset to support
    :param brl: list that receives ``(antecedent, consequent, confidence)``
        tuples for every accepted rule; modified in place
    :param minConf: minimum confidence a rule needs to be accepted
    :return: list of the consequents whose rule was accepted
    """
    accepted = []
    for consequent in H:
        antecedent = freqSet - consequent
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence < minConf:
            continue
        brl.append((antecedent, consequent, confidence))
        accepted.append(consequent)
    return accepted
def ruleFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively grow rule consequents for one frequent itemset.

    The consequents in ``H`` (all of size ``m``) are joined into consequents
    of size ``m + 1``; those are scored with ``calcConf`` and, while more than
    one survives, the survivors are grown again.

    :param freqSet: the frequent itemset (frozenset) the rules are drawn from
    :param H: list of consequents (frozensets) of equal size
    :param supportData: mapping from itemset to support
    :param brl: list collecting the accepted rules; modified in place
    :param minConf: minimum confidence a rule needs to be accepted
    :return: None
    """
    # Nothing to grow from; also avoids indexing into an empty list below.
    if not H:
        return
    m = len(H[0])
    # A consequent of size m + 1 must still leave a non-empty antecedent.
    if len(freqSet) > m + 1:
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        # At least two consequents are needed to join into a larger one.
        if len(Hmp1) > 1:
            ruleFromConseq(freqSet, Hmp1, supportData, brl, minConf)
def generateRules(dataset, minsupport, minConf):
    """Generate association rules for a dataset.

    The frequent itemsets and their supports are obtained from ``apriori``.

    :param dataset: the dataset, a list
    :param minsupport: minimum support, a float
    :param minConf: minimum confidence, a float
    :return: list of association rules
    """
    rules = []
    levels, supportData = apriori(dataset, minsupport)
    # Level 0 holds single-item sets, which cannot be split into a rule.
    for level, itemsets in enumerate(levels[1:], start=1):
        for freqSet in itemsets:
            consequents = [frozenset([item]) for item in freqSet]
            if level == 1:
                # Two-item sets: score the single-item consequents directly.
                calcConf(freqSet, consequents, supportData, rules, minConf)
            else:
                # Larger sets are handed to the recursive consequent builder.
                ruleFromConseq(freqSet, consequents, supportData, rules, minConf)
    return rules
# Level 4: Association-rule analysis of supermarket shopping lists (第4关:超市购物清单关联规则分析)
from utils import generateRules
import pandas as pd
def T(x):
    """Translate a product name into its integer code.

    :param x: product name exactly as it appears in the table below
    :return: the product's code, an int from 1 to 38
    :raises KeyError: if ``x`` is not a known product name
    """
    # Codes are assigned by position, starting at 1.
    goods = (
        'yogurt', 'pork', 'sandwich bags', 'lunch meat', 'all- purpose',
        'flour', 'soda', 'butter', 'vegetables', 'beef', 'aluminum foil',
        'dinner rolls', 'shampoo', 'mixes', 'soap', 'laundry detergent',
        'ice cream', 'toilet paper', 'hand soap', 'waffles', 'cheeses',
        'milk', 'dishwashing liquid/detergent', 'individual meals',
        'cereals', 'tortillas', 'spaghetti sauce', 'ketchup',
        'sandwich loaves', 'poultry', 'bagels', 'eggs', 'juice', 'pasta',
        'paper towels', 'coffee/tea', 'fruits', 'sugar',
    )
    codes = {name: code for code, name in enumerate(goods, start=1)}
    return codes[x]
def aprior_data(data):
    """Group purchased goods into one basket per transaction id.

    :param data: table with an ``'id'`` column and a ``'good'`` column of
        equal length (e.g. a pandas DataFrame)
    :return: list of baskets, one list of goods per distinct id, ordered by
        the first appearance of each id; goods keep their row order
    """
    # Rows are paired by position in a single pass, so the result does not
    # depend on the table's index labels and the cost stays linear in the
    # number of rows.
    baskets = {}
    for transaction_id, good in zip(data['id'], data['good']):
        baskets.setdefault(transaction_id, []).append(good)
    return list(baskets.values())
def genRules(data_path, min_support, min_conf):
    """Read a purchase CSV and mine association rules from it.

    :param data_path: path of a CSV file with ``'id'`` and ``'good'`` columns
    :param min_support: minimum support passed on to ``generateRules``
    :param min_conf: minimum confidence passed on to ``generateRules``
    :return: the rule list produced by ``generateRules``
    """
    purchases = pd.read_csv(data_path)
    # Replace product names by their integer codes before grouping.
    purchases['good'] = purchases['good'].apply(T)
    return generateRules(aprior_data(purchases), min_support, min_conf)