用Apriori或者FP-growth算法挖掘出所有的频繁项集,并写出具体的实现代码。假设事务数据库D如表1所示,最小支持度计数为2。下面以Apriori算法为例。
表1 事务数据库D
Tid | Items |
10 | A,C,D |
20 | B,C,E |
30 | A,B,C,E |
40 | B,E |
一、实验代码
# Load the sample data set.
def loadDataSet():
    """Return the sample transaction database D as a list of item lists.

    Each inner list is one transaction (Tids 10, 20, 30 and 40 of Table 1).
    A new list is built on every call.
    """
    return [
        ['A', 'C', 'D'],
        ['B', 'C', 'E'],
        ['A', 'B', 'C', 'E'],
        ['B', 'E'],
    ]
# Build C1: the candidate 1-itemsets, one per distinct item in the data set.
def createC1(dataSet):
    """Return the candidate 1-itemsets of ``dataSet``.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list of single-element frozensets, one per distinct item, in
        ascending item order. frozensets are used so the itemsets can serve
        as dictionary keys.
    """
    # A set de-duplicates in constant time per item, instead of scanning a
    # growing list for every item of every transaction.
    distinct = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(distinct)]
# Derive Lk from Ck: scan the data set, count how many transactions contain
# each candidate, and keep the candidates whose support meets the threshold.
def scanD(D, Ck, minSupport):
    """Filter the candidate itemsets ``Ck`` by support over transactions ``D``.

    Args:
        D: sequence of transactions; each must be accepted by
            ``frozenset.issubset`` (e.g. a set of items).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum support as a fraction of ``len(D)``.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` holds the candidates
        whose support fraction is at least ``minSupport``. ``supportData``
        maps each of those itemsets to its support count (the number of
        transactions containing it), stored as a float.
    """
    counts = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                counts[can] = counts.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in counts.items():
        if count / numItems >= minSupport:
            retList.insert(0, key)
            # Store the real count rather than ``support * 4``: the constant 4
            # only equals the number of transactions for the sample data set.
            supportData[key] = float(count)
    return retList, supportData
# Generate the next level of candidates from the itemsets in Lk.
def aprioriGen(Lk, k):
    """Join the itemsets in ``Lk`` pairwise into candidate ``k``-itemsets.

    Args:
        Lk: list of itemsets (frozensets) to combine.
        k: size of the candidates to produce.

    Returns:
        A list of the distinct unions ``a | b`` of two itemsets from ``Lk``
        that contain exactly ``k`` items, in order of first appearance.
        No subset-based pruning is done here; candidates are only filtered
        by size.
    """
    # Dict keys de-duplicate while keeping insertion order, so the result no
    # longer depends on set iteration order (which varies between runs for
    # string items because of hash randomization).
    candidates = {}
    for i, first in enumerate(Lk):
        for second in Lk[i + 1:]:
            merged = first | second
            if len(merged) == k:
                candidates[merged] = None
    return list(candidates)
# Main routine of the Apriori algorithm.
def apriori(dataSet, minSupport):
    """Run the level-wise Apriori search over ``dataSet``.

    Args:
        dataSet: list of transactions, each a list of items.
        minSupport: minimum support threshold, passed through to ``scanD``.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` is the list of itemsets of
        size ``i + 1`` that ``scanD`` kept; the last entry of ``L`` is always
        an empty list, which is what ends the search. ``supportData`` merges
        the support dictionaries returned by ``scanD`` for every level.
    """
    transactions = [set(row) for row in dataSet]
    frequent, supportData = scanD(transactions, createC1(dataSet), minSupport)
    L = [frequent]
    k = 2
    # L[-1] is always the level built last, i.e. the itemsets of size k - 1.
    while L[-1]:
        candidates = aprioriGen(L[-1], k)
        frequent, levelSupport = scanD(transactions, candidates, minSupport)
        supportData.update(levelSupport)
        L.append(frequent)
        k += 1
    return L, supportData
if __name__ == "__main__":
    dataSet = loadDataSet()
    # A fraction of 0.5 over the 4 sample transactions corresponds to the
    # required minimum support count of 2.
    L, suppData = apriori(dataSet, minSupport=0.5)
    print("频繁项集:")
    # Print the itemsets held in L (the ones that passed the support
    # threshold), level by level. The values in suppData are not on the 0-1
    # scale of minSupport, so comparing them with 0.5 would not filter
    # anything.
    for level in L:
        for itemset in level:
            print(itemset, suppData[itemset])
二、实验结果