关联规则算法学习—Apriori
一、实验项目:关联规则算法学习
项目性质:设计型
二、实验目的:理解并掌握关联规则经典算法Apriori算法,理解算法的原理,能够实现算法,并对给定的数据集进行关联规则挖掘。
三、实验内容:
1、实现Apriori算法,验证算法的正确性,并将算法应用于给定的数据集Groceries,根据设定的支持度和置信度,挖掘出符合条件的频繁项集及关联规则。
2、挑选几个有代表性的频繁项集和关联规则,记录下来。
3、调节支持度和置信度阈值,重新执行算法,比较结果的不同。
# coding=utf-8
import pandas as pd
import numpy as np
def getDataSet():
    """Load the transactions stored in ``Groceries.csv``.

    The file is read with pandas from the current working directory. For
    every row, the value in the second column is converted to text and split
    on single spaces, producing one list of item strings per row.

    Returns:
        A ``(data, columns)`` tuple: ``data`` is the list of per-row item
        lists and ``columns`` is the list of CSV header names.
    """
    frame = pd.read_csv('Groceries.csv', encoding='UTF-8')
    header = np.array(frame.columns).tolist()
    rows = np.array(frame).tolist()
    # Only column 1 is kept; splitting on a single space turns the item-set
    # text into individual items.
    transactions = [str(row[1]).split(' ') for row in rows]
    return transactions, header
def createItems(dataSet):
    """Build the candidate 1-itemsets from the transactions.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A one-shot ``map`` iterator yielding one single-element ``frozenset``
        per distinct item, in ascending item order. Wrap it in ``list`` if it
        has to be traversed more than once.
    """
    # A set de-duplicates in O(1) per item, whereas a list-membership test
    # would rescan every known item for each occurrence.
    unique_items = {item for transaction in dataSet for item in transaction}
    # Each item is wrapped in a list so that frozenset() receives a
    # one-element iterable instead of iterating over the item itself
    # (a string would otherwise be split into characters).
    return map(frozenset, ([item] for item in sorted(unique_items)))
def createSupportItem(D, Items, MinSupport):
    """Count candidate occurrences and keep those reaching ``MinSupport``.

    Args:
        D: Iterable of transactions; each one is handed to ``issubset``.
        Items: Iterable of candidate itemsets (hashable, offering
            ``issubset``).
        MinSupport: Minimum fraction of transactions in which a candidate
            must occur to be kept.

    Returns:
        ``(supportItems, supportData)``: the list of kept candidates, ordered
        from the most recently first-seen to the earliest, and a dict mapping
        each kept candidate to its support.
    """
    transactions = list(D)
    candidates = list(Items)
    total = float(len(transactions))

    # Tally, per candidate, the number of transactions that contain it.
    counts = {}
    for transaction in transactions:
        for candidate in candidates:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    supportData = {}
    for candidate, hits in counts.items():
        ratio = hits / total  # support of this candidate
        if ratio >= MinSupport:
            supportData[candidate] = ratio

    # Kept candidates are reported in reverse order of first occurrence.
    supportItems = list(supportData)[::-1]
    return supportItems, supportData
def AprioriConf(Lk, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined (unioned) when their first ``k - 2`` items, taken
    in sorted order, are identical.

    Args:
        Lk: Sequence of the previous level's frequent itemsets
            (``frozenset`` objects of mutually orderable items).
        k: Size of the itemsets to create.

    Returns:
        List with the union of every joinable pair ``(Lk[i], Lk[j])``,
        ``i < j``, in pair order.
    """
    # Sort *before* slicing: set iteration order is arbitrary, so a prefix
    # taken from the unsorted items can compare unrelated elements and then
    # miss valid candidates or emit the same candidate more than once.
    # Computing each prefix once also avoids re-sorting it for every pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    for i, first in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                retList.append(first | Lk[j])
    return retList
def Apriori(dataSet, minSupport):
    """Find the frequent itemsets of every size in ``dataSet``.

    Args:
        dataSet: Re-iterable collection of transactions (it is traversed more
            than once), each an iterable of items.
        minSupport: Minimum support passed on to ``createSupportItem``.

    Returns:
        ``(L, supportData)``: ``L[i]`` is the list of frequent itemsets found
        at level ``i`` (starting with single items); the final entry is always
        an empty list, which is what ends the search. ``supportData`` merges
        the support dicts returned for every level.
    """
    Items = createItems(dataSet)
    # Build the transaction sets once; they are identical for every level,
    # so there is no need to convert the data again on each iteration.
    transactions = [set(transaction) for transaction in dataSet]

    L1, supportData = createSupportItem(transactions, Items, minSupport)
    L = [L1]
    k = 2
    # Keep growing the itemset size until a level yields nothing frequent.
    while L[k - 2]:
        Ck = AprioriConf(L[k - 2], k)
        Lk, Supk = createSupportItem(transactions, Ck, MinSupport=minSupport)
        supportData.update(Supk)
        L.append(Lk)
        k += 1
    return L, supportData
def main():
    """Run Apriori on the loaded data set with a minimum support of 0.5.

    Prints every level of frequent itemsets, followed by the support recorded
    for each itemset.
    """
    # Only the item lists are used here; the returned column names are not.
    transactions, _header = getDataSet()
    frequent_levels, support_by_itemset = Apriori(transactions, 0.5)

    print('所有频繁项集L:')
    for level in frequent_levels:
        print(level)

    print('对应支持度Support:')
    for itemset, support in support_by_itemset.items():
        print('项目集:', itemset, '的支持度为:', support)


# Run the demo only when the file is executed as a script.
if __name__ == '__main__':
    main()
运行结果: