Apriori算法Python3实现

“啤酒与尿布”的故事:在一家超市中,人们发现了一个特别有趣的现象,啤酒与尿布这两种风马牛不相及的商品居然摆在一起。但这一奇怪的举措竟然使尿布和啤酒的销量大幅增加了。原来妻子嘱咐丈夫给孩子买尿布时,丈夫在买完尿布后通常会买自己喜欢的啤酒。这个发现为商家带来了大量的利润,但是如何从浩如烟海却又杂乱无章的数据中,发现啤酒和尿布之间的联系呢?Apriori算法可以解决。

# -*- coding:utf-8 -*-

from numpy import *

#创建测试数据
def loadData():
    # return [['豆奶','莴苣'],
    #  ['莴苣','尿布','葡萄酒','甜菜'],
    #  ['豆奶','尿布','葡萄酒','橙汁'],
    #  ['莴苣','豆奶','尿布','葡萄酒'],
    #  ['莴苣','豆奶','尿布','橙汁']]
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

#创建大小为1的所有数据集的集合,即将所有的单项放在一个list中
def createC1(dataSet):
    C1 = []
    for data in dataSet:
        for i in data:
            if [i] not in C1:
                C1.append([i])
    return list(map(frozenset,C1))

# 计算单元素支持度,即看数据中包含此元素的数据占总数据的比例
def degreeSupport(dataSet,C1,minSupport=0.5):
    degreeSupportList = {}  #返回支持度字典
    dataList = [] #返回满足最小支持度的元素列表
    itemCount = {}   #C1中元素在dataSet中出现的次数
    for data in dataSet:
        for item in C1:
            if item.issubset(data):  #判断item是否包含在data中
                if not item in itemCount:
                    itemCount[item] = 0
                itemCount[item] += 1
    n = float(len(dataSet))
    for key in itemCount.keys():
        support = itemCount[key]/n   #计算支持度
        if support >= minSupport:
            dataList.append(key)
        degreeSupportList[key] = support
    return dataList,degreeSupportList

# 数据合并
# 如果数据长度为n,取前n-1个元素相同的两个数据合并组成新的数据,这样不必计算所有的组合,可以省略剪枝步骤
def merge(L,k):
    n = len(L)
    dataMergeList = []
    for i in range(n-1):
        for j in range(i+1,n):
            L1 = list(L[i])[:k]
            L2 = list(L[j])[:k]
            L1.sort()
            L2.sort()
            if L1 == L2:
                dataMergeList.append(L[i] | L[j])
    return dataMergeList

#
def apriori(dataSet,minSupport=0.5):
    dataSet = list(map(set,dataSet))    #数据集合中的数据去重[[1,1,2]] --> [[1,2]]
    C1 = createC1(dataSet)
    L1, degreeSupportList = degreeSupport(dataSet, C1, minSupport)
    L = [L1]
    k = 0
    while len(L[k]) > 1:  #创建包含更大项集的更大列表,直到下一个大的项集为1
        Lm = merge(L[k],k)
        Lk,support = degreeSupport(dataSet,Lm,minSupport)
        L.append(Lk)
        degreeSupportList.update(support)
        k += 1
    return L,degreeSupportList


def confCalculation(L,data,degreeSupportList,minConf,bigRulesList):
    n = len(data)
    for i in range(n-1):   #取n = len(data)的目的是取到L中数据长度为n-1即可,否则后面的长度大于或者等于data的长度,再计算没有意义了
        for d in L[i]:
            if d.issubset(data):
                conf = degreeSupportList[data]/degreeSupportList[data-d]
                if conf > minConf:
                    print(data-d,'-->',d,'conf:',conf)
                    bigRulesList.append((data-d,d,conf))

#创建关联规则
def genetateRules(L,degreeSupportList,minConf = 0.5):
    bigRulesList = []
    n = len(L)
    for i in range(1,n):
        for j in L[i]:
            confCalculation(L, j, degreeSupportList, minConf, bigRulesList)
    return bigRulesList

dataSet = loadData()
# print(dataSet)
# C1 = createC1(dataSet)
# print(C1)
# L1,degreeSupportList = degreeSupport(dataSet,C1,0.5)
# print(L1)
# print(degreeSupportList)
L,degreeSupportList = apriori(dataSet,0.5)
print(L)
print(degreeSupportList)
rulesList = genetateRules(L,degreeSupportList,0.7)
print(rulesList)



评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值