简单优化的Apriori算法进行关联规则分析

实验时发现算法的主要耗时来自 C1 -> L1 的构建,因此对 L1 的生成过程进行了优化,在此简单记录。

def loadDataSet(filename):
    """Load transactions from a text file whose lines embed items in braces.

    For every line, the text after the first ``{`` is taken, its last three
    characters are dropped, and the remainder is split on commas.
    NOTE(review): the three dropped characters are presumably the closing
    brace, a quote and the newline -- confirm against the actual data file.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of transactions, each a list of item strings.

    Raises:
        IndexError: If a line contains no ``{``.
    """
    # The context manager guarantees the handle is closed, even when a
    # malformed line raises while parsing.
    with open(filename, 'r') as f:
        return [line.split('{')[1][:-3].split(',') for line in f]

def createL1(dataSet,minsupport = 0.5):
    """Build the frequent 1-itemsets and the support of every single item.

    Items are counted in one pass over the transactions, which avoids
    testing each candidate against each transaction.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable items.
            ``len(dataSet)`` is used as the number of transactions.
        minsupport: Minimum fraction of transactions an item must appear in.

    Returns:
        A tuple ``(L1, supportData)``: ``L1`` is a list of one-element
        frozensets for the items meeting the threshold, in first-seen order;
        ``supportData`` maps every one-element frozenset to its support.
    """
    occurrences = {}
    for transaction in dataSet:
        for item in transaction:
            if item in occurrences:
                occurrences[item] += 1
            else:
                occurrences[item] = 1

    total = float(len(dataSet))
    print('样本数量为'+str(total))
    # Compare raw counts against an absolute threshold instead of dividing
    # every count before the comparison.
    threshold = total * minsupport

    frequent = []
    supportData = {}
    for item, count in occurrences.items():
        single = frozenset([item])
        supportData[single] = float(count) / total
        if count >= threshold:
            frequent.append(single)

    return frequent, supportData

def scanD(D,Ck,minsupport):
    """Count candidate itemsets in the transactions and keep the frequent ones.

    Args:
        D: Sequence of transactions (sets); ``len(D)`` is the denominator
            of every support value.
        Ck: Iterable of candidate itemsets (hashable sets, e.g. frozensets).
        minsupport: Minimum support a candidate needs to be kept.

    Returns:
        A tuple ``(retList, supportData)``: ``retList`` holds the candidates
        whose support is at least ``minsupport``, in reverse order of first
        occurrence; ``supportData`` maps every candidate that occurred in at
        least one transaction to its support.
    """
    hits = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                hits[can] = hits.get(can, 0) + 1
    print('统计中')

    numItems = float(len(D))
    supportData = {can: count / numItems for can, count in hits.items()}

    # Later-seen candidates come first in the returned list.
    retList = [can for can, support in supportData.items() if support >= minsupport]
    retList.reverse()

    return retList, supportData

def aprioriGen(Lk,k):
    """Generate candidate k-itemsets by joining (k-1)-itemsets.

    Two itemsets are joined when the first ``k-2`` items of their *sorted*
    contents are equal. The prefix is taken after sorting: sets are
    unordered, so slicing before sorting would compare arbitrary elements
    and could miss valid candidates.

    Args:
        Lk: List of itemsets (frozensets) that all have ``k-1`` items. The
            items must be mutually orderable.
        k: Size of the candidate itemsets to produce.

    Returns:
        A list with the union of every pair of itemsets in ``Lk`` sharing
        the same sorted ``k-2`` prefix.
    """
    prefixLen = k - 2
    # Compute each prefix once rather than once per pair.
    prefixes = [sorted(itemset)[:prefixLen] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList

def apriori(dataSet,minSupport):
    """Find all frequent itemsets of the data set, level by level.

    The first level comes from ``createL1``; every following level is
    produced by joining the previous one with ``aprioriGen`` and filtering
    the candidates with ``scanD`` until a level comes back empty.

    Args:
        dataSet: Collection of transactions, each an iterable of items.
        minSupport: Minimum support an itemset needs to count as frequent.

    Returns:
        A tuple ``(L, supportData)``: ``L[i]`` is the list of frequent
        itemsets found at level ``i + 1`` (the last entry is the empty level
        that ended the search); ``supportData`` merges the support
        dictionaries returned for each level.
    """
    D = [set(transaction) for transaction in dataSet]
    current, supportData = createL1(dataSet, minSupport)
    print('L1,SupportData1 created')

    L = [current]
    k = 2
    while len(current) > 0:
        candidates = aprioriGen(current, k)
        current, levelSupport = scanD(D, candidates, minSupport)
        supportData.update(levelSupport)
        L.append(current)
        k += 1

    return L, supportData

def generateRules(L,supportData,minConf = 0.7):
    """Derive association rules from the frequent itemsets.

    The first level of ``L`` is skipped. Itemsets of the second level are
    passed straight to ``calcConf``; itemsets of deeper levels go through
    ``rulesFromConseq``.

    Args:
        L: List of levels, each a list of frequent itemsets (frozensets).
        supportData: Mapping from itemset to support.
        minConf: Minimum confidence, forwarded to the helpers.

    Returns:
        The list that ``calcConf`` and ``rulesFromConseq`` append rules to.
    """
    bigRuleList = []
    for level, itemsets in enumerate(L):
        if level == 0:
            continue
        for freqSet in itemsets:
            singles = [frozenset([item]) for item in freqSet]
            if level == 1:
                calcConf(freqSet, singles, supportData, bigRuleList, minConf)
            else:
                rulesFromConseq(freqSet, singles, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet,H,supportData,brl,minconf = 0.7):
    """Evaluate the rules ``freqSet - conseq ---> conseq`` for each consequent.

    The confidence of a rule is the support of ``freqSet`` divided by the
    support of its antecedent. Rules meeting ``minconf`` are printed and
    appended to ``brl``.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: Iterable of candidate consequents (subsets of ``freqSet``).
        supportData: Mapping from itemset to support; must contain
            ``freqSet`` and every antecedent.
        brl: List that receives ``(antecedent, consequent, confidence)``
            tuples; mutated in place.
        minconf: Minimum confidence a rule needs to be kept.

    Returns:
        The consequents whose rule met ``minconf``, in input order.

    Raises:
        KeyError: If a required itemset is missing from ``supportData``.
    """
    kept = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minconf:
            print(str(antecedent)+'--->'+str(conseq)+'   可信度为:'+str(conf))
            brl.append((antecedent, conseq, conf))
            kept.append(conseq)

    return kept

def rulesFromConseq(freqSet,H,supportData,brl,minConf = 0.7):
    """Recursively evaluate rules with progressively larger consequents.

    The consequents in ``H`` (all of the same size ``m``) are scored with
    ``calcConf``; the surviving ones are joined by ``aprioriGen`` into
    consequents of size ``m + 1`` and the function recurses on those.

    Because confidence is evaluated on ``H`` itself, the recursion may
    continue as long as a consequent leaves at least one item for the
    antecedent (``len(freqSet) > m``). A stricter guard would silently skip
    the largest valid consequents, e.g. ``{a} ---> {b, c}`` for a 3-itemset.
    Likewise a single merged candidate is still evaluated rather than
    dropped.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: List of candidate consequents, all of the same size.
        supportData: Mapping from itemset to support.
        brl: List that receives the accepted rules; mutated in place.
        minConf: Minimum confidence, forwarded to ``calcConf``.

    Returns:
        None. Accepted rules are appended to ``brl``.
    """
    if not H:
        return
    m = len(H[0])
    # Stop once a consequent would consume the whole itemset: the antecedent
    # would be empty and has no support entry.
    if len(freqSet) > m:
        prunedH = calcConf(freqSet, H, supportData, brl, minConf)
        Hmp1 = aprioriGen(prunedH, m + 1)
        if Hmp1:
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


if __name__ == '__main__':
    filename = 'Groceries.csv'
    # Alternative input source: dataSet = loadDataSet(filename)
    # Each line of kosarak.dat is split on whitespace into one transaction.
    # The context manager closes the file once it has been read.
    with open('kosarak.dat') as dataFile:
        dataSet = [line.split() for line in dataFile]
    L,supportData = apriori(dataSet,0.05)
    rules = generateRules(L,supportData, 0.9)
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值