Python_数据分析_序列模式和关联规则区别

区别

  1. 我从百度百科找到了这个                                                               :
  2. 如果不知道啥是关联规则可以看看这篇博客,我们主要来对比着看看序列模式。
    1. 目的:关联规则研究一次事件中的项目组合。序列模式则是求解事件之间的时间关系。也就是后者多了一个时间标记,我们将二维的数据展开就是关联规则的数据了,反正都是同一个客户买的。
    2. 大家都说AprioriAll和Apriori的区别在于:AprioriAll候选集生成的时候需要区分最后两个元素的前后,因此就有<p.item1, p.item2, …, p.itemk, q.itemk>和<p.item1, p.item2, …, q.itemk, p.itemk>两个元素。为什么呢?
      1. 当我们将项集都排列在序列上,进入项集阶段,我们继续给项集排列,并通过Apriori删掉支持数不够的项集,得到项集与数字的映射表。
      2. 转换阶段:我们通过1中得到的映射表,映射到排列阶段得到的序列上。得到新的序列。
      3. 序列阶段:同样利用Apriori的算法去匹配。
  3. 代码如下,Apriori的库博客里有,我就不贴了。
    import sys
    sys.path.append(r'../apriori')
    # python 原有 apriori.py 文件,故需要少写最后一个字母
    import aprior
    import itertools
    import pandas as pd
    
    
    def createLs1(dataSet, min_support):
        """Build the large 1-sequences ('Ls' = Large Sequence).

        Flattens the customer sequences into plain transactions, runs the
        plain Apriori on them (with the support threshold rescaled to the
        transaction level), and numbers the resulting large itemsets.

        Returns:
            mapping: dict of {large itemset: integer id}.
            supportLs1: dict of {(id,): support rescaled back to the
                sequence level}.
        """
        num_sequences = len(dataSet)
        transactions = list(itertools.chain(*dataSet))
        num_transactions = len(transactions)

        # Rescale the sequence-level threshold to the transaction level
        # before handing the flattened data to plain Apriori.
        litemsets = aprior.apriori(
            transactions,
            min_support=min_support * num_sequences / num_transactions,
        )

        # Number each large itemset so sequences can be re-encoded as ids.
        mapping = {itemset: idx for idx, itemset in enumerate(litemsets)}

        # Undo the rescaling so reported supports are per sequence again.
        supportLs1 = {}
        for itemset, support in litemsets.items():
            supportLs1[(mapping[itemset],)] = support * num_transactions / num_sequences

        return mapping, supportLs1
    	
    def seqMapping(seq, mapping):
        """Re-encode one customer sequence via the litemset-id mapping.

        Each transaction is replaced by the ids of every mapped litemset it
        contains; transactions containing no mapped litemset are dropped.
        """
        encoded = []
        for transaction in seq:
            items = set(transaction)
            ids = [ident for litemset, ident in mapping.items() if litemset <= items]
            if ids:
                encoded.append(ids)
        return encoded
    
    def transform(dataSet, mapping):
        """Convert every customer sequence to litemset-id form.

        Sequences whose every transaction lost all mapped items are removed
        from the result.
        """
        encoded_all = (seqMapping(customer_seq, mapping) for customer_seq in dataSet)
        return [encoded for encoded in encoded_all if encoded]
    	
    def seqGen(seqA, seqB):
        """Join two large k-sequences into two (k+1)-candidates.

        The join succeeds only when the two sequences agree on everything
        but their last element; each candidate appends the OTHER sequence's
        last element. Returns None when the prefixes differ.
        """
        if seqA[:-1] != seqB[:-1]:
            return None
        return [seqA + [seqB[-1]], seqB + [seqA[-1]]]
    
    def CsGen(Ls):
        """Generate candidate (k+1)-sequences from the large k-sequences.

        Every unordered pair is joined via seqGen; a candidate survives
        pruning only if the subsequence obtained by dropping its first
        element is itself a large sequence.
        """
        candidates = []
        for pair in itertools.combinations(Ls, 2):
            joined = seqGen(*pair)
            if joined is not None:
                candidates += joined
        # Pruning: drop candidates whose tail is not large.
        return [candidate for candidate in candidates if candidate[1:] in Ls]
    	
    
    def isSubSeq(seq, cusSeq):
        """Test whether id-sequence `seq` is contained in `cusSeq`.

        Containment requires each element of `seq` to occur in a strictly
        later transaction of `cusSeq` than the previous one; matching is
        greedy on the earliest possible transaction, which is always safe
        for subsequence containment.
        """
        if len(seq) > len(cusSeq):
            return False

        if len(seq) == 1:
            target = seq[0]
            return any(target in transaction for transaction in cusSeq)

        if len(seq) > 1:
            for position, transaction in enumerate(cusSeq):
                if seq[0] in transaction:
                    # Anchor on the first hit, then match the remainder
                    # strictly after it.
                    return isSubSeq(seq[1:], cusSeq[position + 1:])
            return False
    
    			
    def calcSupport(transformDS, Cs, min_support):
        """Count candidate supports over the transformed data set.

        Returns:
            A list of large sequences (as lists) and a dictionary of
            `large-sequence (tuple): support` pairs.
        """
        n = len(transformDS)
        supportLsk = {}
        for candidate in Cs:
            hits = sum(isSubSeq(candidate, cusSeq) for cusSeq in transformDS)
            frequency = hits / n
            if frequency >= min_support:
                supportLsk[tuple(candidate)] = frequency
        return [list(key) for key in supportLsk], supportLsk
    	
    	
    	
    def isSubSeq2(seq, cusSeq):
        """Test whether itemset-sequence `seq` is contained in `cusSeq`.

        `seq` is contained in `cusSeq` when there exist strictly increasing
        positions i1 < i2 < ... such that each seq[j] is a subset of
        cusSeq[ij] (the containment definition of Agrawal & Srikant, 1995).

        Returns False when no such alignment exists; an empty `seq` falls
        through all branches and yields None (falsy), matching the
        companion isSubSeq.
        """
        nSeq, nCusSeq = len(seq), len(cusSeq)

        # A longer sequence can never fit inside a shorter one.
        if nSeq > nCusSeq:
            return False

        if nSeq == 1:
            return any(seq[0].issubset(itemset) for itemset in cusSeq)

        if nSeq > 1:
            head = [seq[0].issubset(itemset) for itemset in cusSeq]
            if any(head):
                split = head.index(True)
                # BUG FIX: continue strictly AFTER the matched transaction
                # (was `cusSeq[split:]`, which let two different elements of
                # `seq` match the same transaction — e.g. it wrongly reported
                # <{1},{2}> as contained in <{1,2},{5}>).  This mirrors the
                # `split + 1` already used by isSubSeq.
                return isSubSeq2(seq[1:], cusSeq[split + 1:])
            else:
                return False
    
    def notProperSubSeq(seq, cusSeq):
        """True unless `seq` is a PROPER subsequence of `cusSeq`.

        A sequence equal to `cusSeq` is not a proper subsequence, so it is
        kept (True); anything strictly contained in `cusSeq` is flagged
        for removal (False).
        """
        return True if seq == cusSeq else not isSubSeq2(seq, cusSeq)
    		
    
    				
    def maxLs(Ls, supportLs):
        """Keep only the maximal large sequences and their supports.

        Sweeps LsCopy from the back; for each reference sequence still
        present in Ls, removes every sequence that is a proper subsequence
        of it (the reference itself survives because notProperSubSeq
        returns True on equality), then filters supportLs to the survivors.

        NOTE(review): the loop conditions `lenC > 1 and lenL > 1` mean the
        element at index 0 is never used as a reference sequence, and the
        sweep stops as soon as only one sequence remains — presumably
        intentional shortcuts, but worth confirming against the paper.
        """
        LsCopy = Ls.copy()
        lenL, lenC = len(Ls), len(LsCopy)
    
        while lenC > 1 and lenL > 1:
            # Only sequences that have not themselves been pruned away may
            # act as a reference for pruning others.
            if LsCopy[lenC - 1] in Ls:
                mask = [notProperSubSeq(seq, LsCopy[lenC - 1]) for seq in Ls]
                Ls = list(itertools.compress(Ls, mask))
                lenL = len(Ls)
                
            lenC -= 1
        # Restrict the support table to the surviving maximal sequences.
        supportLs = {tuple(seq): supportLs[tuple(seq)] for seq in Ls} # Dict comprehension
        return Ls, supportLs
    	
    	
    	
    def aprioriAll(dataSet, min_support=0.25):
        """Run the AprioriAll algorithm to mine sequential patterns.

        Phases: litemset discovery (createLs1), transformation of the data
        set into litemset-id form, level-wise candidate generation and
        counting, and a final maximality sweep.

        Refer to:
        Agrawal,R.,Srikant,R.,Institute of Electric and Electronic
        Engineer et al. Mining sequential patterns[C]. Proceedings
        of the Eleventh International Conference on Data Engineering,
        Washington DC, USA: IEEE Computer Society,1995:3-14.

        Returns a DataFrame with columns ['sequence', 'support'].
        """
        mapping, supportLs1 = createLs1(dataSet, min_support)
        transformDS = transform(dataSet, mapping)

        levels = [[list(key) for key in supportLs1]]
        supportLs = dict(supportLs1)

        # Level-wise phase: join the current large sequences, count the
        # candidates, and stop once a level produces nothing new.
        while len(levels[-1]) > 1:
            candidates = CsGen(levels[-1])
            large_k, support_k = calcSupport(transformDS, candidates, min_support)
            if not large_k:
                break
            levels.append(large_k)
            supportLs.update(support_k)

        flat = list(itertools.chain(*levels))

        # Decode litemset ids back to the original itemsets.
        decode = {ident: litemset for litemset, ident in mapping.items()}
        decoded = [[decode[ident] for ident in seq] for seq in flat]
        supportLs = {
            tuple(decode[ident] for ident in key): value
            for key, value in supportLs.items()
        }

        # Keep only maximal sequences, then report as a DataFrame.
        decoded, supportLs = maxLs(decoded, supportLs)

        return pd.DataFrame(list(supportLs.items()), columns=['sequence', 'support'])
    	
    	# 利用迭代器将数据转化为列表的形式 
    
    def aggFunc(*args):
        """Flatten the given iterables into one list (groupby helper)."""
        flattened = []
        for chunk in args:
            flattened.extend(chunk)
        return flattened
    	
    if __name__ == '__main__':
        # Demo driver: expects the Transactions.csv that accompanies the
        # companion apriori post (columns OrderNumber, LineNumber, Model).
        transactions = pd.read_csv("../apriori/Transactions.csv")
        
        """
        这里调用groupby()函数将Model下的变量根据
        OrderNumber(第一关键字),LineNumber(第二关键字),
        进行排序
        """
        # Group the Model column by OrderNumber (primary key) and
        # LineNumber (secondary key): each group becomes one transaction,
        # and each order then becomes one customer sequence below.
        baskets = transactions['Model']\
        .groupby([transactions['OrderNumber'], transactions['LineNumber']])\
        .apply(aggFunc)
        # Collapse to one list of transactions per OrderNumber.
        dataSet = list(baskets.groupby(level=0).apply(list))
        # seq1 = [           [30], [90]          ]
        # seq2 = [ [10, 20], [30], [40, 60, 70]  ]
        # seq3 = [         [30, 50, 70],         ]
        # seq4 = [      [30], [40, 70], [90]     ]
        # seq5 = [              [90],            ]
        # dataSet = [seq1, seq2, seq3, seq4, seq5]
        print(aprioriAll(dataSet, min_support=0.05))
    
    
    
        # NOTE(review): this message prints even though the script DID just
        # run directly — it contradicts the branch it sits in; confirm intent.
        print ('This script should be imported instead of running directly!')
    else:
        print ('aprioriAll imported!')

    最后推荐一篇写AprioriAll和AprioriSome算法的博客:https://blog.csdn.net/weeyang/article/details/52793864

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值