区别
- 我从百度百科找到了这个 :
- 如果不知道啥是关联规则可以看看这篇博客,我们主要来对比着看看序列模式。
- 目的:关联规则研究一次事件中的项目组合。序列模式则是求解事件之间的时间关系。也就是后者多了一个时间标记,我们将二维的数据展开就是关联规则的数据了,反正都是一个客户买的。
- 大家都说AprioriAll和Apriori的区别在于:AprioriAll候选集生成的时候需要区分最后两个元素的前后,因此就有 <p.item_1, p.item_2, …, p.item_{n-1}, q.item_{n-1}> 和 <p.item_1, p.item_2, …, q.item_{n-1}, p.item_{n-1}> 两个元素。为什么呢?
- 当我们将项集都排列在序列上,进入项集阶段,我们继续给项集排列,并通过Apriori删掉支持数不够的项集,得到项集与数字的映射表。
- 转换阶段:我们通过项集阶段得到的映射表,映射到排列阶段得到的序列上,得到新的序列。
- 序列阶段:同样利用Apriori的算法去匹配。
- 代码如下,Apriori的库博客里有,我就不贴了。
import sys
sys.path.append(r'../apriori')
# Python already has an `apriori.py` file, so the local module name drops
# the last letter.
import aprior
import itertools
import pandas as pd


def createLs1(dataSet, min_support):  # 'Ls' for Large Sequence
    """Find the large itemsets (litemsets) and the large 1-sequences.

    Args:
        dataSet: list of customer sequences; each sequence is a list of
            transactions and each transaction is a list of items.
        min_support: minimum support, as a fraction of the customers.

    Returns:
        A ``(mapping, supportLs1)`` pair. ``mapping`` maps every litemset
        returned by ``aprior.apriori`` to an integer id; ``supportLs1`` maps
        the 1-sequence ``(id,)`` to its support rescaled to the number of
        customers. Both are empty when ``dataSet`` holds no transactions.
    """
    n = len(dataSet)
    flattenSet = list(itertools.chain(*dataSet))
    flatten_n = len(flattenSet)
    if flatten_n == 0:
        # No transactions at all: nothing can be frequent, and the rescaling
        # below would divide by zero.
        return {}, {}
    # Transform the min_support to litemset_support
    min_support_new = min_support * n / flatten_n
    # NOTE(review): this presumably counts transactions rather than
    # customers, so a customer holding the same itemset in several
    # transactions is counted more than once -- confirm against aprior.apriori.
    litemsets = aprior.apriori(flattenSet, min_support=min_support_new)
    mapping = {litemset: idx for idx, litemset in enumerate(litemsets)}
    # Transform the litemset_support to sequence_support
    supportLs1 = {(mapping[litemset],): support * flatten_n / n
                  for litemset, support in litemsets.items()}
    return mapping, supportLs1


def seqMapping(seq, mapping):
    """Rewrite one customer sequence in terms of litemset ids.

    Every transaction is replaced by the list of ids of the litemsets it
    contains; transactions containing no litemset are dropped.
    """
    newSeq = []
    for iSet in seq:
        items = set(iSet)  # built once per transaction, not once per litemset
        newSet = [idx for litemset, idx in mapping.items() if litemset <= items]
        if newSet:
            newSeq.append(newSet)
    return newSeq


def transform(dataSet, mapping):
    """Apply `seqMapping` to every customer sequence, dropping empty results."""
    mapped = (seqMapping(seq, mapping) for seq in dataSet)
    return [newSeq for newSeq in mapped if newSeq]


def seqGen(seqA, seqB):
    """Join two k-sequences that share their first k-1 elements.

    Returns:
        The two (k+1)-sequences obtained by appending the last element of
        one sequence to the other (both orders), or ``None`` when the
        prefixes differ.
    """
    if seqA[:-1] != seqB[:-1]:
        return None
    return [seqA + [seqB[-1]], seqB + [seqA[-1]]]


def CsGen(Ls):
    """Generate candidate (k+1)-sequences from the large k-sequences `Ls`.

    Candidates come from joining every unordered pair of sequences in `Ls`;
    a candidate is kept only if the sequence obtained by dropping its first
    element is itself in `Ls`.
    """
    known = {tuple(seq) for seq in Ls}  # O(1) membership for the pruning step
    Cs = []
    for seqA, seqB in itertools.combinations(Ls, 2):
        newSeqs = seqGen(seqA, seqB)
        if newSeqs is not None:
            Cs.extend(newSeqs)
    return [seq for seq in Cs if tuple(seq[1:]) in known]  # Pruning


def _isContained(seq, cusSeq, matches):
    """Greedy ordered-containment test shared by `isSubSeq` and `isSubSeq2`.

    Each element of `seq` must match (per `matches(pattern, element)`) a
    distinct element of `cusSeq`, at strictly increasing positions.
    An empty `seq` is contained in every sequence.
    """
    if len(seq) > len(cusSeq):
        return False
    pos, end = 0, len(cusSeq)
    for pattern in seq:
        # Taking the earliest match leaves the most room for what follows.
        while pos < end and not matches(pattern, cusSeq[pos]):
            pos += 1
        if pos == end:
            return False
        pos += 1  # the next pattern element must match strictly later
    return True


def isSubSeq(seq, cusSeq):
    """Return True if the id sequence `seq` is contained in `cusSeq`.

    `seq` is a list of litemset ids; `cusSeq` is a transformed customer
    sequence (a list of lists of litemset ids).
    """
    return _isContained(seq, cusSeq, lambda idx, element: idx in element)


def calcSupport(transformDS, Cs, min_support):
    """Count the support of every candidate and keep the large ones.

    Return:
        a list of large-sequences
        a dictionary of `large-sequence: support` pairs
    """
    supportLsk = {}
    n = len(transformDS)
    if n == 0:
        # No customer sequences: no candidate can reach any support.
        return [], supportLsk
    for seq in Cs:
        support = sum(isSubSeq(seq, cusSeq) for cusSeq in transformDS) / n
        if support >= min_support:
            supportLsk[tuple(seq)] = support
    return [list(key) for key in supportLsk], supportLsk


def isSubSeq2(seq, cusSeq):
    """Return True if the itemset sequence `seq` is contained in `cusSeq`.

    Both arguments are sequences of sets; an element of `seq` matches an
    element of `cusSeq` when it is a subset of it. Matches must occur at
    strictly increasing positions, so one element of `cusSeq` can never
    absorb two consecutive elements of `seq`.
    """
    return _isContained(
        seq, cusSeq, lambda itemset, element: itemset.issubset(element))


def notProperSubSeq(seq, cusSeq):
    """Return True unless `seq` is a proper sub-sequence of `cusSeq`."""
    if seq == cusSeq:
        return True
    return not isSubSeq2(seq, cusSeq)


def maxLs(Ls, supportLs):
    """Keep only the maximal sequences of `Ls`.

    Sequences are visited from the end of `Ls` (where `aprioriAll` puts the
    longest ones) to the start; each one still alive removes every sequence
    that is properly contained in it.

    Returns:
        The filtered list and the support dictionary restricted to it.
    """
    references = Ls.copy()
    for reference in reversed(references):
        if len(Ls) <= 1:
            break
        # A reference that was already removed is contained in a survivor,
        # which has removed the reference's sub-sequences as well.
        if reference in Ls:
            mask = [notProperSubSeq(seq, reference) for seq in Ls]
            Ls = list(itertools.compress(Ls, mask))
    supportLs = {tuple(seq): supportLs[tuple(seq)] for seq in Ls}
    return Ls, supportLs


def aprioriAll(dataSet, min_support=0.25):
    """Mine sequential patterns with the AprioriAll algorithm.

    Refer to:
    Agrawal,R.,Srikant,R.,Institute of Electric and Electronic
    Engineer et al. Mining sequential patterns[C]. Proceedings
    of the Eleventh International Conference on Data Engineering,
    Washington DC, USA: IEEE Computer Society,1995:3-14.

    Args:
        dataSet: list of customer sequences (lists of transactions, each a
            list of items).
        min_support: minimum fraction of customers that must support a
            sequence.

    Returns:
        A pandas DataFrame with columns ``sequence`` and ``support`` holding
        the maximal large sequences.
    """
    # Litemset phase and transformation phase.
    mapping, supportLs1 = createLs1(dataSet, min_support)
    Ls1 = [list(key) for key in supportLs1]
    transformDS = transform(dataSet, mapping)

    # Sequence phase: grow the sequences level by level until no candidate
    # survives or a level is too small to be joined.
    LsList = [Ls1]
    supportLs = supportLs1.copy()
    while len(LsList[-1]) > 1:
        Csk = CsGen(LsList[-1])
        Lsk, supportLsk = calcSupport(transformDS, Csk, min_support)
        if not Lsk:
            break
        LsList.append(Lsk)
        supportLs.update(supportLsk)

    # Translate litemset ids back to the litemsets themselves.
    Ls = list(itertools.chain(*LsList))
    tr_mapping = {idx: litemset for litemset, idx in mapping.items()}
    Ls = [[tr_mapping[idx] for idx in seq] for seq in Ls]
    supportLs = {tuple(tr_mapping[idx] for idx in key): support
                 for key, support in supportLs.items()}

    # Maximal phase.
    Ls, supportLs = maxLs(Ls, supportLs)
    return pd.DataFrame(list(supportLs.items()), columns=['sequence', 'support'])


# Use an iterator to turn the data into list form.
def aggFunc(*args):
    """Chain all positional iterables together and return them as one list."""
    return list(itertools.chain(*args))


if __name__ == '__main__':
    transactions = pd.read_csv("../apriori/Transactions.csv")
    # Group the values of the Model column by OrderNumber (first key) and
    # LineNumber (second key).
    baskets = transactions['Model']\
        .groupby([transactions['OrderNumber'], transactions['LineNumber']])\
        .apply(aggFunc)
    dataSet = list(baskets.groupby(level=0).apply(list))
    # Small hand-made data set that can be used instead of the CSV file:
    # seq1 = [ [30], [90] ]
    # seq2 = [ [10, 20], [30], [40, 60, 70] ]
    # seq3 = [ [30, 50, 70], ]
    # seq4 = [ [30], [40, 70], [90] ]
    # seq5 = [ [90], ]
    # dataSet = [seq1, seq2, seq3, seq4, seq5]
    print(aprioriAll(dataSet, min_support=0.05))
    print('This script should be imported instead of running directly!')
else:
    print('aprioriAll imported!')
最后推荐一篇写AprioriAll和AprioriSome算法的博客:https://blog.csdn.net/weeyang/article/details/52793864