Python序列模式的关联算法:Python数据分析中序列模式与关联规则的区别

该博客介绍了如何使用Python实现AprioriAll算法来挖掘序列模式。首先通过`createLs1`函数找出大项集(litemset)并计算其支持度,然后利用`seqMapping`和`transform`函数将数据集转换为以大项集编号表示的序列。接着,通过`CsGen`生成候选序列,`calcSupport`计算支持度并过滤掉低于最小支持度的序列,再由`maxLs`只保留最大序列。最后,`aprioriAll`函数整合整个过程,对`Transactions.csv`数据进行挖掘,并展示结果。
摘要由CSDN通过智能技术生成

import sys

sys.path.append(r'../apriori')

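# NOTE: an apriori.py already exists for Python, so the local helper module
# is named "aprior" (last letter dropped) to avoid the name clash.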

import aprior

import itertools

import pandas as pd

def createLs1(dataSet, min_support):  # 'Ls' for Large Sequence
    """Find the large 1-sequences (litemsets) and give each an integer id.

    Args:
        dataSet: list of customer sequences; each sequence is a list of
            transactions and each transaction is an iterable of items.
        min_support: minimum support, as a fraction of customer sequences.

    Returns:
        A ``(mapping, supportLs1)`` pair.  ``mapping`` maps every litemset
        returned by ``aprior.apriori`` to an integer id.  ``supportLs1``
        maps the 1-tuple ``(id,)`` to the litemset support rescaled from the
        transaction base back to the sequence base.  Both dictionaries are
        empty when ``dataSet`` contains no transactions.
    """
    n = len(dataSet)
    flattenSet = list(itertools.chain.from_iterable(dataSet))
    flatten_n = len(flattenSet)

    # Nothing to mine; also avoids dividing by zero below.
    if flatten_n == 0:
        return {}, {}

    # Transform the min_support to litemset_support: the itemset miner works
    # on individual transactions, not on customer sequences.
    min_support_new = min_support * n / flatten_n
    # presumably returns a dict of {litemset: support} -- it is enumerated
    # and its .items() is read below; verify against the aprior module.
    litemsets = aprior.apriori(flattenSet, min_support=min_support_new)
    mapping = {v: k for k, v in enumerate(litemsets)}

    # Transform the litemset_support to sequence_support.
    # NOTE(review): this rescaling counts a customer once per matching
    # transaction, so a litemset bought twice by one customer is counted
    # twice -- confirm this approximation is acceptable.
    supportLs1 = {
        (mapping[k],): v * flatten_n / n for k, v in litemsets.items()
    }
    return mapping, supportLs1

def seqMapping(seq, mapping):
    """Rewrite one customer sequence in terms of litemset ids.

    Args:
        seq: a customer sequence, i.e. a list of transactions, each an
            iterable of hashable items.
        mapping: dict mapping a litemset (a set-like key supporting ``<=``)
            to its integer id.

    Returns:
        A list with one entry per transaction that contains at least one
        litemset; each entry is the list of ids of all litemsets contained
        in that transaction, in ``mapping`` order.  Transactions containing
        no litemset are dropped.
    """
    newSeq = []
    for iSet in seq:
        # Build the set once per transaction instead of once per litemset.
        items = set(iSet)
        newSet = [v for k, v in mapping.items() if k <= items]
        if newSet:
            newSeq.append(newSet)
    return newSeq

def transform(dataSet, mapping):
    """Rewrite every customer sequence in terms of litemset ids.

    Args:
        dataSet: list of customer sequences (see ``seqMapping``).
        mapping: dict mapping each litemset to its integer id.

    Returns:
        The list of transformed sequences produced by ``seqMapping``;
        sequences that end up empty (no transaction contains a litemset)
        are dropped.
    """
    mapped = (seqMapping(seq, mapping) for seq in dataSet)
    return [newSeq for newSeq in mapped if newSeq]

def seqGen(seqA, seqB):
    """Join two sequences of equal prefix into two longer candidates.

    Args:
        seqA: a non-empty list of litemset ids.
        seqB: a non-empty list of litemset ids.

    Returns:
        ``[seqA + [seqB[-1]], seqB + [seqA[-1]]]`` when both sequences agree
        on everything but their last element, otherwise ``None``.  The
        inputs are never modified.
    """
    if seqA[:-1] != seqB[:-1]:
        return None
    # Both orders are produced because order matters in a sequence.
    return [seqA + [seqB[-1]], seqB + [seqA[-1]]]

def CsGen(Ls):
    """Generate the candidate sequences one element longer than ``Ls``.

    Args:
        Ls: list of large sequences of equal length, each a list of
            hashable litemset ids.

    Returns:
        The candidates obtained by joining every pair of sequences in
        ``Ls`` with ``seqGen``, keeping only those whose tail (the
        candidate without its first element) is itself in ``Ls``.

    NOTE(review): pairs come from ``itertools.combinations``, so a sequence
    is never joined with itself and candidates that repeat the same last
    element (e.g. ``[1, 1]``) are not generated -- confirm this is intended.
    """
    Cs = []
    for seqA, seqB in itertools.combinations(Ls, 2):
        newSeqs = seqGen(seqA, seqB)
        if newSeqs is not None:
            Cs.extend(newSeqs)

    # Pruning: hash the known sequences once instead of scanning the list
    # for every candidate.
    known = {tuple(seq) for seq in Ls}
    return [seq for seq in Cs if tuple(seq[1:]) in known]

def isSubSeq(seq, cusSeq):
    """Tell whether ``seq`` is contained in a transformed customer sequence.

    Args:
        seq: list of litemset ids.
        cusSeq: transformed customer sequence, i.e. a list of collections
            of litemset ids (one collection per transaction).

    Returns:
        ``True`` when every id of ``seq`` can be found, in order, in
        strictly later and later transactions of ``cusSeq``; ``False``
        otherwise.  An empty ``seq`` is contained in any sequence.
    """
    if len(seq) > len(cusSeq):
        return False
    # A single shared iterator makes each id resume the search right after
    # the transaction that matched the previous id (earliest match first).
    remaining = iter(cusSeq)
    return all(
        any(item in transaction for transaction in remaining)
        for item in seq
    )

def calcSupport(transformDS, Cs, min_support):
    """Keep the candidate sequences that reach the minimum support.

    Args:
        transformDS: list of transformed customer sequences.
        Cs: list of candidate sequences (lists of litemset ids).
        min_support: minimum fraction of customer sequences that must
            contain a candidate for it to be kept.

    Returns:
        a list of large-sequences
        a dictionary of `large-sequence: support` pairs
        Both are empty when ``transformDS`` or ``Cs`` is empty.
    """
    n = len(transformDS)
    supportLsk = {}
    # Support is undefined without customers; also avoids dividing by zero.
    if n == 0:
        return [], supportLsk

    for seq in Cs:
        support = sum(isSubSeq(seq, cusSeq) for cusSeq in transformDS) / n
        if support >= min_support:
            supportLsk[tuple(seq)] = support
    return [list(k) for k in supportLsk], supportLsk

def isSubSeq2(seq, cusSeq):
    """Tell whether a sequence of itemsets is contained in another one.

    Args:
        seq: list of set-like itemsets (anything offering ``issubset``).
        cusSeq: list of itemsets to search in.

    Returns:
        ``True`` when each itemset of ``seq`` is a subset of a distinct
        itemset of ``cusSeq`` and those matches appear in the same order;
        ``False`` otherwise.  An empty ``seq`` is contained in any sequence.
    """
    if len(seq) > len(cusSeq):
        return False
    # Sharing one iterator forces every itemset to match strictly after the
    # previous match, so one itemset of cusSeq can never absorb two
    # consecutive itemsets of seq (e.g. <(30)(40)> is not in <(30 40)(70)>).
    remaining = iter(cusSeq)
    return all(
        any(itemset.issubset(other) for other in remaining)
        for itemset in seq
    )

def notProperSubSeq(seq, cusSeq):
    """Tell whether ``seq`` must be kept when filtering against ``cusSeq``.

    Args:
        seq: the sequence being examined.
        cusSeq: the reference sequence.

    Returns:
        ``True`` when ``seq`` equals ``cusSeq`` or ``isSubSeq2`` reports
        that it is not contained in ``cusSeq``; ``False`` when ``seq`` is a
        different sequence contained in ``cusSeq``.
    """
    return seq == cusSeq or not isSubSeq2(seq, cusSeq)

def maxLs(Ls, supportLs):
    """Keep only the maximal sequences and their supports.

    Args:
        Ls: list of large sequences (lists of hashable itemsets).
        supportLs: dict mapping ``tuple(sequence)`` to its support; it must
            hold an entry for every sequence of ``Ls``.

    Returns:
        A ``(sequences, supports)`` pair restricted to the sequences of
        ``Ls`` that ``notProperSubSeq`` does not report as contained in
        another surviving sequence.
    """
    remaining = list(Ls)
    # Walk the references from the end of the original list.  Every entry
    # is visited, the first one included, so sequences contained in Ls[0]
    # are filtered as well.
    for candidate in reversed(Ls):
        if len(remaining) <= 1:
            break
        # A reference that was itself discarded is covered by a longer one.
        if candidate in remaining:
            remaining = [
                seq for seq in remaining if notProperSubSeq(seq, candidate)
            ]

    kept_support = {tuple(seq): supportLs[tuple(seq)] for seq in remaining}
    return remaining, kept_support

def aprioriAll(dataSet, min_support=0.25):
    """Run the AprioriAll algorithm to mine sequential patterns.

    Args:
        dataSet: list of customer sequences; each sequence is a list of
            transactions and each transaction is an iterable of items.
        min_support: minimum support as a fraction of customer sequences.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sequence`` (a tuple of
        litemsets) and ``support``, one row per sequence kept by ``maxLs``.

    Refer to:
        Agrawal, R., Srikant, R. Mining sequential patterns[C]. Proceedings
        of the Eleventh International Conference on Data Engineering,
        Washington DC, USA: IEEE Computer Society, 1995: 3-14.
    """
    # Litemset phase and transformation phase.
    mapping, supportLs1 = createLs1(dataSet, min_support)
    Ls1 = [list(k) for k in supportLs1]
    transformDS = transform(dataSet, mapping)

    # Sequence phase: grow the large sequences one element at a time until
    # no candidate reaches the minimum support.
    LsList = [Ls1]
    supportLs = dict(supportLs1)
    while len(LsList[-1]) > 1:
        Csk = CsGen(LsList[-1])
        Lsk, supportLsk = calcSupport(transformDS, Csk, min_support)
        if not Lsk:
            break
        LsList.append(Lsk)
        supportLs.update(supportLsk)

    # Translate the integer ids back to the original litemsets.
    tr_mapping = {v: k for k, v in mapping.items()}
    Ls = [
        [tr_mapping[i] for i in seq]
        for seq in itertools.chain.from_iterable(LsList)
    ]
    supportLs = {
        tuple(tr_mapping[i] for i in k): v for k, v in supportLs.items()
    }

    # Maximal phase.
    Ls, supportLs = maxLs(Ls, supportLs)
    return pd.DataFrame(list(supportLs.items()), columns=['sequence', 'support'])

# Use an iterator chain to turn the grouped data into a plain list.
def aggFunc(*args):
    """Concatenate all positional iterables into one flat list.

    Args:
        *args: any number of iterables.

    Returns:
        A list holding the elements of every argument, in order.
    """
    return list(itertools.chain.from_iterable(args))

if __name__ == '__main__':
    transactions = pd.read_csv("../apriori/Transactions.csv")

    # Group the values of the Model column by OrderNumber (primary key) and
    # LineNumber (secondary key); aggFunc turns every group into a list.
    baskets = (
        transactions['Model']
        .groupby([transactions['OrderNumber'], transactions['LineNumber']])
        .apply(aggFunc)
    )
    # One list of baskets per OrderNumber (index level 0).
    dataSet = list(baskets.groupby(level=0).apply(list))

    # Small hand-made data set that can replace the CSV-derived one:
    # seq1 = [ [30], [90] ]
    # seq2 = [ [10, 20], [30], [40, 60, 70] ]
    # seq3 = [ [30, 50, 70], ]
    # seq4 = [ [30], [40, 70], [90] ]
    # seq5 = [ [90], ]
    # dataSet = [seq1, seq2, seq3, seq4, seq5]

    print(aprioriAll(dataSet, min_support=0.05))
    print('This script should be imported instead of running directly!')
else:
    print('aprioriAll imported!')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值