数据科学案例14 推荐算法Apriori案例

最新推荐文章于 2023-04-20 09:23:42 发布

irober

最新推荐文章于 2023-04-20 09:23:42 发布

阅读量516

点赞数

分类专栏： # 数据科学案例篇 python数据挖掘文章标签： python 算法数据挖掘

本文链接：https://blog.csdn.net/irober/article/details/104894815

版权

python数据挖掘同时被 2 个专栏收录

34 篇文章 1 订阅

订阅专栏

数据科学案例篇

13 篇文章 6 订阅

订阅专栏

数据科学案例14 推荐算法Apriori案例

16 推荐算法案例

16 推荐算法案例

16.1 Apriori包编写

# Toy input: each seqN is one customer sequence, i.e. an ordered list of
# transactions, and each transaction is a list of item ids.
seq1 = [           [30], [90]          ]
seq2 = [ [10, 20], [30], [40, 60, 70]  ]
seq3 = [         [30, 50, 70],         ]
seq4 = [      [30], [40, 70], [90]     ]
seq5 = [              [90],            ]
# The data set is the list of all customer sequences.
dataSet = [seq1, seq2, seq3, seq4, seq5]
# Minimum support threshold passed to the mining functions below.
min_support=0.25

1、Sort Phase

过程略，直接使用整理过的示例数据

2、Litemset Phase

搜索litemset，直接使用apriori算法，主要的区别在于计算支持度时，一个客户customer购买了同样的项集（itemset）时，支持频度仅计算一次。这是因为在apriori算法中，支持度是对交易(transaction)而言的，但在序列模式的计算中，大项集的支持度是对客户(customer)而言的

import os
os.chdir(r"H:\假期\myGitHub\Machine_Learning\练习\数据科学")
import sys
sys.path.append('./myscripts')

import itertools
import pandas as pd
from apriori import apriori

apriori imported!

def createLs1(dataSet, min_support):
    '''
    Mine the large 1-sequences (large itemsets) with the apriori algorithm.

    `Ls` stands for Large Sequence.

    Parameters
    ----------
    dataSet : list
        Customer sequences; each sequence is a list of transactions and
        each transaction is a list of items.
    min_support : float
        Minimum support, as a fraction of the customers (sequences).

    Returns
    -------
    mapping : dict
        Large itemset -> consecutive integer id, in the order in which
        `apriori` yields the itemsets.
    supportLs1 : dict
        `(id,)` -> fraction of customers that have at least one
        transaction containing the itemset.
    '''
    n = len(dataSet)
    # Flatten one level: the list of all transactions of all customers.
    flattenSet = list(itertools.chain.from_iterable(dataSet))
    flatten_n = len(flattenSet)
    if flatten_n == 0:
        # No transactions at all: nothing can be large, and the rescaling
        # below would divide by zero.
        return {}, {}

    # apriori counts support per transaction.  An itemset supported by a
    # fraction `min_support` of the customers appears in at least
    # min_support * n transactions, so this threshold never loses one.
    min_support_new = min_support * n / flatten_n
    # assumes `apriori` returns a dict keyed by (frozen)sets, as the
    # subset test in seqMapping also requires -- TODO confirm
    litemsets = apriori(flattenSet, min_support=min_support_new)

    # Re-count the support per customer: a customer who bought the same
    # itemset in several transactions must only be counted once.
    mapping, supportLs1 = {}, {}
    for itemset in litemsets:
        n_customers = sum(
            any(itemset <= set(basket) for basket in cusSeq)
            for cusSeq in dataSet
        )
        support = n_customers / n
        if support >= min_support:
            idx = len(mapping)
            mapping[itemset] = idx
            supportLs1[(idx,)] = support
    return mapping, supportLs1

# Test: mine the large 1-sequences of the sample data and display the
# itemset -> id mapping
mapping, supportLs1 = createLs1(dataSet, min_support=min_support)
mapping

flattenSet:[[30], [90], [10, 20], [30], [40, 60, 70], [30, 50, 70], [30], [40, 70], [90], [90]]
flatten_n:10
min_support_new:0.125
litemsets:{frozenset({30}): 0.4, frozenset({90}): 0.3, frozenset({70}): 0.3, frozenset({40}): 0.2, frozenset({40, 70}): 0.2}

{frozenset({30}): 0,
 frozenset({90}): 1,
 frozenset({70}): 2,
 frozenset({40}): 3,
 frozenset({40, 70}): 4}

supportLs1

{(0,): 0.8, (1,): 0.6, (2,): 0.6, (3,): 0.4, (4,): 0.4}

# Large 1-sequences as lists of ids, taken from the keys of supportLs1
Ls1 = [list(k) for k in supportLs1]
Ls1

[[0], [1], [2], [3], [4]]

3、Transformation Phase

def seqMapping(seq, mapping):
    '''
    Rewrite one customer sequence in terms of litemset ids.

    Every transaction of `seq` is replaced by the ids (values of
    `mapping`) of all litemsets (keys of `mapping`) that are subsets of
    the transaction.  Transactions containing no litemset are dropped.
    Treating litemsets as single integers keeps later comparisons cheap.
    '''
    mapped = []
    for transaction in seq:
        items = set(transaction)
        ids = []
        for litemset, idx in mapping.items():
            if litemset <= items:
                ids.append(idx)
        if ids:
            mapped.append(ids)
    return mapped

def transform(dataSet, mapping):
    '''
    Map every customer sequence of `dataSet` to its litemset-id form.

    Each sequence goes through `seqMapping`; sequences that end up empty
    (no transaction contains a litemset) are left out of the result.
    '''
    mappedSeqs = (seqMapping(cusSeq, mapping) for cusSeq in dataSet)
    return [newSeq for newSeq in mappedSeqs if newSeq != []]

# - Test: transform the sample data and print each mapped customer sequence
transformDS  = transform(dataSet, mapping)
for seq in transformDS :
    print(seq)

[[0], [1]]
[[0], [2, 3, 4]]
[[0, 2]]
[[0], [2, 3, 4], [1]]
[[1]]

4、Sequence Phase

# Candidate sequence generation
def seqGen(seqA, seqB):
    '''
    Join two large k-sequences into candidate (k+1)-sequences.

    If the two sequences agree on everything except their last element,
    return both orderings of the two last elements appended to the shared
    prefix: [seqA + last(seqB), seqB + last(seqA)].  Otherwise return None.
    The inputs are not modified.
    '''
    if seqA[:-1] != seqB[:-1]:
        return None
    lastA, lastB = seqA[-1], seqB[-1]
    return [seqA + [lastB], seqB + [lastA]]

def CsGen(Ls):
    '''
    Build the candidate (k+1)-sequences from the large k-sequences `Ls`.

    Every unordered pair of sequences in `Ls` is joined with `seqGen`;
    pairs that cannot be joined contribute nothing.
    '''
    candidates = []
    for left, right in itertools.combinations(Ls, 2):
        joined = seqGen(left, right)
        if joined is not None:
            candidates += joined
    # Pruning: keep a candidate only if the sequence obtained by dropping
    # its first element is itself a large k-sequence.
    return [cand for cand in candidates if cand[1:] in Ls]

# - Test: candidate generation on a small set of large 3-sequences

testLs = [
    [1, 2, 3], 
    [1, 2, 4],
    [1, 3, 4],
    [1, 3, 5],
    [2, 3, 4]]
CsGen(testLs)

[1, 2, 3]--[1, 2, 4]
[[1, 2, 3, 4], [1, 2, 4, 3]]
CS:[[1, 2, 3, 4], [1, 2, 4, 3]]
[1, 2, 3]--[1, 3, 4]
None
[1, 2, 3]--[1, 3, 5]
None
[1, 2, 3]--[2, 3, 4]
None
[1, 2, 4]--[1, 3, 4]
None
[1, 2, 4]--[1, 3, 5]
None
[1, 2, 4]--[2, 3, 4]
None
[1, 3, 4]--[1, 3, 5]
[[1, 3, 4, 5], [1, 3, 5, 4]]
CS:[[1, 2, 3, 4], [1, 2, 4, 3], [1, 3, 4, 5], [1, 3, 5, 4]]
[1, 3, 4]--[2, 3, 4]
None
[1, 3, 5]--[2, 3, 4]
None

[[1, 2, 3, 4]]

# Test: list the pairs that itertools.combinations feeds to seqGen
for seqA, seqB in itertools.combinations(testLs, 2):
#     print(seqA)
    print(str(seqA)+'--'+str(seqB))

[1, 2, 3]--[1, 2, 4]
[1, 2, 3]--[1, 3, 4]
[1, 2, 3]--[1, 3, 5]
[1, 2, 3]--[2, 3, 4]
[1, 2, 4]--[1, 3, 4]
[1, 2, 4]--[1, 3, 5]
[1, 2, 4]--[2, 3, 4]
[1, 3, 4]--[1, 3, 5]
[1, 3, 4]--[2, 3, 4]
[1, 3, 5]--[2, 3, 4]

# seq[1:] drops the first element -- the slice used by the pruning step of CsGen
seq = [3, 4, 8]
seq[1:]

[4, 8]

# 子序列判断

def isSubSeq(seq, cusSeq):
    '''
    Check if a sequence is contained in a customer sequence.

    `seq` is contained in `cusSeq` when its elements can be found, in
    order, in strictly successive transactions of `cusSeq` (each element
    of `cusSeq` is a container tested with `in`).

    Parameters
    ----------
    seq : list
        Sequence of elements to look for.
    cusSeq : list
        Customer sequence: a list of transactions.

    Returns
    -------
    bool
        True if `seq` is contained in `cusSeq`.  The empty sequence is
        contained in every customer sequence.
    '''
    start = 0  # first transaction still available for matching
    for element in seq:
        for pos in range(start, len(cusSeq)):
            if element in cusSeq[pos]:
                # Matching the earliest possible transaction leaves the
                # most room for the remaining elements.
                start = pos + 1
                break
        else:
            # No remaining transaction contains this element.
            return False
    return True

# - Test: 3 is found in [3, 8], 4 in [4, 5, 6] and 8 in the final [8]

seq = [3, 4, 8]
cusSeq = [[7], [3, 8], [9], [4, 5, 6], [8]]
isSubSeq(seq, cusSeq)

True

# 产生频繁k序列，此步骤需要迭代执行

def calcSupport(transformDS, Cs, min_support):
    '''
    Keep the candidate sequences whose support reaches `min_support`.

    The support of a candidate is the number of sequences in
    `transformDS` that contain it (per `isSubSeq`) divided by
    len(transformDS).

    NOTE(review): the denominator is len(transformDS); `transform` drops
    customers without any litemset, so this can differ from the number of
    customers in the raw data -- confirm this is intended.

    Return: 1. a list of large-sequences
            2. a dictionary of `large-sequence: support` pairs
    '''
    total = len(transformDS)
    supportLsk = {}
    for cand in Cs:
        hits = sum(isSubSeq(cand, cusSeq) for cusSeq in transformDS)
        ratio = hits / total
        if ratio >= min_support:
            supportLsk[tuple(cand)] = ratio
    largeSeqs = [list(key) for key in supportLsk]
    return largeSeqs, supportLsk

# - Test: generate candidate 2-sequences and keep the large ones

Cs2 = CsGen(Ls1) # Ls1: [[0], [1], [2], [3], [4]]
Ls2, supportLs2 = calcSupport(transformDS, Cs2, min_support)
# print(Ls2)
# print(supportLs2)

    


```python
print(Ls2)
print(supportLs2)

[[0, 1], [0, 2], [0, 3], [0, 4]]
{(0, 1): 0.4, (0, 2): 0.4, (0, 3): 0.4, (0, 4): 0.4}

5、Maximal Phase

一个更快速搜寻子序列的算法可参考：
R. Agrawal and R. Srikant. Mining sequential patterns. Research Report RJ 9910, IBM Almaden Research Center, San Jose, California, October 1994.

需要将大序列中的项集转换回原始的购物篮再进行序列最大化

# Invert the mapping: integer id -> original itemset
tr_mapping = {v: k for k, v in mapping.items()}
# All large sequences found so far (1-sequences followed by 2-sequences)
Ls = Ls1 + Ls2
# Replace every id in every large sequence by its itemset
Ls = [[tr_mapping[k] for k in seq] for  seq in Ls ]

# Merge the support tables and translate their keys the same way
supportLs = {}
supportLs.update(supportLs1); supportLs.update(supportLs2)
supportLs = {tuple([tr_mapping[i] for i in k]):v for k, v in 
             supportLs.items()}

print(supportLs)

{(frozenset({0}),): 0.8, (frozenset({1}),): 0.6, (frozenset({2}),): 0.6, (frozenset({3}),): 0.4, (frozenset({4}),): 0.4, (frozenset({3, 4}),): 0.4, (frozenset({2, 3}),): 0.4, (frozenset({2, 4}),): 0.4, (frozenset({2, 3, 4}),): 0.4, (frozenset({0}), frozenset({1})): 0.4, (frozenset({0}), frozenset({2})): 0.4, (frozenset({0}), frozenset({3})): 0.4, (frozenset({0}), frozenset({4})): 0.4}

# 序列最大化需要剔除的是其他大序列的非空真子序列
# （类似于非空真子集，序列本身要保留），
# 该步骤与Sequence阶段中判断子序列的方法（isSubSeq）类似，
# 区别在于已经将其中的项集映射回来了，因此稍作修改


def isSubSeq2(seq, cusSeq):
    '''
    Check if a sequence of itemsets is contained in another sequence.

    `seq` is contained in `cusSeq` when each of its itemsets is a subset
    of an element of `cusSeq`, in order, and no element of `cusSeq` is
    used for more than one itemset of `seq`.

    Parameters
    ----------
    seq : list
        Sequence of itemsets (objects providing `issubset`).
    cusSeq : list
        Sequence of itemsets to search in.

    Returns
    -------
    bool
        True if `seq` is contained in `cusSeq`.  The empty sequence is
        contained in every sequence.
    '''
    start = 0  # first element of cusSeq still available for matching
    for itemset in seq:
        for pos in range(start, len(cusSeq)):
            if itemset.issubset(cusSeq[pos]):
                # Continue strictly after the matched element so that two
                # itemsets of `seq` never share one element of `cusSeq`.
                start = pos + 1
                break
        else:
            # No remaining element of cusSeq covers this itemset.
            return False
    return True

def notProperSubSeq(seq, cusSeq):
    '''
    Tell whether `seq` should survive when compared against `cusSeq`.

    Returns True when `seq` equals `cusSeq`, or when `isSubSeq2` reports
    that `seq` is not contained in `cusSeq`; i.e. True unless `seq` is a
    proper sub sequence of `cusSeq`.
    '''
    return seq == cusSeq or not isSubSeq2(seq, cusSeq)

# 将备选序列中的最大化的序列保留下来

def maxLs(Ls, supportLs):
    '''
    Keep only the maximal sequences of `Ls`.

    Each sequence of `Ls` is taken in turn, from last to first, as a
    reference; if it has not been removed yet, every remaining sequence
    that is a proper sub sequence of it (per `notProperSubSeq`) is
    removed.  The first sequence is used as a reference as well, so no
    proper sub sequence of it is left behind.

    Parameters
    ----------
    Ls : list
        Large sequences, each a list of itemsets.
    supportLs : dict
        Maps `tuple(sequence)` to its support; must have an entry for
        every sequence that is kept (KeyError otherwise).

    Returns
    -------
    (list, dict)
        The maximal sequences, in their original order, and their
        `tuple(sequence): support` pairs.  The arguments are not modified.
    '''
    remaining = list(Ls)
    for reference in reversed(Ls):
        if len(remaining) <= 1:
            # A single sequence (or none) is trivially maximal.
            break
        if reference in remaining:
            remaining = [seq for seq in remaining
                         if notProperSubSeq(seq, reference)]

    kept_support = {tuple(seq): supportLs[tuple(seq)] for seq in remaining}
    return remaining, kept_support

# Reduce the large sequences to the maximal ones and display their supports
Ls, supportLs = maxLs(Ls, supportLs)
supportLs

{(frozenset({0}), frozenset({1})): 0.4,
 (frozenset({0}), frozenset({2})): 0.4,
 (frozenset({0}), frozenset({3})): 0.4,
 (frozenset({0}), frozenset({4})): 0.4}

6、aprioriAll

# ### aprioriAll


def aprioriAll(dataSet, min_support=0.4):
    '''
    Mine sequential patterns with the AprioriAll algorithm.

    Runs the litemset, transformation, sequence and maximal phases in
    turn and returns a DataFrame with one row per maximal large sequence
    (columns 'sequence' and 'support').

    Refer to:
    Agrawal,R.,Srikant,R.,Institute of Electric and Electronic
    Engineer et al. Mining sequential patterns[C]. Proceedings
    of the Eleventh International Conference on Data Engineering,
    Washington DC, USA: IEEE Computer Society,1995:3-14.
    '''
    # Litemset phase: large itemsets, their integer ids and supports.
    mapping, supportLs1 = createLs1(dataSet, min_support)
    levels = [[list(key) for key in supportLs1]]
    supports = dict(supportLs1)

    # Transformation phase: customer sequences expressed with the ids.
    transformDS = transform(dataSet, mapping)

    # Sequence phase: grow the sequences one element at a time until a
    # level has fewer than two sequences or yields no large sequence.
    while len(levels[-1]) > 1:
        candidates = CsGen(levels[-1])
        largeSeqs, largeSupports = calcSupport(
            transformDS, candidates, min_support)
        if not largeSeqs:
            break
        levels.append(largeSeqs)
        supports.update(largeSupports)

    # Translate the ids back to the original itemsets.
    id2itemset = {idx: itemset for itemset, idx in mapping.items()}
    Ls = [[id2itemset[idx] for idx in seq]
          for level in levels for seq in level]
    supportLs = {tuple(id2itemset[idx] for idx in key): value
                 for key, value in supports.items()}

    # Maximal phase: drop sequences contained in another large sequence.
    Ls, supportLs = maxLs(Ls, supportLs)

    return pd.DataFrame(list(supportLs.items()),
                        columns=['sequence', 'support'])

aprioriAll(dataSet, min_support=0.25)

	sequence	support
0	((30), (90))	0.4
1	((30), (40, 70))	0.4

16.2 调用现成Apriori包

# Second sample: five customer sequences; each inner list is one transaction
testSet = [
    [[1, 5], [2], [3], [4] ],
    [[1], [3], [4], [3, 5] ],
    [[1], [2], [3], [4]    ],
    [[1], [3], [5]         ],
    [[4], [5]              ]
    ]

import sys
sys.path.append('E:/myscripts')
from aprioriAll import aprioriAll

aprioriAll imported!

aprioriAll(testSet, min_support=0.4)

	sequence	support
0	((4), (5))	0.4
1	((1), (3), (5))	0.4
2	((1), (2), (3), (4))	0.4

import os, sys
import itertools
import pandas as pd

sys.path.append('./myscripts')

from aprioriAll import aprioriAll

# Load the transaction table; the path is relative to the working directory
# NOTE(review): the backslash separators look Windows-specific -- confirm
transactions = pd.read_csv(r'.\data\Transactions.csv')

def aggFunc(*args):
    '''
    Concatenate the elements of all positional iterables into one list.
    '''
    return list(itertools.chain.from_iterable(args))

# One basket per (OrderNumber, LineNumber): the list of 'Model' values in that group
baskets = transactions['Model'].groupby([transactions['OrderNumber'], transactions['LineNumber']]).apply(aggFunc)
baskets.head()

OrderNumber  LineNumber
SO51176      1                     [Road-250]
             2             [Road Bottle Cage]
SO51177      1                 [Touring-2000]
             2                    [Sport-100]
SO51178      1                 [Mountain-200]
Name: Model, dtype: object

# Group the baskets by the first index level (OrderNumber) so that each
# order becomes one sequence of baskets
dataSet = list(baskets.groupby(level=0).apply(list))
dataSet[:3]

[[['Road-250'], ['Road Bottle Cage']],
 [['Touring-2000'], ['Sport-100']],
 [['Mountain-200'], ['Mountain Bottle Cage'], ['Water Bottle']]]

# Mine the sequential patterns of the order data with a 4% minimum support
seq=aprioriAll(dataSet, min_support=0.04)
seq.head()

	sequence	support
0	((Mountain-200),)	0.116537
1	((Road Tire Tube),)	0.104258
2	((Long-Sleeve Logo Jersey),)	0.077252
3	((Road-350-W),)	0.043707
4	((HL Mountain Tire),)	0.062621

irober

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
数据科学案例14 推荐算法Apriori案例

数据科学案例14 推荐算法Apriori案例16 推荐算法案例16.1 Apriori包编写1、Sort Phase2、Litemset Phase3、Transformation Phase4、Sequence Phase5、Maximal Phase6、aprioriAll16.1 调用现成Apriori包16 推荐算法案例16.1 Apriori包编写seq1 = [ ...
复制链接

扫一扫