数据科学 案例14 推荐算法Apriori案例

16 推荐算法案例

16.1 Apriori包编写

# Example data: five customer sequences. Each inner list is one transaction
# (itemset); the extra spaces only line the transactions up visually.
# The sort phase is skipped, so the data is presumably already ordered by
# customer and purchase time -- confirm against the data source.
seq1 = [           [30], [90]          ]
seq2 = [ [10, 20], [30], [40, 60, 70]  ]
seq3 = [         [30, 50, 70],         ]
seq4 = [      [30], [40, 70], [90]     ]
seq5 = [              [90],            ]
dataSet = [seq1, seq2, seq3, seq4, seq5]
# Support threshold shared by the litemset and sequence phases below.
min_support=0.25

1、Sort Phase

过程略,直接使用整理过的示例数据

2、Litemset Phase

搜索litemset,直接使用apriori算法,主要的区别在于计算支持度时,一个客户customer购买了同样的项集(itemset)时,支持频度仅计算一次。这是因为在apriori算法中,支持度是对交易(transaction)而言的,但在序列模式的计算中,大项集的支持度是对客户(customer)而言的

import os
os.chdir(r"H:\假期\myGitHub\Machine_Learning\练习\数据科学")
import sys
sys.path.append('./myscripts')

import itertools
import pandas as pd
from apriori import apriori
apriori imported!
def createLs1(dataSet, min_support):
    '''
    Mine the large 1-sequences (litemsets) with the apriori algorithm.
    `Ls` stands for Large Sequence.

    All transactions of all customer sequences are pooled into one flat
    transaction list and passed to `apriori`. The threshold is rescaled
    from a fraction of customers to a fraction of transactions before the
    call, and the returned supports are rescaled the opposite way.

    Returns `(mapping, supportLs1)`: `mapping` gives every litemset a
    consecutive integer id (in the iteration order of apriori's result) and
    `supportLs1` maps the 1-tuple `(id,)` to the rescaled support.

    NOTE(review): the rescaled value is "transactions containing the
    itemset / number of customers"; it equals the per-customer support only
    when no customer has the same itemset in more than one transaction.
    '''
    transactions = list(itertools.chain.from_iterable(dataSet))
    numCustomers = len(dataSet)
    numTransactions = len(transactions)

    # Customer-level threshold -> transaction-level threshold
    itemsetThreshold = min_support * numCustomers / numTransactions
    litemsets = apriori(transactions, min_support=itemsetThreshold)

    mapping = {}
    supportLs1 = {}
    for idx, (itemset, support) in enumerate(litemsets.items()):
        mapping[itemset] = idx
        # Transaction-level support -> customer-level support
        supportLs1[(idx,)] = support * numTransactions / numCustomers
    return mapping, supportLs1
# Test: mine the large 1-sequences of the example data, then display the
# litemset -> integer id mapping.
mapping, supportLs1 = createLs1(dataSet, min_support=min_support)
mapping
flattenSet:[[30], [90], [10, 20], [30], [40, 60, 70], [30, 50, 70], [30], [40, 70], [90], [90]]
flatten_n:10
min_support_new:0.125
litemsets:{frozenset({30}): 0.4, frozenset({90}): 0.3, frozenset({70}): 0.3, frozenset({40}): 0.2, frozenset({40, 70}): 0.2}

{frozenset({30}): 0,
 frozenset({90}): 1,
 frozenset({70}): 2,
 frozenset({40}): 3,
 frozenset({40, 70}): 4}
supportLs1
{(0,): 0.8, (1,): 0.6, (2,): 0.6, (3,): 0.4, (4,): 0.4}
Ls1 = [list(k) for k in supportLs1]
Ls1
[[0], [1], [2], [3], [4]]

3、Transformation Phase

def seqMapping(seq, mapping):
    '''
    Rewrite one customer sequence in terms of litemset ids.

    Each transaction in `seq` is replaced by the list of ids (values of
    `mapping`) whose litemset (keys of `mapping`) is a subset of that
    transaction. Transactions that contain no litemset are dropped.
    Treating litemsets as single integers keeps later comparisons cheap.
    '''
    mapped = []
    for transaction in seq:
        items = set(transaction)
        ids = [idx for litemset, idx in mapping.items() if litemset <= items]
        if ids:
            mapped.append(ids)
    return mapped

def transform(dataSet, mapping):
    '''
    Convert every customer sequence to its litemset-id representation.

    Each sequence goes through `seqMapping`; sequences that end up empty
    (no transaction contains any litemset) are left out of the result.
    '''
    mappedSeqs = (seqMapping(seq, mapping) for seq in dataSet)
    return [mapped for mapped in mappedSeqs if mapped]
# - Test: transform the example data set and print each transformed
#   customer sequence on its own line.
transformDS  = transform(dataSet, mapping)
for seq in transformDS :
    print(seq)
[[0], [1]]
[[0], [2, 3, 4]]
[[0, 2]]
[[0], [2, 3, 4], [1]]
[[1]]

4、Sequence Phase

# 产生候选序列
def seqGen(seqA, seqB):
    '''
    Join two large k-sequences into candidate (k+1)-sequences.

    When the two sequences agree on everything except their last element,
    return both orderings of the two last elements appended to the shared
    prefix: `[seqA + [lastB], seqB + [lastA]]`. Otherwise return None.
    The inputs are not modified.
    '''
    if seqA[:-1] != seqB[:-1]:
        # Prefixes differ: this pair cannot be joined.
        return None
    return [seqA + [seqB[-1]], seqB + [seqA[-1]]]

def CsGen(Ls):
    '''
    Generate all candidate (k+1)-sequences from the large k-sequences `Ls`.

    Every unordered pair of sequences in `Ls` is joined with `seqGen`.
    A candidate survives pruning only if the sequence obtained by dropping
    its first element is itself in `Ls`.

    Sequence elements must be hashable (they are integer litemset ids in
    this pipeline) because the prune step uses a set lookup.
    '''
    # Build the lookup once: a set of tuples replaces a linear scan of Ls
    # for every candidate.
    known = {tuple(seq) for seq in Ls}

    Cs = []
    for seqA, seqB in itertools.combinations(Ls, 2):
        newSeqs = seqGen(seqA, seqB)
        if newSeqs is not None:
            Cs.extend(newSeqs)

    # Pruning
    return [seq for seq in Cs if tuple(seq[1:]) in known]
# - Test: five large 3-sequences; CsGen joins the pairs that share their
#   first two elements and then prunes the resulting candidates.

testLs = [
    [1, 2, 3], 
    [1, 2, 4],
    [1, 3, 4],
    [1, 3, 5],
    [2, 3, 4]]
CsGen(testLs)
[1, 2, 3]--[1, 2, 4]
[[1, 2, 3, 4], [1, 2, 4, 3]]
CS:[[1, 2, 3, 4], [1, 2, 4, 3]]
[1, 2, 3]--[1, 3, 4]
None
[1, 2, 3]--[1, 3, 5]
None
[1, 2, 3]--[2, 3, 4]
None
[1, 2, 4]--[1, 3, 4]
None
[1, 2, 4]--[1, 3, 5]
None
[1, 2, 4]--[2, 3, 4]
None
[1, 3, 4]--[1, 3, 5]
[[1, 3, 4, 5], [1, 3, 5, 4]]
CS:[[1, 2, 3, 4], [1, 2, 4, 3], [1, 3, 4, 5], [1, 3, 5, 4]]
[1, 3, 4]--[2, 3, 4]
None
[1, 3, 5]--[2, 3, 4]
None

[[1, 2, 3, 4]]
# 测试
for seqA, seqB in itertools.combinations(testLs, 2):
#     print(seqA)
    print(str(seqA)+'--'+str(seqB))
[1, 2, 3]--[1, 2, 4]
[1, 2, 3]--[1, 3, 4]
[1, 2, 3]--[1, 3, 5]
[1, 2, 3]--[2, 3, 4]
[1, 2, 4]--[1, 3, 4]
[1, 2, 4]--[1, 3, 5]
[1, 2, 4]--[2, 3, 4]
[1, 3, 4]--[1, 3, 5]
[1, 3, 4]--[2, 3, 4]
[1, 3, 5]--[2, 3, 4]
seq = [3, 4, 8]
seq[1:]
[4, 8]
# 子序列判断

def isSubSeq(seq, cusSeq):
    '''
    Check if a sequence is contained in a customer sequence.

    `seq` is a list of litemset ids; `cusSeq` is a list of transactions,
    each a collection of ids. `seq` is contained when its ids can be matched,
    in order, to strictly increasing positions in `cusSeq` such that each
    id is a member of the transaction it is matched to.

    Returns True or False. The empty sequence is contained in every
    customer sequence (previously this case fell through and returned None).
    '''
    if len(seq) > len(cusSeq):
        # Each id needs its own transaction, so a longer pattern cannot fit.
        return False

    # A single shared iterator makes every search resume right after the
    # transaction that matched the previous id (greedy earliest match),
    # without recursing or copying list slices.
    remaining = iter(cusSeq)
    return all(any(item in transaction for transaction in remaining)
               for item in seq)
# - Test: 3 is in the 2nd transaction, 4 in the 4th and 8 in the 5th,
#   so the sequence is contained in the customer sequence.

seq = [3, 4, 8]
cusSeq = [[7], [3, 8], [9], [4, 5, 6], [8]]
isSubSeq(seq, cusSeq)
True
# 产生频繁k序列,此步骤需要迭代执行

def calcSupport(transformDS, Cs, min_support):
    '''
    Keep the candidate sequences whose support reaches `min_support`.

    The support of a candidate is the fraction of customer sequences in
    `transformDS` that contain it according to `isSubSeq`.

    Return: 1. a list of large-sequences
            2. a dictionary of `large-sequence: support` pairs
               (keys are tuples)
    '''
    numCustomers = len(transformDS)
    supportLsk = {}
    for candidate in Cs:
        hits = 0
        for cusSeq in transformDS:
            hits += isSubSeq(candidate, cusSeq)
        ratio = hits / numCustomers
        if ratio >= min_support:
            supportLsk[tuple(candidate)] = ratio
    return [list(key) for key in supportLsk], supportLsk

# - Test: build the candidate 2-sequences from the large 1-sequences and
#   keep those that reach min_support.

Cs2 = CsGen(Ls1) # Ls1 is [[0], [1], [2], [3], [4]]
Ls2, supportLs2 = calcSupport(transformDS, Cs2, min_support)
# print(Ls2)
# print(supportLs2)

    


```python
print(Ls2)
print(supportLs2)
[[0, 1], [0, 2], [0, 3], [0, 4]]
{(0, 1): 0.4, (0, 2): 0.4, (0, 3): 0.4, (0, 4): 0.4}

5、Maximal Phase

  • 一个更快速搜寻子序列的算法可参考:
    R. Agrawal and R. Srikant. Mining sequential patterns. Research Report RJ 9910, IBM Almaden Research Center, San Jose, California, October 1994.

需要将大序列中的项集转换回原始的购物篮再进行序列最大化

# Invert the litemset -> id mapping so ids can be turned back into itemsets.
tr_mapping = {v: k for k, v in mapping.items()}
# Pool the large 1- and 2-sequences, then replace every id by its itemset.
Ls = Ls1 + Ls2
Ls = [[tr_mapping[k] for k in seq] for  seq in Ls ]

# Merge both support tables and re-key them by tuples of itemsets.
supportLs = {}
supportLs.update(supportLs1); supportLs.update(supportLs2)
supportLs = {tuple([tr_mapping[i] for i in k]):v for k, v in 
             supportLs.items()}

print(supportLs)
{(frozenset({0}),): 0.8, (frozenset({1}),): 0.6, (frozenset({2}),): 0.6, (frozenset({3}),): 0.4, (frozenset({4}),): 0.4, (frozenset({3, 4}),): 0.4, (frozenset({2, 3}),): 0.4, (frozenset({2, 4}),): 0.4, (frozenset({2, 3, 4}),): 0.4, (frozenset({0}), frozenset({1})): 0.4, (frozenset({0}), frozenset({2})): 0.4, (frozenset({0}), frozenset({3})): 0.4, (frozenset({0}), frozenset({4})): 0.4}
# 序列最大化:剔除那些是其他大序列的真子序列的序列,
# 只保留不被任何其他序列包含的序列(序列本身要保留)。
# 该步骤与Sequence阶段中判断子序列的方法(isSubSeq)类似,
# 区别在于已经将其中的项集映射回来了,因此稍作修改


def isSubSeq2(seq, cusSeq):
    '''
    Check if a sequence of itemsets is contained in another sequence.

    Both arguments are lists of set-like itemsets (frozensets in this
    pipeline). `seq` is contained in `cusSeq` when its itemsets can be
    matched, in order, to strictly increasing positions in `cusSeq` such
    that each itemset is a subset of the itemset it is matched to.

    One element of `cusSeq` can match at most one element of `seq`, so
    <(40), (70)> is NOT contained in <(30), (40, 70)>. The earlier version
    restarted the search at the matching position and accepted that case.

    Returns True or False. The empty sequence is contained in every
    sequence (previously this case returned None).
    '''
    if len(seq) > len(cusSeq):
        # Each itemset needs its own position, so a longer pattern cannot fit.
        return False

    # A shared iterator resumes every search strictly after the element
    # that matched the previous itemset.
    remaining = iter(cusSeq)
    return all(any(itemset.issubset(candidate) for candidate in remaining)
               for itemset in seq)

def notProperSubSeq(seq, cusSeq):
    '''
    Return True if `seq` is not a proper sub sequence of `cusSeq`.

    That is the case when the two sequences are equal, or when `isSubSeq2`
    reports that `seq` is not contained in `cusSeq`.
    '''
    return seq == cusSeq or not isSubSeq2(seq, cusSeq)

# 将备选序列中的最大化的序列保留下来

def maxLs(Ls, supportLs):
    '''
    Keep only the maximal sequences of `Ls`.

    Walking `Ls` from its last entry to its first, every sequence that is
    still retained is used as a reference, and all retained sequences that
    are proper sub sequences of it (per `notProperSubSeq`) are discarded.
    The first entry is now used as a reference as well; it was previously
    skipped, so a sequence contained only in `Ls[0]` was never removed.

    Parameters:
        Ls        -- list of sequences (lists of itemsets)
        supportLs -- dict mapping `tuple(sequence)` to its support

    Returns `(maximal sequences, their support dict)`. The inputs are not
    modified.
    '''
    kept = list(Ls)
    for reference in reversed(Ls):
        if len(kept) <= 1:
            # Nothing left that could be contained in something else.
            break
        if reference in kept:
            kept = [seq for seq in kept if notProperSubSeq(seq, reference)]

    supportLs = {tuple(seq): supportLs[tuple(seq)] for seq in kept}
    return kept, supportLs
Ls, supportLs = maxLs(Ls, supportLs)
supportLs
{(frozenset({0}), frozenset({1})): 0.4,
 (frozenset({0}), frozenset({2})): 0.4,
 (frozenset({0}), frozenset({3})): 0.4,
 (frozenset({0}), frozenset({4})): 0.4}

6、aprioriAll

# ### aprioriAll


def aprioriAll(dataSet, min_support=0.4):
    '''
    Run the AprioriAll algorithm to mine sequential patterns.

    `dataSet` is a list of customer sequences, each a list of transactions
    (lists of items). The result is a pandas DataFrame with the columns
    'sequence' (a tuple of itemsets) and 'support', one row per maximal
    large sequence.

    Refer to:
    Agrawal,R.,Srikant,R.,Institute of Electric and Electronic
    Engineer et al. Mining sequential patterns[C]. Proceedings
    of the Eleventh International Conference on Data Engineering,
    Washington DC, USA: IEEE Computer Society,1995:3-14.
    '''
    # Litemset phase: large 1-sequences and the litemset -> id mapping
    mapping, supportLs1 = createLs1(dataSet, min_support)

    # Transformation phase: customer sequences expressed in litemset ids
    transformDS = transform(dataSet, mapping)

    # Sequence phase: grow level by level until a level has fewer than two
    # sequences or yields no large sequence
    levels = [[list(key) for key in supportLs1]]
    supports = dict(supportLs1)
    while len(levels[-1]) > 1:
        candidates = CsGen(levels[-1])
        large, largeSupport = calcSupport(transformDS, candidates,
                                          min_support)
        if not large:
            break
        levels.append(large)
        supports.update(largeSupport)

    # Translate litemset ids back to the original itemsets
    idToItemset = {idx: itemset for itemset, idx in mapping.items()}
    Ls = [[idToItemset[idx] for idx in seq]
          for seq in itertools.chain.from_iterable(levels)]
    supportLs = {tuple(idToItemset[idx] for idx in key): value
                 for key, value in supports.items()}

    # Maximal phase
    Ls, supportLs = maxLs(Ls, supportLs)

    rows = list(supportLs.items())
    return pd.DataFrame(rows, columns=['sequence', 'support'])
aprioriAll(dataSet, min_support=0.25)
sequencesupport
0((30), (90))0.4
1((30), (40, 70))0.4

16.2 调用现成aprioriAll包

# Second example: five customer sequences, one per row; each inner list is
# one transaction.
testSet = [
    [[1, 5], [2], [3], [4] ],
    [[1], [3], [4], [3, 5] ],
    [[1], [2], [3], [4]    ],
    [[1], [3], [5]         ],
    [[4], [5]              ]
    ]
import sys
sys.path.append('E:/myscripts')
from aprioriAll import aprioriAll
aprioriAll imported!
aprioriAll(testSet, min_support=0.4)
sequencesupport
0((4), (5))0.4
1((1), (3), (5))0.4
2((1), (2), (3), (4))0.4
import os, sys
import itertools
import pandas as pd

sys.path.append('./myscripts')

from aprioriAll import aprioriAll

transactions = pd.read_csv(r'.\data\Transactions.csv')
def aggFunc(*args):
    '''
    Flatten the given iterables into one list, keeping their order.
    '''
    return [item for group in args for item in group]

# Group the 'Model' column by (OrderNumber, LineNumber) and collect the
# values of each group into a list with aggFunc, then preview the result.
baskets = transactions['Model'].groupby([transactions['OrderNumber'], transactions['LineNumber']]).apply(aggFunc)
baskets.head()
OrderNumber  LineNumber
SO51176      1                     [Road-250]
             2             [Road Bottle Cage]
SO51177      1                 [Touring-2000]
             2                    [Sport-100]
SO51178      1                 [Mountain-200]
Name: Model, dtype: object
# Group the baskets by the first index level (OrderNumber) so that each
# order becomes one sequence of baskets, then preview the first three.
dataSet = list(baskets.groupby(level=0).apply(list))
dataSet[:3]
[[['Road-250'], ['Road Bottle Cage']],
 [['Touring-2000'], ['Sport-100']],
 [['Mountain-200'], ['Mountain Bottle Cage'], ['Water Bottle']]]
seq=aprioriAll(dataSet, min_support=0.04)
seq.head()
sequencesupport
0((Mountain-200),)0.116537
1((Road Tire Tube),)0.104258
2((Long-Sleeve Logo Jersey),)0.077252
3((Road-350-W),)0.043707
4((HL Mountain Tire),)0.062621
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

irober

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值