Python数据挖掘入门与实践-Apriori算法勘误


###############################################################################
#_____________________________________________________________________________#
#|      ______      ___    __________   ___      __                          |#
#|     /  __  \    /   |  /___   ___/  /   |    / /                          |#
#|    /  /  \  |  / /| |      /*      / /| |   / /      DataInsights         |#
#|   /  /   /  / / /_| |      `/     / / | |  / /                            |#
#|  /  /   /  / / ___  |     / `    / /  | | / /      751718003@qq.com       |#
#| /  /___/  / / /   | | ___ `/__  / /   | |/ /                              |#
#|/_________/ /_/    |_|/________//_/    |___/    https://data-insights.cn/  |#
#|___________________________________________________________________________|#
###############################################################################

《Python数据挖掘入门与实践》一书的第四章演示了如何使用Apriori算法进行电影推荐。
但是书中给出的算法在计算上有一点小问题,下面贴出的是修正后的版本。

import pandas as pd
import sys
from collections import defaultdict
from functools import reduce
# Read the ratings file: no header row, fields separated by '::'
# (a multi-character separator, hence the python parsing engine).
all_ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    delimiter='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'datetime'],
    engine='python',
)
# Interpret the raw 'datetime' values as a count of seconds.
all_ratings['datetime'] = pd.to_datetime(all_ratings['datetime'], unit='s')
all_ratings.head()
   user_id  movie_id  rating            datetime
0        1      1193       5 2000-12-31 22:12:40
1        1       661       3 2000-12-31 22:35:09
2        1       914       3 2000-12-31 22:32:48
3        1      3408       4 2000-12-31 22:04:35
4        1      2355       5 2001-01-06 23:38:11
# A rating counts as favourable when it is strictly greater than 3.
all_ratings['favorable'] = all_ratings['rating'].gt(3)
all_ratings.head()
   user_id  movie_id  rating            datetime  favorable
0        1      1193       5 2000-12-31 22:12:40       True
1        1       661       3 2000-12-31 22:35:09      False
2        1       914       3 2000-12-31 22:32:48      False
3        1      3408       4 2000-12-31 22:04:35       True
4        1      2355       5 2001-01-06 23:38:11       True
# Keep only the rows whose user_id falls in range(201), then the favourable ones among them.
ratings = all_ratings.loc[all_ratings['user_id'].isin(range(201))]
favorable_ratings = ratings.loc[ratings['favorable']]
# Number of distinct users that have at least one favourable rating.
len(favorable_ratings['user_id'].unique())
200
# Map every user to the set of movie ids they rated favourably.
reviews_by_user = {
    user_id: set(reviews.values)
    for user_id, reviews in favorable_ratings.groupby('user_id')['movie_id']
}

# Build the level-1 (single movie) frequent itemsets.
min_support = 50
# One row per movie: its id and the number of favourable ratings it received.
tmp = favorable_ratings.groupby('movie_id')['favorable'].count().reset_index()
frequent_itemsets = {
    1: {
        frozenset((row['movie_id'],)): row['favorable']
        for _, row in tmp.iterrows()
        if row['favorable'] >= min_support
    },
}

print(len(reviews_by_user))
200
def find_frequent_itemsets(reviews_by_user, frequent_itemsets, k, min_support=50):
    """Derive the frequent itemsets of size ``k`` from those of size ``k - 1``.

    Args:
        reviews_by_user: Mapping of user id to the set of movie ids that the
            user rated favourably. The values must be sets, because they are
            intersected with a frozenset.
        frequent_itemsets: Mapping of itemset size to a dict of
            ``{frozenset_of_movie_ids: support}``. Only level ``k - 1`` is read.
        k: Size of the itemsets to generate.
        min_support: Minimum number of users that must like every movie of an
            itemset for that itemset to be kept.

    Returns:
        A dict mapping each frequent ``k``-itemset (a frozenset) to its
        support as a float. The dict is empty when level ``k - 1`` is empty.

    Raises:
        KeyError: If ``frequent_itemsets`` has no entry for ``k - 1``.
    """
    previous_level = frequent_itemsets[k - 1]
    if not previous_level:
        # Nothing to extend, so no larger itemset can exist.
        return {}

    # Every movie that appears in at least one (k-1)-itemset.
    frequent_movies = frozenset().union(*previous_level)

    counts = defaultdict(int)
    for movie_ids in reviews_by_user.values():
        # Drop the movies that are not part of any (k-1)-itemset.
        movie_ids_filtered = frequent_movies & movie_ids
        if len(movie_ids_filtered) < k:
            # This user cannot contain any itemset of size k.
            continue

        for itemset in previous_level:
            # Only extend itemsets whose movies were all liked by this user.
            if not itemset.issubset(movie_ids_filtered):
                continue
            # Grow the itemset by each other frequent movie the user liked.
            for movie_id in movie_ids_filtered - itemset:
                counts[itemset | frozenset((movie_id,))] += 1

    # A k-itemset is generated once per user from each of its (k-1)-subsets
    # that is frequent. A frequent k-itemset has all k of its (k-1)-subsets
    # frequent (Apriori property), so its raw count is exactly k times its
    # support. An itemset with fewer frequent subsets is under-counted, which
    # can only push it further below min_support.
    itemsets = {}
    for itemset, frequency in counts.items():
        support = frequency / k
        if support >= min_support:
            itemsets[itemset] = support
    return itemsets
# Grow the itemsets level by level until a level comes back empty.
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        reviews_by_user, frequent_itemsets, k, min_support=min_support
    )
    print('{0} frequent itemsets found in L{1}'.format(len(cur_frequent_itemsets), k))
    if not cur_frequent_itemsets:
        # An empty level is never stored; stop searching here.
        sys.stdout.flush()
        break
    frequent_itemsets[k] = cur_frequent_itemsets
# Discard the single-movie level once the search is over.
del frequent_itemsets[1]
10 frequent itemsets found in L2
1 frequent itemsets found in L3
0 frequent itemsets found in L4
# Show the remaining frequent itemsets, keyed by itemset size.
frequent_itemsets
{2: {frozenset({1196, 1210}): 58.0,
  frozenset({1196, 2571}): 51.0,
  frozenset({1196, 1198}): 62.0,
  frozenset({1198, 1210}): 50.0,
  frozenset({2028, 2858}): 52.0,
  frozenset({2858, 3578}): 52.0,
  frozenset({260, 1196}): 61.0,
  frozenset({260, 1198}): 57.0,
  frozenset({260, 1210}): 52.0,
  frozenset({2762, 2858}): 51.0},
 3: {frozenset({260, 1196, 1198}): 51.0}}
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

量化祛魅师

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值