《Python数据挖掘入门与实践》一书的第四章演示了如何使用Apriori算法来进行电影推荐。
但是书中这部分算法的计算有一点小问题,下面贴的是修正后的版本。
import pandas as pd
import sys
from collections import defaultdict
from functools import reduce
# Load the ratings file: no header row, '::' between fields, parsed with
# pandas' python engine.
all_ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    delimiter='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'datetime'],
    engine='python',
)
# The raw 'datetime' column holds second-resolution timestamps; convert them.
all_ratings['datetime'] = pd.to_datetime(all_ratings['datetime'], unit='s')
all_ratings.head()
|   | user_id | movie_id | rating | datetime |
|---|---|---|---|---|
| 0 | 1 | 1193 | 5 | 2000-12-31 22:12:40 |
| 1 | 1 | 661 | 3 | 2000-12-31 22:35:09 |
| 2 | 1 | 914 | 3 | 2000-12-31 22:32:48 |
| 3 | 1 | 3408 | 4 | 2000-12-31 22:04:35 |
| 4 | 1 | 2355 | 5 | 2001-01-06 23:38:11 |
# A rating strictly greater than 3 counts as a favorable review.
all_ratings['favorable'] = all_ratings['rating'].gt(3)
all_ratings.head()
|   | user_id | movie_id | rating | datetime | favorable |
|---|---|---|---|---|---|
| 0 | 1 | 1193 | 5 | 2000-12-31 22:12:40 | True |
| 1 | 1 | 661 | 3 | 2000-12-31 22:35:09 | False |
| 2 | 1 | 914 | 3 | 2000-12-31 22:32:48 | False |
| 3 | 1 | 3408 | 4 | 2000-12-31 22:04:35 | True |
| 4 | 1 | 2355 | 5 | 2001-01-06 23:38:11 | True |
# Keep only the rows whose user_id falls in range(201), then narrow those
# down to the favorable reviews.
ratings = all_ratings.loc[all_ratings['user_id'].isin(range(201))]
favorable_ratings = ratings.loc[ratings['favorable']]
# Number of distinct users that have at least one favorable review.
favorable_ratings['user_id'].nunique(dropna=False)
200
# Map each user id to the set of movie ids that user reviewed favorably.
reviews_by_user = {}
for user_id, reviews in favorable_ratings.groupby('user_id')['movie_id']:
    # The assignment belongs to the loop body; it must be indented to run
    # once per user group.
    reviews_by_user[user_id] = set(reviews.values)
# frequent_itemsets maps an itemset size to a dict of {frozenset of movie ids: support}.
frequent_itemsets = {}
# Minimum support an itemset needs in order to be kept as "frequent".
min_support = 50
# Number of favorable reviews per movie (favorable_ratings only holds favorable rows).
tmp = favorable_ratings.groupby('movie_id')['favorable'].count().reset_index()
# Size-1 itemsets: every movie with at least min_support favorable reviews.
frequent_itemsets[1] = {frozenset((row['movie_id'],)): row['favorable'] for index, row in tmp.iterrows() if row['favorable'] >= min_support}
# Number of users that have at least one favorable review.
print(len(reviews_by_user))
200
def find_frequent_itemsets(reviews_by_user, frequent_itemsets, k, min_support=50):
    """Return the frequent itemsets of size ``k`` grown from those of size ``k - 1``.

    Args:
        reviews_by_user: Mapping of user id to the set of movie ids that
            user reviewed favorably.
        frequent_itemsets: Mapping of itemset size to a dict of
            ``frozenset`` itemset -> support. Only
            ``frequent_itemsets[k - 1]`` is read.
        k: Size of the itemsets to produce.
        min_support: Minimum support an itemset needs in order to be kept.

    Returns:
        Dict mapping each kept ``frozenset`` of size ``k`` to its support.
        Supports are floats because they come from a true division by
        ``k``. The dict is empty when ``frequent_itemsets[k - 1]`` is empty.
    """
    previous_itemsets = frequent_itemsets[k - 1]
    if not previous_itemsets:
        # No (k-1)-itemsets to extend, so no k-itemset can be produced.
        return {}

    # Only movies that appear in some frequent (k-1)-itemset are considered
    # as extensions.
    frequent_movies = frozenset().union(*previous_itemsets)

    counts = defaultdict(int)
    for movie_ids in reviews_by_user.values():
        movie_ids_filtered = frequent_movies & movie_ids
        for itemset in previous_itemsets:
            if itemset.issubset(movie_ids_filtered):
                for movie_id in movie_ids_filtered - itemset:
                    counts[itemset | frozenset((movie_id,))] += 1

    # A k-itemset whose k subsets of size k-1 are all frequent is reached
    # once from each of those subsets for every user containing it, so the
    # raw count is k times the number of such users; dividing by k undoes
    # that. Candidates reached from fewer subsets get a smaller quotient and
    # cannot wrongly pass the threshold.
    return {
        itemset: frequency / k
        for itemset, frequency in counts.items()
        if frequency / k >= min_support
    }
# Grow the frequent itemsets one size at a time (sizes 2 through 19),
# stopping at the first size that yields none.
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(reviews_by_user, frequent_itemsets, k, min_support=min_support)
    print('{0} frequent itemsets found in L{1}'.format(len(cur_frequent_itemsets), k))
    if not cur_frequent_itemsets:
        # Nothing of this size is frequent: flush the progress output and
        # stop without recording an empty level.
        sys.stdout.flush()
        break
    frequent_itemsets[k] = cur_frequent_itemsets
# Drop the size-1 itemsets from the final result.
# NOTE(review): presumably because single movies are not useful on their own
# for recommendations -- confirm.
del frequent_itemsets[1]
10 frequent itemsets found in L2
1 frequent itemsets found in L3
0 frequent itemsets found in L4
# Display the remaining frequent itemsets, keyed by itemset size.
frequent_itemsets
{2: {frozenset({1196, 1210}): 58.0,
frozenset({1196, 2571}): 51.0,
frozenset({1196, 1198}): 62.0,
frozenset({1198, 1210}): 50.0,
frozenset({2028, 2858}): 52.0,
frozenset({2858, 3578}): 52.0,
frozenset({260, 1196}): 61.0,
frozenset({260, 1198}): 57.0,
frozenset({260, 1210}): 52.0,
frozenset({2762, 2858}): 51.0},
3: {frozenset({260, 1196, 1198}): 51.0}}