# Data import
import os
import pandas as pd
all_ratings=pd.read_csv("ml-100k/u.data", delimiter="\t", header=None, names=["UserID", "MovieID", "Rating", "Datetime"])
all_ratings["Datatime"]=pd.to_datetime(all_ratings["Datetime"], unit='s')
all_ratings=all_ratings[["UserID", "MovieID", "Rating", "Datatime"]]
#all_ratings[:5]
# Add a boolean feature: a movie counts as "favorable" when its rating is above 3
all_ratings["Favorable"]=all_ratings["Rating"]>3
#all_ratings[:15]
# Keep only the ratings from users with UserID below 200 (the training users)
ratings_200=all_ratings[all_ratings["UserID"].isin(range(200))]
#print(ratings_200)
# Extract each user's favorable ratings
favorable_ratings = ratings_200[ratings_200["Favorable"]]
#print(favorable_ratings)
favorable_reviews_by_users=dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])
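# The result maps each UserID to the frozenset of MovieIDs that user rated
# favorably, which makes the subset tests below cheap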
#print(favorable_reviews_by_users)
# Count favorable ratings per movie to find the most popular ones
nums_favorable_by_movie=ratings_200[["MovieID", "Favorable"]].groupby("MovieID").sum()
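# Summing the boolean Favorable column counts, per movie, how many of these
# users rated it favorably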
#nums_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]
#################### Apriori algorithm
# Frequent itemsets, keyed by itemset length
frequent_itemsets = {}
# Minimum support threshold
min_support=50
# Generate the frequent 1-itemsets that seed the algorithm
frequent_itemsets[1]=dict((frozenset((movie_id,)),row["Favorable"]) for movie_id,row in nums_favorable_by_movie.iterrows() if row["Favorable"]>min_support)
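# Each key is a frozenset holding a single MovieID, each value its favorable count
# Optional check (illustrative, not in the original run): how many movies pass the threshold
#print(len(frequent_itemsets[1]))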
#print(frequent_itemsets[1])
# Grow candidate itemsets of the next length and keep the frequent ones
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_l_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_l_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])
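# For every user, each frequent k-itemset contained in that user's favorites is
# extended by one more favored movie; the resulting (k+1)-superset is counted once
# per such (user, itemset) pair, and supersets reaching min_support are kept.
# Because a superset can be reached from several of its k-subsets, its count can
# exceed the number of supporting users.
# A minimal sketch with toy data (illustrative only, not part of the exercise):
#toy_reviews = {1: frozenset((1, 2, 3)), 2: frozenset((1, 2)), 3: frozenset((2, 4))}
#toy_seeds = {frozenset((1,)): 2, frozenset((2,)): 3}
#print(find_frequent_itemsets(toy_reviews, toy_seeds, 2))
# -> {frozenset({1, 2}): 4}: two users favor both movies, each counted via both seeds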
# Run the Apriori loop, building longer itemsets until none are frequent
import sys
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    # Stop as soon as no frequent itemsets of the current length are found
    if len(cur_frequent_itemsets) == 0:
        #print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        #print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
# 1-itemsets cannot form rules with a non-empty premise, so drop them
del frequent_itemsets[1]
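# Optional check (illustrative): number of frequent itemsets found per length
#print({length: len(itemsets) for length, itemsets in frequent_itemsets.items()})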
# Generate candidate association rules from the frequent itemsets
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
candidate_rules[:5]
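# Each candidate rule is a (premise, conclusion) pair: a frozenset of movies and
# one further movie predicted from them, obtained by removing each element of a
# frequent itemset in turn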
# Compute each rule's confidence over the training users
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]) for candidate_rule in candidate_rules}
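# confidence(premise -> conclusion) = correct / (correct + incorrect): the share
# of users who favor every premise movie and also favor the conclusion movie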
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    #print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    #print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    #print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    #print("")
# Load movie titles from ml-100k/u.item
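# u.item is pipe-separated with 24 fields (id, title, two release dates, the IMDb
# URL and 19 binary genre flags); mac-roman handles the non-ASCII characters that
# appear in some titles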
movie_name_data = pd.read_csv("ml-100k/u.item", delimiter="|",header=None, encoding = "mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title
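# Optional check (illustrative): look a single title up by its MovieID
#get_movie_name(1)
# Re-print the top five rules, this time with movie titles instead of IDs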
for index in range(5):
    #print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    #print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    #print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    #print("")
# Evaluate the rules on the held-out test users (everyone not in the training range)
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby("UserID")["MovieID"])
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]) for candidate_rule in rule_confidence}
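# Comparing rule_confidence (training users) with test_confidence (held-out users)
# shows how well each rule generalizes. Note that a rule whose premise never occurs
# in any test user's favorites would divide by zero here.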
for index in range(5):
    #print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    #print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    #print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    #print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    #print("")