电影推荐_亲和性分析_规则提取(数据挖掘入门与实践-实验6)

#数据导入
import os
import pandas as pd

# Load the tab-separated MovieLens ratings file; it has no header row, so
# column names are supplied explicitly.
all_ratings = pd.read_csv(
    "ml-100k/u.data",
    delimiter="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
# Convert the epoch-seconds column into timestamps and keep only the
# converted column.
# NOTE: the column name "Datatime" (sic) is kept exactly as in the original
# script so the resulting frame has the same schema.
all_ratings = all_ratings.assign(
    Datatime=pd.to_datetime(all_ratings["Datetime"], unit="s")
)[["UserID", "MovieID", "Rating", "Datatime"]]
#all_ratings[:5]

# Flag a rating as "favorable" when it is strictly above 3.
all_ratings["Favorable"] = all_ratings["Rating"] > 3

# Training subset: users whose UserID is in range(200), i.e. 0..199.
ratings_200 = all_ratings[all_ratings["UserID"].isin(range(200))]

# Keep only the favorable rows of the training subset.
favorable_ratings = ratings_200[ratings_200["Favorable"]]

# Map every user to the frozenset of movie IDs they rated favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}

# Number of favorable ratings each movie received in the training subset
# (summing the boolean "Favorable" column counts the True values).
nums_favorable_by_movie = (
    ratings_200[["MovieID", "Favorable"]].groupby("MovieID").sum()
)

####################Apriori算法

# Frequent itemsets discovered so far, keyed by itemset length.
frequent_itemsets = {}

# Minimum number of favorable reviews an itemset needs to count as frequent.
min_support = 50

# Seed the search with single-movie itemsets. The threshold is inclusive
# (>=) so that it agrees with the filter applied to longer itemsets in
# find_frequent_itemsets; a strict ">" here would drop movies with exactly
# min_support favorable reviews and prevent them from ever being extended.
frequent_itemsets[1] = {
    frozenset((movie_id,)): num_favorable
    for movie_id, num_favorable in nums_favorable_by_movie["Favorable"].items()
    if num_favorable >= min_support
}

#备选集检测
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_l_itemsets, min_support):
    """Extend frequent itemsets by one item and keep the frequent supersets.

    Args:
        favorable_reviews_by_users: mapping of user -> frozenset of the items
            that user reviewed favorably.
        k_l_itemsets: iterable of frozensets, the frequent itemsets of the
            previous length (only the itemsets themselves are read, so a dict
            keyed by itemset works).
        min_support: minimum number of users that must contain a superset
            for it to be returned (inclusive).

    Returns:
        dict mapping each frequent superset (frozenset) to the number of
        users whose favorable reviews contain it.
    """
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        # Collect the supersets this user supports in a set first: the same
        # superset is reachable from several of its sub-itemsets (e.g. {A, B}
        # from both {A} and {B}), and counting it once per sub-itemset would
        # inflate the support by up to a factor of the itemset length.
        supported_supersets = set()
        for itemset in k_l_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    supported_supersets.add(
                        itemset | frozenset((other_reviewed_movie,))
                    )
        # Each user contributes at most one to the support of a superset.
        for current_superset in supported_supersets:
            counts[current_superset] += 1

    return {
        itemset: frequency
        for itemset, frequency in counts.items()
        if frequency >= min_support
    }

#循环创建,运行Apriori
import sys
# Grow the frequent itemsets one movie at a time, starting from the
# single-movie itemsets, until a length yields no frequent itemsets
# (or the upper bound of the range is reached).
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support
    )
    frequent_itemsets[k] = cur_frequent_itemsets

    if not cur_frequent_itemsets:
        # Progress output is disabled. The whole statement is commented out
        # here; previously its continuation line was left live, which made
        # the file a SyntaxError.
        # print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    # print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
    sys.stdout.flush()

# Single-movie itemsets cannot form a premise -> conclusion rule, so drop them.
del frequent_itemsets[1]

# Rule generation: for every frequent itemset, each member in turn becomes
# the conclusion and the remaining members form the premise.
candidate_rules = [
    (itemset - {conclusion}, conclusion)
    for itemset_counts in frequent_itemsets.values()
    for itemset in itemset_counts
    for conclusion in itemset
]
# Notebook-style preview of the first rules; has no effect when run as a script.
candidate_rules[:5]

# Confidence of each candidate rule on the training users: among users whose
# favorable reviews contain the premise, the fraction that also contain the
# conclusion.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            # The rule does not apply to this user.
            continue
        if conclusion in reviews:
            correct_counts[candidate_rule] += 1
        else:
            incorrect_counts[candidate_rule] += 1

rule_confidence = {}
for candidate_rule in candidate_rules:
    num_correct = correct_counts[candidate_rule]
    num_applicable = num_correct + incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = num_correct / float(num_applicable)

from operator import itemgetter 
# Rules ordered from highest to lowest training confidence.
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

# Walk the (up to) five most confident rules. Slicing instead of indexing
# with range(5) avoids an IndexError when fewer than five rules exist.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    # Output is disabled:
    # print("Rule #{0}".format(index + 1))
    # print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    # print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    # print("")

# Load the movie metadata file ml-100k/u.item: pipe-separated, no header row,
# decoded as mac-roman.
movie_name_data = pd.read_csv(
    "ml-100k/u.item",
    delimiter="|",
    header=None,
    encoding="mac-roman",
)
# Name the columns: identifying fields first, then one column per genre.
movie_name_data.columns = [
    "MovieID",
    "Title",
    "Release Date",
    "Video Release",
    "IMDB",
    "<UNK>",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

def get_movie_name(movie_id):
    """Return the title of the first row in movie_name_data matching movie_id.

    Raises IndexError when no row has the given MovieID.
    """
    is_requested_movie = movie_name_data["MovieID"] == movie_id
    matching_titles = movie_name_data.loc[is_requested_movie, "Title"]
    return matching_titles.values[0]

# Resolve movie titles for the (up to) five most confident rules. Slicing
# instead of indexing with range(5) avoids an IndexError when fewer than
# five rules exist.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    # Output is disabled:
    # print("Rule #{0}".format(index + 1))
    # print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    # print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    # print("")

# Held-out evaluation data: every user NOT in range(200), i.e. the complement
# of the training subset.
test_dataset = all_ratings[~all_ratings["UserID"].isin(range(200))]

# Favorable ratings only, then one frozenset of movie IDs per test user.
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable.groupby("UserID")["MovieID"]
}

# Re-evaluate every candidate rule on the held-out users: count how often the
# conclusion holds among users whose favorable reviews contain the premise.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# Confidence per rule on the test users. A rule whose premise matched no test
# user has no defined confidence; it is left out instead of dividing by zero,
# so lookups should use test_confidence.get(rule, default).
test_confidence = {}
for candidate_rule in rule_confidence:
    num_correct = correct_counts[candidate_rule]
    num_applicable = num_correct + incorrect_counts[candidate_rule]
    if num_applicable:
        test_confidence[candidate_rule] = num_correct / float(num_applicable)

# Compare training and test confidence for the (up to) five rules with the
# highest training confidence. Slicing instead of indexing with range(5)
# avoids an IndexError when fewer than five rules exist.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    # Output is disabled:
    # print("Rule #{0}".format(index + 1))
    # print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    # print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    # print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    # print("")



  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值