python_movie_apriori

#! /usr/bin/env python
#coding=utf-8
import pandas as pd
root="F:/Data/exe/ml-100k/"
all_ratings=pd.read_csv(root+"u.data",delimiter="\t",header=None)#the raw file is tab-separated and has no header row
all_ratings.columns=["UserId","MovieId","Rating","DateTime"]#Rating: the user's score for the movie, out of 5
all_ratings["DateTime"]=pd.to_datetime(all_ratings["DateTime"],unit="s")#parse the Unix timestamps
all_ratings["Favorable"]=all_ratings["Rating"]>3#Boolean feature: the user likes this movie (rating above 3)
ratings=all_ratings[all_ratings["UserId"].isin(range(200))]
#data from the 200 lowest user ids (not the first 200 rows); just over ten thousand rows in total
#goal: rules of the form "if a user likes these movies, they will also like that movie"
favorable_ratings=ratings[ratings["Favorable"]]#rows where one of these users rated a movie favourably
#group by user; frozenset(v.values) stores the set of movies each user likes
favorable_reviews_by_users=dict((k,frozenset(v.values))
                                for k,v in favorable_ratings.groupby("UserId")["MovieId"])
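# favorable_reviews_by_users maps each user id to the frozenset of movie ids that
# user rated favourably; a quick, non-printing check of that shape:
assert all(isinstance(movies,frozenset) for movies in favorable_reviews_by_users.values())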
#group by movie and sum the Favorable column, i.e. count the favourable ratings per movie
num_favorable_by_movie=ratings[["MovieId","Favorable"]].groupby("MovieId").sum()
#peek at the five most liked movies
#print(num_favorable_by_movie.sort_values("Favorable",ascending=False)[:5])
#%%%%%%%%%%%%%%%%%%%%%%%%Apriori%%%%%%%%%%%%%%%
frequent_itemsets={}
min_support=50
#movie ids are stored as frozensets: they work as sets and, unlike ordinary sets, are hashable and can be used as dictionary keys
frequent_itemsets[1]=dict((frozenset((movie_id,)),row["Favorable"])
                          for movie_id,row in num_favorable_by_movie.iterrows()
                          if row["Favorable"]>min_support)
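# Every length-1 frequent itemset should be a single-movie frozenset whose count
# clears the support threshold; a quick, non-printing sanity check:
assert all(len(itemset)==1 and count>min_support for itemset,count in frequent_itemsets[1].items())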

print(frequent_itemsets[1])
from collections import defaultdict
#function: given the frequent itemsets of length k-1, build their supersets and keep the ones that are frequent

def find_frequent_itemsets(favorable_reviews_by_users,k_1_itemsets,min_support):
    counts=defaultdict(int)#counts default to 0
    for user,reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:#iterate over the itemsets found in the previous pass
            if itemset.issubset(reviews):
                #the itemset is a subset of this user's favourites, i.e. the user liked every movie in it
                for other_reviewed_movie in reviews-itemset:
                    #for every other movie the user liked, form the superset and bump its count
                    current_superset=itemset|frozenset((other_reviewed_movie,))
                    counts[current_superset]+=1
    #keep only the supersets that reach the minimum support
    return dict([(itemset,frequency) for itemset,frequency in counts.items() if frequency>=min_support])
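# A quick, non-printing sanity check on made-up data (user ids 1-3 and movie ids
# 10/20/30 below are hypothetical, min_support here is 2). Note that this
# implementation counts a superset once per (user, k-1 itemset) pair, so a pair
# liked by two users can end up with a raw count above 2.
_toy_reviews={1:frozenset((10,20,30)),2:frozenset((10,20)),3:frozenset((20,))}
_toy_k1={frozenset((10,)):2,frozenset((20,)):3}
_toy_k2=find_frequent_itemsets(_toy_reviews,_toy_k1,2)
assert frozenset((10,20)) in _toy_k2 and frozenset((10,30)) not in _toy_k2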
import sys
for k in range(2,20):
    cur_frequent_itemsets=\
        find_frequent_itemsets(favorable_reviews_by_users,frequent_itemsets[k-1],min_support)
    frequent_itemsets[k]=cur_frequent_itemsets
    #no new frequent itemsets were found
    if len(cur_frequent_itemsets)==0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    #new frequent itemsets were found
    else:
        print("Found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets),k))
        sys.stdout.flush()
del frequent_itemsets[1]#length-1 itemsets cannot yield rules, since a rule needs both a premise and a conclusion
#association rule: if a user likes every movie in the premise, they will also like the movie in the conclusion
candidate_rules=[]
for itemset_length,itemset_counts in frequent_itemsets.items():
    #frequent_itemsets is a dict: itemset_length is the key (itemset size), itemset_counts the value (the itemsets of that size and their counts)
    for itemset in itemset_counts.keys():
        #itemset_counts maps each frozenset of movie ids to its support count; itemset is one such frozenset
        for conclusion in itemset:
            premise=itemset-set((conclusion,))#the premise may contain several movies; the conclusion is a single movie
            candidate_rules.append((premise,conclusion))
print(candidate_rules[:5])
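# Each itemset of length k contributes k candidate rules (one per possible
# conclusion), so the totals should agree; a quick consistency check:
assert len(candidate_rules)==sum(len(itemset) for counts in frequent_itemsets.values() for itemset in counts)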
#compute the confidence of each candidate rule on the training users
correct_counts=defaultdict(int)#defaultdict(int): any missing key starts with the value 0
incorrect_counts=defaultdict(int)
for user,reviews in favorable_reviews_by_users.items():
    #iterate over every user and the set of movies they like
    for candidate_rule in candidate_rules:
        premise,conclusion=candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule]+=1#the premise and the conclusion both hold for this user
            else:
                incorrect_counts[candidate_rule]+=1#the premise holds but the conclusion does not
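# By the Apriori property every premise is itself frequent, so each rule applies to
# at least one training user and the division in the next step never hits a zero
# denominator; a quick, non-printing check of that claim:
assert all(correct_counts[rule]+incorrect_counts[rule]>0 for rule in candidate_rules)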
#confidence of each rule = correct / (correct + incorrect), stored in a dict keyed by rule
rule_confidence={candidate_rule:correct_counts[candidate_rule]/
                                float(correct_counts[candidate_rule]+incorrect_counts[candidate_rule])
                                for candidate_rule in candidate_rules}

root="F:/Data/exe/ml-100k/"
movie_name_data=pd.read_csv(root+"u.item",delimiter="|",header=None,encoding="mac-roman")
movie_name_data.columns=["Movie_Id","Title","Release Date",
                         "Video Release","IMDB","<UNK>",
                         "Action","Adventure","Animation",
                         "Children","Comedy","Crim",
                         "Documentary","Drama","Fantasy",
                         "Film-Noir","Horror","Musical",
                         "Mystery","Romance","Sci-Fi",
                         "Thriller","War","Western"]
#look up a movie's title from its id
def get_movie_name(movie_id):
    title_object=movie_name_data[movie_name_data["Movie_Id"]==movie_id]["Title"]
    title=title_object.values[0]#title_object is a pandas Series; we only want its first value, the title
    return title
#print(get_movie_name(1))
#take the five rules with the highest confidence
from operator import itemgetter
sorted_confidence=sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{}".format(index+1))
    (premise,conclusion)=sorted_confidence[index][0]
    premise_names=",".join(get_movie_name(idx)for idx in premise)
    conclusion_name=get_movie_name(conclusion)
    print("Rule:If a person recommends {0} they will also recommend {1}".format(premise_names,conclusion_name))
    print("-Confidence: {}".format(rule_confidence[(premise,conclusion)]))
    print("")
#the first 200 user ids were used for training; evaluate on the remaining users via the movies each of them likes
test_dataset=all_ratings[~all_ratings["UserId"].isin(range(200))]#ratings from the remaining users
test_favorable=test_dataset[test_dataset["Favorable"]]#keep only their favourable ratings
test_favorable_by_users=dict((k,frozenset(v.values))
                             for k,v in test_favorable.groupby("UserId")["MovieId"])
                            #map each test user's id to the frozenset of movies they like
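# The split is by user id (ids below 200 for training, the rest for testing), so
# the two user sets should be disjoint; a quick non-printing check:
assert not (set(favorable_reviews_by_users) & set(test_favorable_by_users))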
#count how often each candidate rule holds on the test set
correct_counts=defaultdict(int)
incorrect_counts=defaultdict(int)
for user,reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise,conclusion=candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule]+=1
            else:
                incorrect_counts[candidate_rule]+=1
#confidence of each rule on the test set (this assumes every premise occurs among at least one test user's favourites; otherwise the denominator below would be zero)
test_confidence={candidate_rule:
                correct_counts[candidate_rule]
                /float(correct_counts[candidate_rule]+incorrect_counts[candidate_rule])
                 for candidate_rule in candidate_rules}
for index in range(5):
    print("Rule {}".format(index+1))
    (premise,conclusion)=sorted_confidence[index][0]
    premise_names=",".join(get_movie_name(idx)for idx in premise)
    conclusion_name=get_movie_name(conclusion)
    print("Rule:If a person recommends {0} they will also recommend {1}"
          .format(premise_names,conclusion_name))
    print("-Train Confidence:{}").format(rule_confidence[(premise,conclusion)])
    print("-Test Confidence:{}").format(test_confidence[(premise,conclusion)])
F:\Amy\anaconnda\python.exe F:/Amy/3_python文件/exe/2_movies_apriori.py
{frozenset([286]): 59.0, frozenset([7]): 67.0, frozenset([64]): 58.0, frozenset([79]): 58.0, frozenset([258]): 83.0, frozenset([50]): 100.0, frozenset([313]): 60.0, frozenset([174]): 74.0, frozenset([100]): 89.0, frozenset([181]): 79.0, frozenset([1]): 66.0, frozenset([127]): 70.0, frozenset([172]): 59.0, frozenset([98]): 70.0, frozenset([56]): 67.0, frozenset([9]): 53.0}
Found 93 frequent itemsets of length 2
Found 295 frequent itemsets of length 3
Found 593 frequent itemsets of length 4
Found 785 frequent itemsets of length 5
Found 677 frequent itemsets of length 6
Found 373 frequent itemsets of length 7
Found 126 frequent itemsets of length 8
Found 24 frequent itemsets of length 9
Found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11
[(frozenset([50]), 64), (frozenset([64]), 50), (frozenset([127]), 181), (frozenset([181]), 127), (frozenset([127]), 1)]
Rule #1
Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
-Confidence: 1.0


Rule #2
Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
-Confidence: 1.0


Rule #3
Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
-Confidence: 1.0


Rule #4
Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
-Confidence: 1.0


Rule #5
Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
-Confidence: 1.0


Rule 1
Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
-Train Confidence:1.0
-Test Confidence:0.965517241379
Rule 2
Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
-Train Confidence:1.0
-Test Confidence:0.853658536585
Rule 3
Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
-Train Confidence:1.0
-Test Confidence:0.869565217391
Rule 4
Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
-Train Confidence:1.0
-Test Confidence:0.755555555556
Rule 5
Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
-Train Confidence:1.0
-Test Confidence:0.975


Process finished with exit code 0









