基于物品的协同过滤算法
基于物品的协同过滤算法是目前业界应用最多的算法。算法主要分为两步
- 计算物品之间的相似度
- 根据物品的相似度和用户的历史行为给用户生成推荐列表
物品相似度最直接的定义为 $w_{ij}=\dfrac{|N(i)\cap N(j)|}{|N(i)|}$。其中,分母 $|N(i)|$ 是喜欢物品 i 的用户数,而分子 $|N(i)\cap N(j)|$ 是同时喜欢物品 i 和物品 j 的用户数,因此上述公式可以理解为:喜欢物品 i 的用户中,有多少比例的用户也喜欢物品 j。但上述公式存在一个问题:如果物品 j 很热门,很多人喜欢,那么得到的 W 值就会很大,接近 1。因此,该公式会使任何物品都和热门物品有很大的相似度,这对于致力于挖掘长尾信息的推荐系统来说显然不是一个好的特性。为了避免这种情况,对上述公式做了改进:$w_{ij}=\dfrac{|N(i)\cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}$
这个公式惩罚了物品j的权重,因此减轻了热门物品会和很多物品相似的可能性。
基于物品相似度的协同过滤算法的具体实现可参考以下代码:
# coding=utf-8 import math import pandas as pd import tqdm from collections import defaultdict from sklearn.model_selection import train_test_split class KNN: def __init__(self, train=None, test=None): self.trainfile = train self.testfile = test self.readData() def readData(self, train=None, test=None): self.train_df = train or self.trainfile self.test_df = test or self.testfile self.traindata = {} self.testdata = {} box_id = list(self.train_df['box_id']) film_id = list(self.train_df['lable']) source = list(self.train_df['shichang']) box_id2 = list(self.test_df['box_id']) film_id2 = list(self.test_df['lable']) source2 = list(self.test_df['shichang']) for i in range(len(box_id)): userid, itemid, record = box_id[i], film_id[i], source[i] self.traindata.setdefault(userid, {}) self.traindata[userid][itemid] = record # print self.traindata for i in range(len(box_id2)): userid, itemid, record = box_id2[i], film_id2[i], source2[i] self.testdata.setdefault(userid, {}) self.testdata[userid][itemid] = record # print self.testdata def ItemSim(self, train=None): train = train or self.traindata ItemSimcount = dict() Item_count = dict() for _, items in train.items(): for itemidi in items.keys(): Item_count.setdefault(itemidi, 0) Item_count[itemidi] += 1 for itemidj in items.keys(): if itemidi == itemidj: continue ItemSimcount.setdefault(itemidi, {}) ItemSimcount[itemidi].setdefault(itemidj, 0) ItemSimcount[itemidi][itemidj] += 1 # self.ItemSimlist = dict() self.ItemSimlist = defaultdict(defaultdict) for itemidi, related_item in ItemSimcount.items(): self.ItemSimlist.setdefault(itemidi, {}) for itemidj, wij in related_item.items(): self.ItemSimlist[itemidi].setdefault(itemidj, 0) self.ItemSimlist[itemidi][itemidj] = wij / math.sqrt(Item_count[itemidi] * Item_count[itemidj] * 1.0) def recommend(self, user, train=None, k=5, nitem=10): train = train or self.traindata recommendlist = dict() User_Itemlist = train.get(user, {}) for i, ri in User_Itemlist.items(): for j, wij in 
sorted(self.ItemSimlist[i].items(), key=lambda x: x[1], reverse=True)[0:k]: if j in User_Itemlist: continue recommendlist.setdefault(j, 0) recommendlist[j] += float(ri) * wij return dict(sorted(recommendlist.items(), key=lambda x: x[1], reverse=True)[0:nitem]) def recallAndPrecision(self, train=None, test=None, k=5, nitem=10): train = train or self.traindata test = test or self.testdata hit = 0 recall = 0 precision = 0 for user in train.keys(): tu = test.get(user, {}) rank = self.recommend(user, train=train, k=k, nitem=nitem) for item, _ in rank.items(): if item in tu: hit += 1 recall += len(tu) precision += nitem return (hit / (recall * 1.0), hit / (precision * 1.0)) def coverage(self, train=None, test=None, k=5, nitem=10): train = train or self.traindata test = test or self.testdata recommend_items = set() all_items = set() for user in train.keys(): for item in train[user].keys(): all_items.add(item) rank = self.recommend(user, train, k=k, nitem=nitem) for item, _ in rank.items(): recommend_items.add(item) return len(recommend_items) / (len(all_items) * 1.0) def popularity(self, train=None, test=None, k=5, nitem=10): train = train or self.traindata test = test or self.testdata item_popularity = dict() for user, items in train.items(): for item in items.keys(): item_popularity.setdefault(item, 0) item_popularity[item] += 1 ret = 0 n = 0 for user in train.keys(): rank = self.recommend(user, train, k=k, nitem=nitem) for item, _ in rank.items(): if item in item_popularity: ret += math.log(1 + item_popularity[item]) n += 1 return ret / (n * 1.0) def testKNNCF(): path = 'C:...' train_filmname = '...' test_filmname = '...' 
train_data_df = pd.read_csv(path+train_filmname,sep=',') test_data_df = pd.read_csv(path+test_filmname,sep=',') Train_data = train_data_df Test_data = test_data_df cf = KNN(Train_data, Test_data) cf.ItemSim() print("%3s%20s%20s%20s%20s%20s" % ('K', "precision", 'recall', 'coverage', 'popularity', 'F')) # print("%3s%20s%20s%20s" % ('K', "precision", 'recall', 'F')) for k in [5, 10, 20, 40, 80, 160, 200, 240, 300]: recall, precision = cf.recallAndPrecision(k=k) F = 2*precision*recall/(precision+recall) coverage = cf.coverage(k=k) popularity = cf.popularity(k=k) # print("%3d%19.3f%%%19.3f%%%20.3f" % (k, precision * 100, recall * 100, F)) print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f%%%20.3f" % (k, precision * 100, recall * 100, coverage * 100, popularity*100, F)) if __name__ == "__main__": testKNNCF()
推荐系统----第三章(基于物品的协同过滤算法)
最新推荐文章于 2024-08-12 14:30:51 发布