1.SimpleTagBased:
1.下载数据
数据集是自己构造的示例数据,每行格式为:用户 歌曲 标签(以空白字符分隔)
A 一曲相思 流行
A 生僻字 流行
A 最后的莫西干人 纯音乐
A 倩女幽魂 经典
B 故乡的原风景 纯音乐
B 生僻字 流行
B 故乡的原风景 纯音乐
C 倩女幽魂 经典
C 海阔天空 经典
D 海阔天空 经典
A 突然好想你 寂寞
C 走西口 民歌
D 走西口 民歌
B 重头再来 励志
D 倩女幽魂 经典
C 重头再来 励志
D 最后的莫西干人 纯音乐
2.计算 $n_{u,b}$(用户 u 使用标签 b 的次数)和 $n_{b,i}$(物品 i 被打上标签 b 的次数),用户 u 对物品 i 的兴趣度为 $p(u,i)=\sum_b n_{u,b}\,n_{b,i}$
3.排序
def load_data(file_path):
    """Load whitespace-separated tagging records from a UTF-8 text file.

    Every non-blank line is split on runs of whitespace.  The statistics
    functions of this script unpack each record as (user, item, tag), so a
    well-formed line has exactly three fields.

    :param file_path: path of the text file to read
    :return: list with one list of string fields per non-blank line
    """
    records = []
    # The context manager closes the handle even if reading fails part-way.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            # A blank line splits to []; keeping it would break the
            # three-way unpacking done on every record later on.
            if fields:
                records.append(fields)
    return records
def InitStat(records):
    """Build the two count tables used by the SimpleTagBased recommender.

    :param records: iterable of (user, item, tag) triples
    :return: tuple (user_tags, tag_items); user_tags[user][tag] is how many
        times the user applied the tag, tag_items[tag][item] is how many
        times the item received the tag
    """
    user_tags = {}  # user -> {tag: times the user used the tag}
    tag_items = {}  # tag -> {item: times the item received the tag}
    for user, item, tag in records:
        per_user = user_tags.setdefault(user, {})
        per_user[tag] = per_user.get(tag, 0) + 1
        per_tag = tag_items.setdefault(tag, {})
        per_tag[item] = per_tag.get(item, 0) + 1
    print("用户打过标签的次数: ", user_tags)
    print("音乐打过标签的次数: ", tag_items)
    return user_tags, tag_items
def Recommend(user_tags, tag_items, user, K):
    """Recommend up to K items for ``user`` with the SimpleTagBased score.

    The interest of the user in an item is the sum, over every tag the user
    has used, of (times the user used the tag) * (times the item received
    the tag).

    :param user_tags: user_tags[user][tag] -> usage count
    :param tag_items: tag_items[tag][item] -> tagging count
    :param user: user to recommend for; an unknown user gets no items
    :param K: maximum number of items to return
    :return: the recommended item names joined with "****" (an empty string
        when nothing can be recommended)
    """
    recommend_items = {}
    # An unknown user is treated as a user without tags instead of raising.
    for tag, wut in user_tags.get(user, {}).items():
        for item, wti in tag_items[tag].items():
            # Accumulate the user's interest in the item.
            recommend_items[item] = recommend_items.get(item, 0) + wut * wti
    # Rank the items by interest, highest first.
    rec = sorted(recommend_items.items(), key=lambda x: x[1], reverse=True)
    print("用户对歌曲兴趣度: ", rec)
    print('--------------------------------------')
    # Slicing never runs past the end of the ranking, so fewer than K
    # candidates (or none at all) no longer raises IndexError.
    music = "****".join(item for item, _ in rec[:max(K, 0)])
    print("为用户推荐歌曲: ", music)
    return music
if __name__ == "__main__":
    # Demo run: build the statistics from the sample file, then print the
    # three best songs for user 'A'.
    file_path = "../data/123.txt"
    records = load_data(file_path)
    users_tags, tags_items = InitStat(records)
    Recommend(users_tags, tags_items, user="A", K=3)
2. TagBasedTFIDF
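操作流程与 SimpleTagBased 相同,只是兴趣度公式加入了对热门标签的惩罚:$p(u,i)=\sum_b \frac{n_{u,b}}{\log\left(1+n_b^{(u)}\right)}\,n_{b,i}$,其中 $n_b^{(u)}$ 是使用过标签 b 的不同用户数。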
from math import *
def load_data(file_path):
    """Load whitespace-separated tagging records from a UTF-8 text file.

    Every non-blank line is split on runs of whitespace.  The statistics
    function of this script unpacks each record as (user, item, tag), so a
    well-formed line has exactly three fields.

    :param file_path: path of the text file to read
    :return: list with one list of string fields per non-blank line
    """
    records = []
    # The context manager closes the handle even if reading fails part-way.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            # A blank line splits to []; keeping it would break the
            # three-way unpacking done on every record later on.
            if fields:
                records.append(fields)
    return records
def InitStat_update(records):
    """Build the three count tables used by the TF-IDF style recommender.

    :param records: iterable of (user, item, tag) triples
    :return: tuple (user_tags, tag_items, tag_user); user_tags[user][tag] is
        how often the user applied the tag, tag_items[tag][item] how often
        the item received the tag, and tag_user[tag][user] how often the tag
        was used by that user
    """
    user_tags = {}  # user -> {tag: count}
    tag_items = {}  # tag -> {item: count}
    tag_user = {}   # tag -> {user: count}
    for user, item, tag in records:
        by_user = user_tags.setdefault(user, {})
        by_user[tag] = by_user.get(tag, 0) + 1
        by_tag = tag_items.setdefault(tag, {})
        by_tag[item] = by_tag.get(item, 0) + 1
        tag_users = tag_user.setdefault(tag, {})
        tag_users[user] = tag_users.get(user, 0) + 1
    print("用户打过标签的次数: ", user_tags)
    print("音乐打过标签的次数: ", tag_items)
    print("标签被用户使用次数: ", tag_user)
    return user_tags, tag_items, tag_user
def Recommend_update(user_tags, tag_items, tag_user, user, K):
    """Recommend up to K items for ``user`` with a popular-tag penalty.

    Same scoring as the simple tag-based method, except that the
    contribution of every tag is divided by log(1 + number of distinct
    users that used the tag).

    :param user_tags: user_tags[user][tag] -> usage count
    :param tag_items: tag_items[tag][item] -> tagging count
    :param tag_user: tag_user[tag][user] -> usage count
    :param user: user to recommend for; an unknown user gets no items
    :param K: maximum number of items to return
    :return: the recommended item names joined with "/" (an empty string
        when nothing can be recommended)
    """
    recommend_items = {}
    # An unknown user is treated as a user without tags instead of raising.
    for tag, wut in user_tags.get(user, {}).items():
        # The penalty depends on the tag only, so compute it once per tag.
        penalty = log(1 + len(tag_user[tag]))
        for item, wti in tag_items[tag].items():
            recommend_items[item] = recommend_items.get(item, 0) + wut * wti / penalty
    # Rank the items by interest, highest first.
    rec = sorted(recommend_items.items(), key=lambda x: x[1], reverse=True)
    print("用户对歌曲兴趣度", rec)
    # Slicing never runs past the end of the ranking, so fewer than K
    # candidates no longer raises IndexError.
    music = "/".join(item for item, _ in rec[:max(K, 0)])
    print("为用户推荐歌曲: ", music)
    return music
if __name__ == "__main__":
    # Demo run: build the statistics from the sample file, then print the
    # three best songs for user 'A' using the penalised score.
    file_path = "../data/123.txt"
    records = load_data(file_path)
    user_tags, tag_items, tag_user = InitStat_update(records)
    Recommend_update(user_tags, tag_items, tag_user, user="A", K=3)
3. TagBasedTFIDF++
用 delicious.dat 数据文件做实验(下面的代码读取的路径是 ../data/delicious.txt)
import random
# Helper for accumulating counts in a two-level dict.
def addValueToMat(theMat, key, value, incr):
    """Add ``incr`` to ``theMat[key][value]``, creating missing levels.

    :param theMat: dict of dicts that is updated in place
    :param key: first-level key
    :param value: second-level key
    :param incr: amount stored on first sight, added afterwards
    """
    row = theMat.setdefault(key, dict())
    if value in row:
        row[value] += incr  # already counted: accumulate
    else:
        row[value] = incr   # first occurrence: start at incr
# Module-level count tables, filled through addValueToMat().
user_tags = {}        # user -> {tag: count}
tag_items = {}        # tag -> {item: count}
user_items = {}       # user -> {item: count}
user_items_test = {}  # user -> {item: count} for the held-out test data
# Initialisation: read the data set and fill the train/test count tables.
def InitStat(data_path='../data/delicious.txt', test_ratio=0.1):
    """Read tab-separated tagging records and split them into train and test.

    The first three columns of every line are taken as user, item and tag.
    Each record is assigned at random: when ``random.random()`` exceeds
    ``test_ratio`` it updates the training tables ``user_tags``,
    ``tag_items`` and ``user_items``; otherwise it is recorded in
    ``user_items_test``.  All tables are module-level dicts updated in place.

    Every line is parsed before the random draw and the loop always advances
    to the next line, so a held-out record stores its own user/item exactly
    once.

    :param data_path: path of the ISO-8859-1 encoded data file
    :param test_ratio: draw threshold at or below which a record is held out
        (0.1 keeps roughly 90% of the records for training)
    """
    # The context manager closes the file even if a line fails to parse.
    with open(data_path, 'r', encoding='ISO-8859-1') as data_file:
        for line in data_file:
            # Drop the line ending first so that a tag in the last column
            # does not carry a trailing newline.
            terms = line.rstrip('\r\n').split("\t")
            if len(terms) < 3:
                continue  # blank or malformed line: nothing to count
            user, item, tag = terms[:3]
            if random.random() > test_ratio:
                addValueToMat(user_tags, user, tag, 1)
                addValueToMat(tag_items, tag, item, 1)
                addValueToMat(user_items, user, item, 1)
            else:
                addValueToMat(user_items_test, user, item, 1)
# Recommendation: score not-yet-tagged items through the user's tags.
def Recommend(usr):
    """Rank the items ``usr`` has not tagged yet by tag-based interest.

    Reads the module-level tables ``user_items``, ``user_tags`` and
    ``tag_items``.  The score of an item is the sum over the user's tags of
    (times the user used the tag) * (times the item received the tag).

    :param usr: user id; must be a key of ``user_items`` and ``user_tags``
    :return: list of (item, score) pairs sorted by score, highest first
    """
    already_tagged = user_items[usr]  # items the user has tagged already
    scores = dict()
    for tag_name, user_weight in user_tags[usr].items():
        for candidate, item_weight in tag_items[tag_name].items():
            if candidate in already_tagged:
                continue  # never recommend something the user already has
            if candidate in scores:
                scores[candidate] += user_weight * item_weight
            else:
                scores[candidate] = user_weight * item_weight
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked
# Fill the module-level count tables, then rank items for user "48411".
InitStat()
recommend_list = Recommend("48411")
# Debug aid (disabled): print recommend_list
for recommend in recommend_list[:10]:  # the ten entries with the highest interest score
    print(recommend)
得到最后结果
参考:https://blog.csdn.net/keyue123/article/details/86644878
集成所有方法:
另外新增一种方法 ExpandTagBased:对标签数不足 M 个的用户,利用标签之间的相似度为其扩展标签,使标签总数达到 M 个,再用第一种方法(SimpleTagBased)进行计算。
import random
import math
import time
from tqdm import tqdm
# Decorator that reports how long the wrapped callable takes to run.
def timmer(func):
    """Wrap ``func`` so that every call prints its run time.

    The wrapper forwards all arguments, returns the result of ``func``
    unchanged and keeps its metadata (``__name__``, ``__doc__``, ...).
    Nothing is printed when ``func`` raises.

    :param func: callable to time
    :return: the timing wrapper around ``func``
    """
    # Local import so that this block does not depend on the file header.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot
        # distort the measured duration.
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        stop_time = time.perf_counter()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper
class Dataset():
    """Tagging data set: loads (user, item, tags) entries and splits them."""

    def __init__(self, fp):
        # fp: data file path
        self.data = self.loadData(fp)

    @timmer
    def loadData(self, fp):
        """Read the tab-separated data file and group the tags per (user, item).

        The first line is skipped as a header; the first three columns of
        every other line are taken as user, item and tag.  Lines with fewer
        than three columns are ignored.

        :params: fp, data file path
        :return: list of (user, item, tags) tuples, tags being a sorted list
        """
        new_data = {}
        # The context manager closes the file even if parsing fails.
        with open(fp) as fh:
            next(fh, None)  # skip the header line
            for line in fh:
                fields = line.strip().split('\t')[:3]
                if len(fields) < 3:
                    continue  # blank or malformed line
                user, item, tag = fields
                new_data.setdefault(user, {}).setdefault(item, set()).add(tag)
        # Sorting the tags makes the result independent of set iteration
        # order, which varies between interpreter runs for strings.
        return [(user, item, sorted(tags))
                for user, items in new_data.items()
                for item, tags in items.items()]

    @timmer
    def splitData(self, M, k, seed=1):
        '''
        :params: M, number of folds; results are averaged over the M folds
        :params: k, index of the current fold, k in [0, M)
        :params: seed, seed for random; must be the same for every k
        :return: train, test (each a dict user -> {item: tags})
        '''
        # Split with (user, item) as the unit.
        train, test = [], []
        random.seed(seed)
        for user, item, tags in self.data:
            # Differs from the book: M - 1 is used as the upper bound
            # because randint includes both endpoints.
            if random.randint(0, M - 1) == k:
                test.append((user, item, tags))
            else:
                train.append((user, item, tags))

        # Convert the entry list to a nested dict, user -> {item: tags}.
        def convert_dict(data):
            data_dict = {}
            for user, item, tags in data:
                data_dict.setdefault(user, {})[item] = tags
            return data_dict

        return convert_dict(train), convert_dict(test)
class Metric():
    """Offline evaluation of a top-N recommender on a train/test split.

    ``train`` and ``test`` map user -> {item: tags}; ``GetRecommendation``
    maps a user to a ranked list of (item, score) pairs.  Recommended items
    are expected to occur in ``train``, because diversity and popularity
    look them up there.  Every metric returns 0.0 instead of raising
    ZeroDivisionError when there is nothing to evaluate.
    """

    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, training data
        :params: test, test data
        :params: GetRecommendation, callable returning the recommendations of a user
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()

    # Compute the recommendations of every test user once.
    def getRec(self):
        return {user: self.GetRecommendation(user) for user in self.test}

    # Number of recommended items that occur in the user's test items.
    def _hits(self):
        hit = 0
        for user in self.test:
            test_items = set(self.test[user])
            hit += sum(1 for item, _ in self.recs[user] if item in test_items)
        return hit

    # Precision: hits / number of recommended items, as a percentage.
    def precision(self):
        total = sum(len(self.recs[user]) for user in self.test)
        if not total:
            return 0.0
        return round(self._hits() / total * 100, 2)

    # Recall: hits / number of test items, as a percentage.
    def recall(self):
        total = sum(len(set(self.test[user])) for user in self.test)
        if not total:
            return 0.0
        return round(self._hits() / total * 100, 2)

    # Coverage: distinct recommended items / distinct training items, in %.
    def coverage(self):
        all_item = {item for user in self.train for item in self.train[user]}
        recom_item = {item for user in self.test for item, _ in self.recs[user]}
        if not all_item:
            return 0.0
        return round(len(recom_item) / len(all_item) * 100, 2)

    # Diversity: 1 - mean pairwise tag-cosine similarity of each user's
    # recommendation list, averaged over the test users.
    def diversity(self):
        # Tag count vector of every training item.
        item_tags = {}
        for user in self.train:
            for item in self.train[user]:
                counts = item_tags.setdefault(item, {})
                for tag in self.train[user][item]:
                    counts[tag] = counts.get(tag, 0) + 1

        # Cosine similarity of the tag vectors of two items.
        def CosineSim(u, v):
            dot = sum(cnt * item_tags[v][tag]
                      for tag, cnt in item_tags[u].items() if tag in item_tags[v])
            nu = sum(cnt ** 2 for cnt in item_tags[u].values())
            nv = sum(cnt ** 2 for cnt in item_tags[v].values())
            denom = math.sqrt(nu * nv)
            # An item without tags has a zero vector; treat it as dissimilar.
            return dot / denom if denom else 0.0

        div = []
        for user in self.test:
            rank = self.recs[user]
            sim, cnt = 0, 0
            for u, _ in rank:
                for v, _ in rank:
                    if u == v:
                        continue
                    sim += CosineSim(u, v)
                    cnt += 1
            # Fewer than two distinct items: no pair, similarity counts as 0.
            div.append(1 - (sim / cnt if cnt else 0))
        return sum(div) / len(div) if div else 0.0

    # Popularity (used as the novelty indicator): mean log-popularity of the
    # recommended items.
    def popularity(self):
        # Popularity of an item = number of training users that tagged it.
        item_pop = {}
        for user in self.train:
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1

        num, pop = 0, 0
        for user in self.test:
            for item, _ in self.recs[user]:
                # The logarithm keeps very popular (long-tail head) items
                # from dominating the average.
                pop += math.log(1 + item_pop[item])
                num += 1
        if not num:
            return 0.0
        return round(pop / num, 6)

    def eval(self):
        """Compute all metrics, print them and return them as a dict."""
        metric = {'Precision': self.precision(),
                  'Recall': self.recall(),
                  'Coverage': self.coverage(),
                  'Diversity': self.diversity(),
                  'Popularity': self.popularity()}
        print('Metric:', metric)
        return metric
# 1. Recommendation based on popular tags
def SimpleTagBased(train, N):
    '''
    :params: train, training data set (user -> {item: tags})
    :params: N, hyper-parameter, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation function
    '''
    # Count how often each user used each tag and each tag hit each item.
    user_tags = {}
    tag_items = {}
    for user, tagged in train.items():
        own_counts = user_tags[user] = {}
        for item, tags in tagged.items():
            for tag in tags:
                own_counts[tag] = own_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1

    def GetRecommendation(user):
        # Recommend the N best-scoring items the user has not seen yet.
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag, weight in user_tags[user].items():
            for item, count in tag_items[tag].items():
                if item not in seen_items:
                    item_score[item] = item_score.get(item, 0) + weight * count
        ranked = sorted(item_score.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation
# 2. Improvement 1: penalise popular tags
def TagBasedTFIDF(train, N):
    '''
    :params: train, training data set (user -> {item: tags})
    :params: N, hyper-parameter, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation function
    '''
    user_tags = {}  # user -> {tag: times used}
    tag_items = {}  # tag -> {item: times applied}
    tag_users = {}  # tag -> set of users that used it
    for user, tagged in train.items():
        own_counts = user_tags[user] = {}
        for item, tags in tagged.items():
            for tag in tags:
                own_counts[tag] = own_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1
                tag_users.setdefault(tag, set()).add(user)
    # Popularity of a tag = number of distinct users that used it.
    tag_pop = {tag: len(users) for tag, users in tag_users.items()}

    def GetRecommendation(user):
        # Recommend the N best-scoring items the user has not seen yet.
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag, weight in user_tags[user].items():
            for item, count in tag_items[tag].items():
                if item not in seen_items:
                    item_score[item] = item_score.get(item, 0) + weight * count / tag_pop[tag]
        ranked = sorted(item_score.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation
# 3. Improvement 2: additionally penalise popular items
def TagBasedTFIDF_Improved(train, N):
    '''
    :params: train, training data set (user -> {item: tags})
    :params: N, hyper-parameter, number of items in the top-N recommendation
    :return: GetRecommendation, the recommendation function
    '''
    user_tags = {}  # user -> {tag: times used}
    tag_items = {}  # tag -> {item: times applied}
    tag_users = {}  # tag -> set of users that used it
    item_pop = {}   # item -> number of users having the item
    for user, tagged in train.items():
        own_counts = user_tags[user] = {}
        for item, tags in tagged.items():
            item_pop[item] = item_pop.get(item, 0) + 1
            for tag in tags:
                own_counts[tag] = own_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1
                tag_users.setdefault(tag, set()).add(user)
    # Popularity of a tag = number of distinct users that used it.
    tag_pop = {tag: len(users) for tag, users in tag_users.items()}

    def GetRecommendation(user):
        # Recommend the N best-scoring items the user has not seen yet.
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag, weight in user_tags[user].items():
            for item, count in tag_items[tag].items():
                if item not in seen_items:
                    item_score[item] = (item_score.get(item, 0)
                                        + weight * count / tag_pop[tag] / item_pop[item])
        ranked = sorted(item_score.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation
# 4. Tag expansion: users with fewer than M tags get similar tags added
#    until they have M, then the SimpleTagBased scoring is applied.
def ExpandTagBased(train, N, M=20):
    '''
    :params: train, training data set (user -> {item: tags})
    :params: N, hyper-parameter, number of items in the top-N recommendation
    :params: M, hyper-parameter, number of tags kept for users with fewer than M tags
    :return: GetRecommendation, the recommendation function
    '''
    # One pass over the training data collects every raw statistic.
    item_tag = {}   # item -> set of tags applied to it
    user_tags = {}  # user -> {tag: times used}
    tag_items = {}  # tag -> {item: times applied}
    for user, tagged in train.items():
        own_counts = user_tags.setdefault(user, {})
        for item, tags in tagged.items():
            tags_of_item = item_tag.setdefault(item, set())
            for tag in tags:
                tags_of_item.add(tag)
                own_counts[tag] = own_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1

    # 1. Tag similarity: co-occurrence on items, normalised by the geometric
    #    mean of the number of items each tag occurs on.
    tag_sim = {}
    tag_cnt = {}
    for tags_of_item in item_tag.values():
        for u in tags_of_item:
            tag_cnt[u] = tag_cnt.get(u, 0) + 1
            neighbours = tag_sim.setdefault(u, {})
            for v in tags_of_item:
                if u != v:
                    neighbours[v] = neighbours.get(v, 0) + 1
    for u, neighbours in tag_sim.items():
        for v in neighbours:
            neighbours[v] /= math.sqrt(tag_cnt[u] * tag_cnt[v])

    # 2. Expand the tag profile of every user that has fewer than M tags.
    expand_tags = {}
    for user, own_counts in user_tags.items():
        if len(own_counts) >= M:
            expand_tags[user] = own_counts
            continue
        extra = {}
        seen_tags = set(own_counts)
        for tag, weight in own_counts.items():
            for t, similarity in tag_sim[tag].items():
                if t not in seen_tags:
                    extra[t] = extra.get(t, 0) + weight * similarity
        extra.update(own_counts)
        # Keep the M highest-weighted tags.
        strongest = sorted(extra.items(), key=lambda pair: pair[1], reverse=True)
        expand_tags[user] = dict(strongest[:M])

    # 3. SimpleTagBased scoring on the expanded tag profiles.
    def GetRecommendation(user):
        # Recommend the N best-scoring items the user has not seen yet.
        if user not in user_tags:
            return []
        seen_items = set(train[user])
        item_score = {}
        for tag, weight in expand_tags[user].items():
            for item, count in tag_items[tag].items():
                if item not in seen_items:
                    item_score[item] = item_score.get(item, 0) + weight * count
        ranked = sorted(item_score.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation
class Experiment():
    """Run one recommendation algorithm over M train/test splits and average the metrics."""

    def __init__(self, M, N, fp='../data/delicious-2k/user_taggedbookmarks.dat', rt='SimpleTagBased'):
        '''
        :params: M, number of experiments (folds)
        :params: N, number of items in the top-N recommendation
        :params: fp, data file path
        :params: rt, name of the recommendation algorithm (a key of self.alg)
        '''
        self.M = M
        self.N = N
        self.fp = fp
        self.rt = rt
        self.alg = {
            'SimpleTagBased': SimpleTagBased,
            'TagBasedTFIDF': TagBasedTFIDF,
            'TagBasedTFIDF_Improved': TagBasedTFIDF_Improved,
            # Original key, kept so that existing callers keep working.
            'ExtendTagBased': ExpandTagBased,
            # Alias that matches the name of the function.
            'ExpandTagBased': ExpandTagBased,
        }

    # A single experiment
    @timmer
    def worker(self, train, test):
        '''
        :params: train, training data set
        :params: test, test data set
        :return: the value of every metric
        '''
        getRecommendation = self.alg[self.rt](train, self.N)
        metric = Metric(train, test, getRecommendation)
        return metric.eval()

    # Average over several experiments
    @timmer
    def run(self):
        '''
        Evaluate the algorithm on each of the M splits and print the averages.

        :return: dict metric name -> value averaged over the M experiments
        '''
        metrics = {'Precision': 0, 'Recall': 0,
                   'Coverage': 0, 'Diversity': 0,
                   'Popularity': 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            train, test = dataset.splitData(self.M, ii)
            print('Experiment {}:'.format(ii))
            metric = self.worker(train, test)
            metrics = {k: metrics[k] + metric[k] for k in metrics}
        metrics = {k: metrics[k] / self.M for k in metrics}
        print('Average Result (M={}, N={}): {}'.format(
            self.M, self.N, metrics))
        # Hand the averages back so callers can use them, not only read
        # them from the console.
        return metrics
# # 1. SimpleTagBased experiment (disabled)
# M, N = 10, 10
# exp = Experiment(M, N, rt='SimpleTagBased')
# exp.run()
#
# # 2. TagBasedTFIDF experiment (disabled)
# M, N = 10, 10
# exp = Experiment(M, N, rt='TagBasedTFIDF')
# exp.run()
#
# # 3. TagBasedTFIDF++ experiment (disabled)
# M, N = 10, 10
# exp = Experiment(M, N, rt='TagBasedTFIDF_Improved')
# exp.run()
# 4. TagExtend experiment: 10 splits, top-10 recommendations
M, N = 10, 10
exp = Experiment(M, N, rt='ExtendTagBased')
exp.run()