数据为 MovieLens(ml-25m)数据集,这里仅用其中的 ratings 数据
1、数据按年份切分
import re
from datetime import datetime
import time
import pandas as pd
def data_read(ratings_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings.csv"):
    """Read the ratings CSV into a list of ``[userId, movieId, rating, timestamp]`` rows.

    Args:
        ratings_path: Location of the ratings CSV. Defaults to the path that
            used to be hard-coded, so ``data_read()`` keeps working unchanged.

    Returns:
        A list of ``[int, int, float, int]`` lists, one per data line, in file order.

    Raises:
        ValueError / IndexError: if a non-blank line cannot be parsed as four
            comma-separated fields of the expected types.
    """
    t0 = time.perf_counter()
    ratings = []
    # The file is large, so it is consumed line by line instead of in one read.
    with open(ratings_path) as f:
        f.readline()  # skip the header line
        for line in f:
            # A blank line used to end the read silently; skip it instead.
            if not line.strip():
                continue
            row = line.split(",")
            ratings.append([int(row[0]), int(row[1]), float(row[2]), int(row[3])])
    print("data_read:", time.perf_counter() - t0)
    return ratings
def times_generate():
    """Build the year labels 1995..2019 and the timestamp at which each year starts.

    Returns:
        ``(times, times_second)``: ``times`` is the list of year strings and
        ``times_second`` the matching list of integer seconds produced by
        ``time.mktime`` (which interprets the parsed date as local time).
    """
    started = time.perf_counter()
    year_labels = [str(year) for year in range(1995, 2020)]
    year_starts = []
    for label in year_labels:
        first_day = datetime.strptime(label, "%Y")
        # Whole seconds. A millisecond variant would be:
        # int(time.mktime(first_day.timetuple()) * 1000.0 + first_day.microsecond / 1000.0)
        year_starts.append(int(time.mktime(first_day.timetuple())))
    print("times_generate:", time.perf_counter() - started)
    return year_labels, year_starts
def ratings_year_generate(ratings, times_second):
    """Split rating rows into buckets delimited by the boundaries in ``times_second``.

    Bucket ``i`` holds the rows whose timestamp (``row[3]``) lies in
    ``[times_second[i], times_second[i + 1])``; the final bucket holds every row
    at or after the last boundary. Rows earlier than the first boundary are not
    placed in any bucket.

    Returns:
        A list with ``len(times_second)`` buckets, each a list of rows.
    """
    started = time.perf_counter()
    buckets = []
    for lower, upper in zip(times_second, times_second[1:]):
        buckets.append([row for row in ratings if lower <= row[3] < upper])
    final_boundary = times_second[-1]
    buckets.append([row for row in ratings if row[3] >= final_boundary])
    print("ratings_year_generate:", time.perf_counter() - started)
    return buckets
# Original author's note: "cannot store it all" — presumably the per-sheet row
# limit of the xlsx format is exceeded by a year of ratings; the CSV writer is
# the variant actually used by the script.
def data_output_xlsx(path, res, res_name):
    """Write each bucket of rating rows to ``<path>/<name>.xlsx``.

    Args:
        path: Existing output directory.
        res: List of buckets; each bucket is a list of
            ``[userId, movieId, rating, timestamp]`` rows.
        res_name: File/sheet name for each bucket, index-aligned with ``res``.
    """
    import os

    t0 = time.perf_counter()
    columns = ["userId", "movieId", "rating", "timestamp"]
    for i, rows in enumerate(res):
        name = res_name[i]
        # Passing columns= up front also works for an empty bucket, where
        # assigning df.columns afterwards raised a length mismatch.
        df = pd.DataFrame(rows, columns=columns)
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # is not available in current pandas releases.
        with pd.ExcelWriter(os.path.join(path, name + ".xlsx")) as writer:
            df.to_excel(writer, sheet_name=name)
    print("data_output:", time.perf_counter() - t0)
def data_output_csv(path, res, res_name):
    """Write each bucket of rating rows to ``<path>/<name>.csv`` with a header line.

    Args:
        path: Existing output directory.
        res: List of buckets; each bucket is a list of
            ``[userId, movieId, rating, timestamp]`` rows.
        res_name: File name (without extension) for each bucket, index-aligned
            with ``res``.
    """
    import os

    t0 = time.perf_counter()
    header = ",".join(["userId", "movieId", "rating", "timestamp"]) + "\n"
    for i, rows in enumerate(res):
        # os.path.join keeps the path portable; `with` closes the file even if
        # a write fails part-way through.
        with open(os.path.join(path, res_name[i] + ".csv"), "w") as f:
            f.write(header)
            f.writelines(",".join(str(s) for s in row) + "\n" for row in rows)
    print("data_output:", time.perf_counter() - t0)
if __name__ == '__main__':
    # Pipeline: load all ratings, compute the year boundaries, bucket the
    # ratings per year and write one CSV per bucket.
    ratings = data_read()
    times, times_second = times_generate()
    ratings_year = ratings_year_generate(ratings, times_second)
    data_output_csv(
        "D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year",
        ratings_year,
        times,
    )
2、数据录入
import time
import re
class Edge:
    """A single rating: which user scored which item, and the score given."""

    def __init__(self, user_id, item_id, score):
        self.user_id, self.item_id, self.score = user_id, item_id, score

    def __repr__(self):
        # Rendered as "[user_id, item_id, score]".
        fields = (self.user_id, self.item_id, self.score)
        return "[" + ", ".join(str(field) for field in fields) + "]"
def _read_edges(path):
    """Parse one ratings CSV (header line, then userId,movieId,rating,...) into Edge objects."""
    edges = []
    # The files can be large, so they are consumed line by line.
    with open(path) as f:
        f.readline()  # skip the header line
        for line in f:
            # A blank line used to end the read silently; skip it instead.
            if not line.strip():
                continue
            row = line.split(",")
            edges.append(Edge(int(row[0]), int(row[1]), float(row[2])))
    return edges


def Edge_read(ratings_train_path, ratings_test_path):
    """Load the training and test rating files as lists of ``Edge``.

    Args:
        ratings_train_path: CSV with the training ratings.
        ratings_test_path: CSV with the test ratings.

    Returns:
        ``(ratings_train, ratings_test)``, each a list of ``Edge`` in file order.
    """
    t0 = time.perf_counter()
    ratings_train = _read_edges(ratings_train_path)
    ratings_test = _read_edges(ratings_test_path)
    print("time_data_read:", time.perf_counter() - t0)
    return ratings_train, ratings_test
if __name__ == '__main__':
    # Smoke test: load the 1996 ratings as training data and 1997 as test data,
    # then show the size and the first ten edges of each.
    ratings_train, ratings_test = Edge_read(
        ratings_train_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv",
        ratings_test_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv",
    )

    print("len(ratings_train):", len(ratings_train))
    print("ratings_train:", ratings_train[:10])

    print("len(ratings_test):", len(ratings_test))
    print("ratings_test:", ratings_test[:10])
3、贪心、随机、UserCF、ItemCF
from ml_25m.Data_read import *
import random as rd
import math
class parameter_items_popular:
    """Model holder for the popularity baseline: one item ranking for every user."""

    def __init__(self):
        # Item ids, best first.
        self.L = []

    def __repr__(self):
        return "%s" % (self.L,)

    def TopK(self, p, K):
        """Return the first ``K`` items of the ranking; the user ``p`` is ignored."""
        cutoff = min(len(self.L), K)
        return self.L[:cutoff]
def train_items_popular(ratings_train, score_is_avg=False):
    """Rank items by their total (or average) score over the training ratings.

    Args:
        ratings_train: Iterable of edges exposing ``item_id`` and ``score``.
        score_is_avg: When true, rank by mean score per item instead of the sum.

    Returns:
        A ``parameter_items_popular`` whose ``L`` lists item ids, best first.
        Items with equal scores keep their first-appearance order.
    """
    started = time.perf_counter()
    totals = {}
    counts = {}
    for edge in ratings_train:
        item = edge.item_id
        totals[item] = totals.get(item, 0) + edge.score
        counts[item] = counts.get(item, 0) + 1
    if score_is_avg:
        totals = {item: total / counts[item] for item, total in totals.items()}
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    model = parameter_items_popular()
    model.L = [item for item, _ in ranked]
    print("\ntime_train_items_popular:", time.perf_counter() - started)
    return model
class parameter_random:
    """Model holder for the random baseline: recommends items drawn at random."""

    def __init__(self):
        # Pool of candidate item ids.
        self.L = []

    def __repr__(self):
        return "%s" % (self.L,)

    def TopK(self, p, K):
        """Return up to ``K`` distinct items sampled from the pool; ``p`` is ignored."""
        sample_size = min(len(self.L), K)
        return rd.sample(self.L, sample_size)
def train_random(ratings_train):
    """Collect the distinct training items, in shuffled order, for the random baseline.

    Args:
        ratings_train: Iterable of edges exposing ``item_id``.

    Returns:
        A ``parameter_random`` whose ``L`` is the shuffled list of distinct item ids.
    """
    t0 = time.perf_counter()
    items = list({edge.item_id for edge in ratings_train})
    # random.shuffle works in place and returns None, so its result must not be
    # assigned to P.L (the previous code left P.L as None).
    rd.shuffle(items)
    P = parameter_random()
    P.L = items
    print("\ntime_train_random:", time.perf_counter() - t0)
    return P
class parameter_UserCF:
    """Model holder for user-based CF: per-user rankings plus a global fallback."""

    def __init__(self):
        # Fallback ranking used for users without a personal list.
        self.L = []
        # user id -> personal ranking of item ids, best first.
        self.D = {}

    def __repr__(self):
        return "%s" % (self.L,)

    def TopK(self, p, K):
        """Return the first ``K`` items for user ``p``, or of the fallback ranking."""
        ranking = self.D[p] if p in self.D else self.L
        return ranking[:min(len(ranking), K)]
def train_UserCF(ratings_train, is_absolute):
    """Train the user-based collaborative-filtering recommender.

    For every training user, other users are ranked by item overlap, the scores
    of the top-ranked users' items are summed, and the items the user has not
    rated are listed by that sum (highest first).

    Args:
        ratings_train: Re-iterable collection of edges exposing ``user_id``,
            ``item_id`` and ``score``.
        is_absolute: When true, similarity is the raw overlap size; otherwise
            the overlap is divided by the other user's item count.

    Returns:
        A ``parameter_UserCF`` with ``D`` (user -> personal ranking) and ``L``
        (items ranked by total training score, used as fallback).
    """
    started = time.perf_counter()

    # user -> {item: score}; the first score seen for a (user, item) pair wins.
    scores_by_user = {}
    for edge in ratings_train:
        scores_by_user.setdefault(edge.user_id, {}).setdefault(edge.item_id, edge.score)

    # user -> set of rated items.
    items_by_user = {}
    for edge in ratings_train:
        items_by_user.setdefault(edge.user_id, set()).add(edge.item_id)

    # Original comment: "pick the ten nearest neighbours and recommend by score,
    # high to low". The slice keeps 11 entries — presumably because the user
    # itself normally ranks among its own nearest neighbours.
    personal = {}
    for user, own_items in items_by_user.items():
        if is_absolute:
            similarity = {
                other: len(own_items & other_items)
                for other, other_items in items_by_user.items()
            }
        else:
            similarity = {
                other: len(own_items & other_items) / len(other_items)
                for other, other_items in items_by_user.items()
            }
        neighbours = sorted(similarity, key=similarity.get, reverse=True)[:11]

        pooled = {}
        for neighbour in neighbours:
            for item, score in scores_by_user[neighbour].items():
                pooled[item] = pooled.get(item, 0) + score
        by_pooled_score = sorted(pooled, key=pooled.get, reverse=True)
        personal[user] = [item for item in by_pooled_score if item not in own_items]

    # Global fallback: items ranked by the sum of all their training scores.
    totals = {}
    for edge in ratings_train:
        totals[edge.item_id] = totals.get(edge.item_id, 0) + edge.score

    model = parameter_UserCF()
    model.L = sorted(totals, key=totals.get, reverse=True)
    model.D = personal
    print("\ntrain_UserCF:", time.perf_counter() - started)
    return model
class parameter_ItemCF:
    """Model holder for item-based CF: per-user rankings plus a global fallback."""

    def __init__(self):
        # Fallback ranking used for users without a personal list.
        self.L = []
        # user id -> personal ranking of item ids, best first.
        self.D = {}

    def __repr__(self):
        return "%s" % (self.L,)

    def TopK(self, p, K):
        """Return the first ``K`` items for user ``p``, or of the fallback ranking."""
        if p not in self.D:
            return self.L[:min(len(self.L), K)]
        own_ranking = self.D[p]
        return own_ranking[:min(len(own_ranking), K)]
def train_ItemCF(ratings_train):
    """Train the item-based collaborative-filtering recommender.

    Item-item similarity is the Jaccard index of the sets of users who rated
    each item. For every training user, each item's score is the sum of its
    similarities to the items the user rated; items are listed by that score,
    highest first.

    Args:
        ratings_train: Re-iterable collection of edges exposing ``user_id``,
            ``item_id`` and ``score``.

    Returns:
        A ``parameter_ItemCF`` with ``D`` (user -> personal ranking) and ``L``
        (items ranked by total training score, used as fallback).
    """
    t0 = time.perf_counter()

    # item -> set of users who rated it.
    F_IU = {}
    for edge in ratings_train:
        F_IU.setdefault(edge.item_id, set()).add(edge.user_id)

    # Full item x item Jaccard matrix (quadratic in the number of items).
    C = {
        item: {
            item1: len(F_IU[item] & F_IU[item1]) / len(F_IU[item] | F_IU[item1])
            for item1 in F_IU
        }
        for item in F_IU
    }

    # user -> set of rated items.
    F_UI = {}
    for edge in ratings_train:
        F_UI.setdefault(edge.user_id, set()).add(edge.item_id)

    D = {}
    for user, seen in F_UI.items():
        count = {item: 0 for item in C}
        for item in seen:
            for item1, similarity in C[item].items():
                count[item1] += similarity
        # Already-rated items get -1 so they sort after every candidate; note
        # that they still remain at the tail of the list.
        for item in seen:
            count[item] = -1
        ranked = sorted(count.items(), key=lambda pair: pair[1], reverse=True)
        D[user] = [item for item, _ in ranked]

    # Global fallback: items ranked by the sum of all their training scores.
    D1 = {}
    for edge in ratings_train:
        D1[edge.item_id] = D1.get(edge.item_id, 0) + edge.score
    L = sorted(D1.items(), key=lambda pair: pair[1], reverse=True)

    # Return the ItemCF holder (the previous code instantiated parameter_UserCF).
    P = parameter_ItemCF()
    P.L = [item for item, _ in L]
    P.D = D
    print("\ntrain_ItemCF:", time.perf_counter() - t0)
    return P
if __name__ == '__main__':
    ratings_train, ratings_test = Edge_read(
        ratings_train_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv",
        ratings_test_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv",
    )

    # Other trainers, disabled in the original script:
    # P = train_items_popular(ratings_train, score_is_avg = False)
    # print(P.L)

    # P = train_random(ratings_train)
    # print(P.L)
    # print(P.TopK(p=1, K=10))
    # print(P.TopK(p=1, K=10))

    # Original note: "2058s" — presumably the runtime of the UserCF training below.
    # P = train_UserCF(ratings_train, is_absolute=False)

    P = train_ItemCF(ratings_train)
4、一些常用的评价指标
from ml_25m.TopN.Data_train import *
import math
def DCG(r, i):
    """Return the discounted gain ``(2**r - 1) / log2(i + 1)`` for rating ``r`` at position ``i``."""
    gain = pow(2, r) - 1
    discount = math.log2(i + 1)
    return gain / discount
def F_generate(ratings):
    """Index ratings as ``{user_id: {item_id: score}}``.

    When a (user, item) pair occurs more than once, the first score is kept.
    """
    by_user = {}
    for edge in ratings:
        user_scores = by_user.setdefault(edge.user_id, {})
        user_scores.setdefault(edge.item_id, edge.score)
    return by_user
def L_personalized_generation(F, P, p, K, is_sup):
    """Return the length-``K`` list to evaluate for user ``p``.

    With ``is_sup`` false this is the model's ``P.TopK(p, K)``. With ``is_sup``
    true the model is ignored and the user's own items from ``F[p]`` are
    returned, highest score first (the best list achievable for that user).
    """
    if not is_sup:
        return P.TopK(p, K)
    by_score = sorted(F[p].items(), key=lambda pair: pair[1], reverse=True)
    best_items = [item for item, _ in by_score]
    return best_items[:min(len(best_items), K)]
# 1 - CG: cumulative gain
def test_count_score_CG(ratings_test, P, K, is_print=False, is_sup=False):
    """Average cumulative gain over the test users.

    For each test user, sums the test scores of the evaluated items that the
    user actually rated; returns the mean of these sums over all test users.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best-scored items instead of the model.
    """
    started = time.perf_counter()
    truth = F_generate(ratings_test)
    total = 0
    for user, rated in truth.items():
        evaluated = L_personalized_generation(truth, P, user, K, is_sup)
        total += sum(rated[item] for item in evaluated if item in rated)
    mean_score = total / len(truth)
    if is_print:
        print("\nscore_CG:", mean_score)
        print("time_test_count_score_CG:", time.perf_counter() - started)
    return mean_score
# 2 - DCG: discounted cumulative gain
def test_count_score_DCG(ratings_test, P, K, is_print=False, is_sup=False):
    """Average discounted cumulative gain over the test users.

    Each evaluated item the user rated contributes ``DCG(score, position)``,
    with positions counted from 1; unrated items contribute nothing.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best-scored items instead of the model.
    """
    started = time.perf_counter()
    truth = F_generate(ratings_test)
    total = 0
    for user, rated in truth.items():
        evaluated = L_personalized_generation(truth, P, user, K, is_sup)
        total += sum(
            DCG(rated[item], position)
            for position, item in enumerate(evaluated, start=1)
            if item in rated
        )
    mean_score = total / len(truth)
    if is_print:
        print("\nscore_DCG:", mean_score)
        print("time_test_count_score_DCG:", time.perf_counter() - started)
    return mean_score
# IDCG: ideal DCG
# 3 - NDCG: normalized DCG
def test_count_score_NDCG(ratings_test, P, K, is_print=False, is_sup=False):
    """Average normalized DCG over the test users.

    Per user, the DCG of the evaluated list (unrated items count as 0) is
    divided by the DCG of the same scores sorted in descending order. Users
    whose ideal DCG is 0 add nothing to the total but still count in the mean.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best-scored items instead of the model.
    """
    started = time.perf_counter()
    truth = F_generate(ratings_test)
    total = 0
    for user, rated in truth.items():
        evaluated = L_personalized_generation(truth, P, user, K, is_sup)
        gains = [rated.get(item, 0) for item in evaluated]
        actual = sum(DCG(gain, position) for position, gain in enumerate(gains, start=1))
        # NOTE(review): the ideal ordering is built from the evaluated list's own
        # scores, not from the user's full set of test ratings.
        ideal_gains = sorted(gains, reverse=True)
        ideal = sum(DCG(gain, position) for position, gain in enumerate(ideal_gains, start=1))
        if ideal > 0:
            total += actual / ideal
    mean_score = total / len(truth)
    if is_print:
        print("\nscore_NDCG:", mean_score)
        print("time_test_count_score_NDCG:", time.perf_counter() - started)
    return mean_score
# The hit-based metrics below depend only on whether an item appears in the
# user's test ratings, not on the rating value.
# 4 - precision
def test_count_score_precision(ratings_test, P, K, is_print=False, is_sup=False):
    """Average precision@K: hits in the evaluated list divided by ``K``.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list (also the denominator).
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best-scored items instead of the model.
    """
    started = time.perf_counter()
    truth = F_generate(ratings_test)
    total = 0
    for user, rated in truth.items():
        evaluated = L_personalized_generation(truth, P, user, K, is_sup)
        hits = sum(1 for item in evaluated if item in rated)
        total += hits / K
    mean_score = total / len(truth)
    if is_print:
        print("\nscore_precision:", mean_score)
        print("time_test_count_score_precision:", time.perf_counter() - started)
    return mean_score
# 5 - recall
def test_count_score_recall(ratings_test, P, K, is_print=False, is_sup=False):
    """Average recall@K: hits in the evaluated list divided by the user's test-item count.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best-scored items instead of the model.
    """
    started = time.perf_counter()
    truth = F_generate(ratings_test)
    total = 0
    for user, rated in truth.items():
        evaluated = L_personalized_generation(truth, P, user, K, is_sup)
        hits = sum(1 for item in evaluated if item in rated)
        total += hits / len(rated)
    mean_score = total / len(truth)
    if is_print:
        print("\nscore_recall:", mean_score)
        print("time_test_count_score_recall:", time.perf_counter() - started)
    return mean_score
# 6 - K_Call
def test_count_score_K_Call(ratings_test, P, K, is_print=False, is_sup=False, K_K_Call=10):
    """Fraction of test users whose evaluated list contains at least ``K_K_Call`` hits.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best-scored items instead of the model.
        K_K_Call: Minimum number of hits for a user to count as a success.
            Defaults to 10, the value that used to be hard-coded. A list
            shorter than ``K_K_Call`` can never succeed.
    """
    t0 = time.perf_counter()
    F = F_generate(ratings_test)
    score_tot = 0
    for p in F:
        res_P = L_personalized_generation(F, P, p, K, is_sup)
        hits = sum(1 for item in res_P if item in F[p])
        if hits >= K_K_Call:
            score_tot += 1
    if is_print:
        print("\nscore_K_Call:", score_tot / len(F))
        print("\ntime_test_count_score_K_Call:", time.perf_counter() - t0)
    return score_tot / len(F)
def items_generate(ratings):
    """Index ratings as ``{item_id: set of user_ids who rated it}``."""
    raters_by_item = {}
    for edge in ratings:
        raters_by_item.setdefault(edge.item_id, set()).add(edge.user_id)
    return raters_by_item
def correlation_generate(items):
    """Return the Jaccard similarity of the user sets for every ordered item pair.

    Args:
        items: ``{item_id: set of user_ids}``.

    Returns:
        ``{(item_a, item_b): |A & B| / |A | B|}`` containing both orderings of
        every pair, including each item paired with itself.
    """
    similarity = {}
    for item_a, users_a in items.items():
        for item_b, users_b in items.items():
            if (item_a, item_b) in similarity:
                continue  # already filled in when the mirrored pair was visited
            jaccard = len(users_a & users_b) / len(users_a | users_b)
            similarity[item_a, item_b] = jaccard
            similarity[item_b, item_a] = jaccard
    return similarity
# 7 - IntraSim
def test_count_diversity_IntraSim(ratings_test, P, K, is_print=False):
    """Average intra-list similarity of the recommendation lists of the test users.

    Item similarity is the Jaccard index of the users who rated the items in the
    test set. For each user the similarities of all ordered pairs of distinct
    positions in the list are summed and divided by ``2 * K * (K - 1)``.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the recommendation list.
        is_print: Print the score and the elapsed time.
    """
    t0 = time.perf_counter()
    items = items_generate(ratings_test)
    C = correlation_generate(items)
    F = F_generate(ratings_test)
    score_tot = 0
    for p in F:
        res_P = P.TopK(p, K)
        if K > 1:
            # Iterate over the list actually returned: TopK may yield fewer than
            # K items, and indexing up to K raised IndexError.
            n = len(res_P)
            # Recommended items come from the training data and may be absent
            # from the test set; such pairs have no test co-raters and count as 0
            # instead of raising KeyError.
            pair_sum = sum(
                C.get((res_P[i], res_P[j]), 0)
                for i in range(n)
                for j in range(n)
                if i != j
            )
            # NOTE(review): ordered pairs are summed (each unordered pair twice)
            # and then divided by 2*K*(K-1); confirm this is the intended scale.
            score_tot += pair_sum / 2 / K / (K - 1)
    if is_print:
        print("\nscore_test_count_diversity_IntraSim:", score_tot / len(F))
        print("time_test_count_diversity_IntraSim:", time.perf_counter() - t0)
    return score_tot / len(F)
# 8 - InterDiv
def test_count_diversity_InterDiv(ratings_test, P, K, is_print=False):
    """Inter-user diversity of the recommendation lists of the test users.

    For every unordered pair of users, ``(K - |overlap of their lists|) / K`` is
    accumulated; the total is divided by ``n * (n - 1)`` for ``n`` test users.
    With fewer than two users there are no pairs and the score is ``0.0``.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the recommendation list.
        is_print: Print the score and the elapsed time.
    """
    t0 = time.perf_counter()
    F = F_generate(ratings_test)
    L_res = [set(P.TopK(p, K)) for p in F]
    n_users = len(L_res)
    score_tot = 0.0
    # Guard: dividing by n * (n - 1) raised ZeroDivisionError for 0 or 1 users.
    if n_users > 1:
        for i in range(n_users - 1):
            for j in range(i + 1, n_users):
                score_tot += (K - len(L_res[i] & L_res[j])) / K
        # NOTE(review): unordered pairs are summed but the divisor n*(n-1) is the
        # number of ordered pairs; confirm this is the intended scale.
        score_tot = score_tot / n_users / (n_users - 1)
    if is_print:
        print("\nscore_test_count_diversity_InterDiv:", score_tot)
        print("time_test_count_diversity_InterDiv:", time.perf_counter() - t0)
    return score_tot
# 9 - COV
def test_count_diversity_COV(ratings_test, P, K, is_print=False):
    """Coverage: distinct items recommended to the test users over distinct test items.

    Args:
        ratings_test: Test edges (``user_id``, ``item_id``, ``score``).
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the recommendation list.
        is_print: Print the score and the elapsed time.
    """
    started = time.perf_counter()
    test_items = items_generate(ratings_test)
    truth = F_generate(ratings_test)
    recommended = set()
    for user in truth:
        recommended = recommended | set(P.TopK(user, K))
    coverage = len(recommended) / len(test_items)
    if is_print:
        print("\nscore_test_count_diversity_COV:", coverage)
        print("time_test_count_diversity_COV:", time.perf_counter() - started)
    return coverage
if __name__ == '__main__':
    ratings_train, ratings_test = Edge_read(
        ratings_train_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv",
        ratings_test_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv",
    )
    P = train_items_popular(ratings_train, score_is_avg=False)

    # CG of the popularity model, followed by the value obtained with
    # is_sup=True (each user's own best-scored test items).
    score_CG = test_count_score_CG(ratings_test, P, K=10, is_print=True)
    score_sup_CG = test_count_score_CG(ratings_test, P, K=10, is_print=False, is_sup=True)
    print("score_sup_CG:", score_sup_CG)

    # The remaining metrics are disabled in the original script:
    # score_DCG = test_count_score_DCG(ratings_test, P, K=100, is_print=True)
    # score_sup_DCG = test_count_score_DCG(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_DCG:", score_sup_CG)
    #
    # score_NDCG = test_count_score_NDCG(ratings_test, P, K=100, is_print=True)
    # score_sup_NDCG = test_count_score_NDCG(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_NDCG:", score_sup_NDCG)
    #
    # score_precision = test_count_score_precision(ratings_test, P, K=100, is_print=True)
    # score_sup_precision = test_count_score_precision(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_precision:", score_sup_precision)
    #
    # score_recall = test_count_score_recall(ratings_test, P, K=100, is_print=True)
    # score_sup_recall = test_count_score_recall(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_recall:", score_sup_recall)
    #
    # score_K_Call = test_count_score_K_Call(ratings_test, P, K=100, is_print=True)
    # score_sup_K_Call = test_count_score_K_Call(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_K_Call:", score_sup_K_Call)
    #
    # score_intraSim = test_count_diversity_IntraSim(ratings_test, P, K=100, is_print=True)
    # score_InterDiv = test_count_diversity_InterDiv(ratings_test, P, K=100, is_print=True)
    # score_COV = test_count_diversity_COV(ratings_test, P, K=100, is_print=True)
5、评估贪心、随机、UserCF、ItemCF四种方法在不同K取值下的表现,这里评价指标采用准确率、召回率和覆盖度
from ml_25m.TopN.Data_test import *
def normal_num(num, n):
    """Round ``num`` to ``n`` decimal places by scaling with ``10**n``, rounding, and scaling back."""
    scale = int(pow(10, n))
    scaled = round(num * scale)
    return scaled / scale
def test_and_print(P, ratings_test):
    """Evaluate model ``P`` at K = 5, 10, 20, 40, 80, 160 and print the results.

    Prints three lists, one value per K: precision and recall rounded to two
    decimals, and coverage rounded to four decimals.
    """
    precision_by_k = []
    recall_by_k = []
    coverage_by_k = []
    for K in (5, 10, 20, 40, 80, 160):
        # The three metrics are evaluated in this order for each K.
        precision = test_count_score_precision(ratings_test, P, K=K)
        recall = test_count_score_recall(ratings_test, P, K=K)
        coverage = test_count_diversity_COV(ratings_test, P, K=K)
        precision_by_k.append(normal_num(precision, 2))
        recall_by_k.append(normal_num(recall, 2))
        coverage_by_k.append(normal_num(coverage, 4))
    print("\nL_precision: \n", precision_by_k)
    print("\nL_recall: \n", recall_by_k)
    print("\nL_COV: \n", coverage_by_k)
if __name__ == '__main__':
    ratings_train, ratings_test = Edge_read(
        ratings_train_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv",
        ratings_test_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv",
    )

    # 1 - items_popular
    P = train_items_popular(ratings_train, score_is_avg=False)
    test_and_print(P, ratings_test)

    # The other models are disabled in the original script; enable one at a time.
    #
    # # 2 - random
    # P = train_random(ratings_train)
    # test_and_print(P, ratings_test)
    #
    # # 3 - UserCF
    # P = train_UserCF(ratings_train, is_absolute=True)
    # test_and_print(P, ratings_test)
    # P = train_UserCF(ratings_train, is_absolute=False)
    # test_and_print(P, ratings_test)

    # # 4 - itemCF
    # P = train_ItemCF(ratings_train)
    # test_and_print(P, ratings_test)
"""
K 5 10 20 40 80 160
precision items_popular 0.17 0.16 0.14 0.12 0.16 0.13
random 0.04 0.04 0.04 0.04 0.04 0.04
UserCF_abs 0.19 0.17 0.15 0.13 0.16 0.13
UserCF 0.18, 0.17, 0.14, 0.12, 0.15, 0.13
itemCF 0.18, 0.17, 0.14, 0.13, 0.16, 0.14
recall items_popular 0.01 0.02 0.03 0.05 0.22 0.39
random 0 0.01 0.01 0.03 0.05 0.11
UserCF_abs 0.01 0.02 0.04 0.07 0.23 0.39
UserCF 0.01 0.02 0.03 0.06 0.21 0.35
itemCF 0.01, 0.02, 0.04, 0.06, 0.23, 0.4
COV items_popular 0.003 0.006 0.012 0.0239 0.0478 0.0957
random 0.8272 0.8272 0.8272 0.8272 0.8272 0.8272
UserCF_abs 0.1818 0.2428 0.326 0.4234 0.5167 0.6136
UserCF 0.1519 0.2339 0.3278 0.4276 0.4725 0.4791
itemCF 0.128, 0.1669, 0.2279, 0.314, 0.4061, 0.5042
"""
准确率反映的是推荐列表中用户感兴趣的商品所占的比例,其与K大致成负相关;这里平均准确率:UserCF_abs > itemCF > UserCF > items_popular > random
召回率反映的是推荐列表中命中的商品占用户全部感兴趣商品的比例,其与K成正相关;这里平均召回率:UserCF_abs > itemCF > items_popular > UserCF > random
覆盖度反映的是所有推荐结果(去重后)占总商品的比例,其与K成正相关;这里平均覆盖度:random > UserCF_abs > UserCF > itemCF > items_popular