import random
import math
class ItemCFMovies:
    """Item-based collaborative filtering recommender with offline metrics.

    Construction loads the ratings file, splits it into a train and a test
    set, and builds an item-to-item similarity table from the train set.
    The metric methods (precision, recall, coverage, popularity) recommend
    for every user of the train set and score the recommendations.

    Attributes:
        dataFilePath: path of the ratings file read by ``loadData``.
        splitDataProportion: value supplied by the caller; it is stored but
            not consulted by the split, which always runs ``splitData(3, 47)``.
        data: list of ``(userId, itemId, record)`` tuples.
        train, test: ``{user: {item: record}}`` dictionaries.
        itemSim: ``{item: {other_item: similarity}}`` dictionary.
    """

    def __init__(self, dataFilePath, splitDataProportion):
        self.dataFilePath = dataFilePath
        self.splitDataProportion = splitDataProportion
        self.data = self.loadData()
        self.train, self.test = self.splitData(3, 47)
        self.itemSim = self.itemSimilarity()

    def loadData(self):
        """Read the ratings file into a list of (userId, itemId, record).

        Every non-blank line must hold four ``::``-separated fields; the
        third is converted to ``int`` and the fourth is ignored.

        Returns:
            list of ``(userId, itemId, record)`` tuples in file order.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a non-blank line does not have exactly four
                fields or its third field is not an integer.
        """
        print("加载数据中.....")
        data = []
        # The context manager closes the file even if a line fails to parse.
        with open(self.dataFilePath, encoding="utf-8") as ratings_file:
            for line in ratings_file:
                # A blank line (typically a trailing one) carries no rating.
                if not line.strip():
                    continue
                userId, itemId, record, _ = line.split('::')
                data.append((userId, itemId, int(record)))
        print("加载数据结束.....")
        return data

    def splitData(self, k, seed, M=9):
        """Split ``self.data`` into a train set and a test set.

        For every rating an integer is drawn uniformly from ``[0, M]``; the
        rating goes to the test set when the draw equals ``k`` and to the
        train set otherwise.

        Args:
            k: the draw value that sends a rating to the test set.
            seed: seed passed to ``random.seed`` (this reseeds the
                module-level generator of ``random``).
            M: inclusive upper bound of the random draw.

        Returns:
            ``(train, test)``, each a ``{user: {item: record}}`` dictionary.
        """
        print("数据集切分中.....")
        train, test = {}, {}
        random.seed(seed)
        for user, item, record in self.data:
            target = test if random.randint(0, M) == k else train
            target.setdefault(user, {})[item] = record
        print("数据集切分结束.....")
        return train, test

    def itemSimilarity(self):
        """Build the item-to-item similarity table from the train set.

        The similarity of items i and j is the number of users with a
        positive record for both, divided by
        ``sqrt(users_with_positive_i * users_with_positive_j)`` and rounded
        to two decimals. An item's similarity with itself is 0.

        Returns:
            ``{item: {other_item: similarity}}``; every pair of items that
            appears together in some user's train records has an entry.
        """
        print("相似度计算中.....")
        item_action_user_count = {}
        count_matrix = {}
        for ratings in self.train.values():
            for i, rating_i in ratings.items():
                item_action_user_count.setdefault(i, 0)
                if rating_i > 0:
                    item_action_user_count[i] += 1
                row = count_matrix.setdefault(i, {})
                for j, rating_j in ratings.items():
                    # The entry is created even for i == j so that every
                    # item seen in train owns a (possibly all-zero) row.
                    row.setdefault(j, 0)
                    if i != j and rating_i > 0 and rating_j > 0:
                        row[j] += 1

        item_sim = {}
        for i, related_items in count_matrix.items():
            sim_row = item_sim.setdefault(i, {})
            for j, co_count in related_items.items():
                denominator = math.sqrt(
                    item_action_user_count[i] * item_action_user_count[j]
                )
                # An item with no positive record has a zero denominator;
                # report similarity 0 instead of dividing by zero.
                if denominator:
                    sim_row[j] = round(co_count / denominator, 2)
                else:
                    sim_row[j] = 0.0
        print("相似度计算结束.....")
        return item_sim

    def recommend(self, user, k=8, itemCount=40):
        """Recommend items for ``user``.

        For each item in the user's train records, its ``k`` most similar
        items are taken; those the user does not already have accumulate
        ``record * similarity`` as their score.

        Args:
            user: user identifier; an unknown user yields no recommendation.
            k: number of nearest items considered per seed item.
            itemCount: maximum number of items returned.

        Returns:
            ``{item: score}`` holding the ``itemCount`` best-scored items,
            ordered by descending score.
        """
        result = {}
        u_items = self.train.get(user, {})
        for i, pi in u_items.items():
            # .get tolerates a similarity table that has no row for i.
            neighbours = sorted(
                self.itemSim.get(i, {}).items(),
                key=lambda pair: pair[1],
                reverse=True,
            )[:k]
            for j, wj in neighbours:
                if j in u_items:
                    continue
                result[j] = result.get(j, 0) + pi * wj
        ranked = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked[:itemCount])

    def precision(self, k=8, itemCount=40):
        """Return hits divided by ``itemCount`` times the number of train users.

        A hit is a recommended item that is present in the user's test
        records. The result is rounded to two decimals; it is 0.0 when the
        denominator is zero.
        """
        print("准确率计算中.....")
        hit, total = 0, 0
        for user in self.train:
            Tu = self.test.get(user, {})
            result = self.recommend(user, k=k, itemCount=itemCount)
            hit += sum(1 for item in result if item in Tu)
            total += itemCount
        print("准确率计算结束.....")
        return round(hit / total, 2) if total else 0.0

    def recall(self, k=8, itemCount=40):
        """Return hits divided by the total size of the users' test records.

        Only users of the train set are visited. The result is rounded to
        two decimals; it is 0.0 when those users have no test records.
        """
        print("召回率计算中.....")
        hit, total = 0, 0
        for user in self.train:
            Tu = self.test.get(user, {})
            result = self.recommend(user, k=k, itemCount=itemCount)
            hit += sum(1 for item in result if item in Tu)
            total += len(Tu)
        print("召回率计算结束.....")
        return round(hit / total, 2) if total else 0.0

    def coverage(self, k=8, itemCount=40):
        """Return distinct recommended items over distinct train items.

        The result is rounded to two decimals; it is 0.0 when the train set
        holds no item.
        """
        print("覆盖率计算中.....")
        recommended_items, all_items = set(), set()
        for user, items in self.train.items():
            all_items.update(items)
            recommended_items.update(
                self.recommend(user, k=k, itemCount=itemCount)
            )
        print("覆盖率计算结束.....")
        if not all_items:
            return 0.0
        return round(len(recommended_items) / len(all_items), 2)

    def popularity(self, k=8, itemCount=40):
        """Return the mean of ``log(1 + popularity)`` over all recommendations.

        An item's popularity is the number of train users that have a
        record for it. The result is rounded to two decimals; it is 0.0
        when nothing is recommended.
        """
        print("新颖度计算中.....")
        item_popularity = {}
        for items in self.train.values():
            for item in items:
                item_popularity[item] = item_popularity.get(item, 0) + 1

        popularity, n = 0, 0
        for user in self.train:
            result = self.recommend(user, k, itemCount)
            for i in result:
                popularity += math.log(1 + item_popularity[i])
                n += 1
        print("新颖度计算结束.....")
        return round(popularity / n, 2) if n else 0.0
def run_evaluation_round(model, label, k_values, item_count=None):
    """Evaluate ``model`` for each k in ``k_values`` and print the metrics.

    Args:
        model: object exposing ``precision``, ``recall``, ``coverage`` and
            ``popularity``, each callable as ``metric(k, itemCount)``.
        label: text printed in the opening and closing banners, in front of
            the list of k values.
        k_values: list of neighbourhood sizes to evaluate.
        item_count: number of recommended items used for every k; when
            ``None`` the current k is used as the item count as well.
    """
    print("====================================={0} = {1}计算开始=====================================".format(label, k_values))
    for k in k_values:
        n_items = k if item_count is None else item_count
        print("=======================k = {0}, itemCount = {1}===================================".format(k, n_items))
        precision = model.precision(k, n_items)
        recall = model.recall(k, n_items)
        coverage = model.coverage(k, n_items)
        popularity = model.popularity(k, n_items)
        print("推荐的准确率 {}".format(precision))
        print("推荐的召回率 {}".format(recall))
        print("推荐的覆盖率 {}".format(coverage))
        print("推荐的新颖度 {}".format(popularity))
        print("==========================================================")
    print("====================================={0} = {1}计算结束=====================================".format(label, k_values))


if __name__ == '__main__':
    it = ItemCFMovies('../data/ratings.dat', [1, 9])

    # Round 1: the item count follows k.
    run_evaluation_round(it, "第一轮list", [5, 15, 20, 25])
    # Recorded results. The four numeric columns appear to follow the print
    # order: precision, recall, coverage, popularity (confirm before reuse).
    # k = 5   itemCount = 40  0.11  0.26  0.48  6.94
    # k = 10  itemCount = 40  0.11  0.27  0.39  7.05
    # k = 15  itemCount = 40  0.11  0.27  0.35  7.12
    # k = 30  itemCount = 40  0.11  0.26  0.30  7.21
    # k = 40  itemCount = 40  0.11  0.26  0.28  7.24
    # k = 50  itemCount = 40  0.11  0.25  0.28  7.26
    # k = 65  itemCount = 40  0.10  0.25  0.26  7.28
    # k = 80  itemCount = 40  0.10  0.24  0.25  7.29
    # k = 120 itemCount = 40  0.10  0.23  0.23  7.3
    # k = 5   itemCount = 5   0.22  0.07  0.14  7.33
    # k = 10  itemCount = 10  0.19  0.11  0.19  7.32
    # k = 15  itemCount = 15  0.16  0.15  0.21  7.32
    # k = 20  itemCount = 20  0.15  0.18  0.23  7.30
    # k = 25  itemCount = 25  0.13  0.20  0.25  7.28
    # k = 30  itemCount = 30  0.12  0.22  0.26  7.27
    # k = 50  itemCount = 50  0.10  0.29  0.30  7.20
    # k = 80  itemCount = 80  0.07  0.36  0.34  7.12
    # k = 120 itemCount = 120 0.06  0.43  0.37  7.03

    # Round 2: itemCount fixed at 5.
    run_evaluation_round(it, "第二轮list1", [5, 10, 15, 20, 40, 60, 80, 120], 5)
    # k = 5   itemCount = 5  0.22  0.07  0.14  7.33
    # k = 10  itemCount = 5  0.22  0.07  0.13  7.4
    # k = 15  itemCount = 5  0.22  0.07  0.12  7.45
    # k = 20  itemCount = 5  0.22  0.07  0.12  7.47
    # k = 40  itemCount = 5  0.22  0.07  0.11  7.51
    # k = 60  itemCount = 5  0.22  0.06  0.10  7.52
    # k = 80  itemCount = 5  0.21  0.06  0.10  7.51
    # k = 120 itemCount = 5  0.20  0.06  0.09  7.5

    # Round 3: itemCount fixed at 10.
    run_evaluation_round(it, "第三轮list2", [5, 10, 15, 20, 40, 60, 80, 120], 10)
    # k = 5   itemCount = 10  0.18  0.11  0.21  7.23
    # k = 10  itemCount = 10  0.19  0.11  0.19  7.32
    # k = 15  itemCount = 10  0.19  0.11  0.17  7.38
    # k = 20  itemCount = 10  0.18  0.11  0.16  7.4
    # k = 40  itemCount = 10  0.18  0.11  0.15  7.45
    # k = 60  itemCount = 10  0.18  0.11  0.14  7.46
    # k = 80  itemCount = 10  0.17  0.10  0.13  7.46
    # k = 120 itemCount = 10  0.17  0.10  0.12  7.45

    # Round 4: itemCount fixed at 15.
    run_evaluation_round(it, "第四轮list3", [5, 10, 15, 20, 40, 60, 80, 120], 15)
    # k = 5   itemCount = 15  0.16  0.14  0.26  7.16
    # k = 10  itemCount = 15  0.16  0.15  0.23  7.26
    # k = 15  itemCount = 15  0.16  0.15  0.21  7.32
    # k = 20  itemCount = 15  0.16  0.15  0.20  7.34
    # k = 40  itemCount = 15  0.16  0.14  0.18  7.4
    # k = 60  itemCount = 15  0.15  0.14  0.17  7.42
    # k = 80  itemCount = 15  0.15  0.13  0.16  7.43
    # k = 120 itemCount = 15  0.14  0.13  0.14  7.42

    # Round 5: itemCount fixed at 20.
    run_evaluation_round(it, "第五轮list4", [5, 10, 15, 20, 40, 60, 80, 120], 20)
    # k = 5   itemCount = 20  0.14  0.17  0.31  7.11
    # k = 10  itemCount = 20  0.15  0.18  0.27  7.21
    # k = 15  itemCount = 20  0.15  0.18  0.25  7.27
    # k = 20  itemCount = 20  0.15  0.18  0.23  7.3
    # k = 40  itemCount = 20  0.14  0.17  0.21  7.36
    # k = 60  itemCount = 20  0.14  0.16  0.19  7.39

    # Conclusion drawn from the recorded results:
    #   k = [40, 60, 80], itemCount = 5
    #   k values of 40, 60 and 80 are the more suitable ones;
    #   an itemCount of 5 is the more suitable one.