65、Top-N推荐系统的协同过滤方法

最新推荐文章于 2023-12-29 15:29:08 发布

chaoyuzhang

最新推荐文章于 2023-12-29 15:29:08 发布

阅读量1.4k

点赞数

分类专栏：运筹优化、机器学习基础算法实现

本文链接：https://blog.csdn.net/chaoyuzhang/article/details/110394147

版权

运筹优化、机器学习基础算法实现专栏收录该内容

42 篇文章 9 订阅

订阅专栏

数据为MovieLens（ml-25m）数据集，这里仅使用其中的ratings数据

1、数据按年份切分

import re
from datetime import datetime
import time
import pandas as pd

def data_read(ratings_path=r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings.csv"):
    """Load a ratings CSV into a list of ``[int, int, float, int]`` rows.

    The first line of the file is treated as a header and skipped. Each
    following line is split on commas and its first four fields are converted
    to ``int, int, float, int`` (userId, movieId, rating, timestamp according
    to the header written by the CSV exporter in this script).

    Args:
        ratings_path: Path of the CSV file. Defaults to the location used by
            the original script.

    Returns:
        The parsed rows, in file order. Reading stops at the first line that
        contains no comma (for example a blank line) or at end of file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to a number.
    """
    t0 = time.perf_counter()

    ratings = []
    # The file is large, so it is streamed line by line.
    with open(ratings_path) as f:
        next(f, None)  # skip the header line (tolerates an empty file)
        for line in f:
            row = line.split(',')
            if len(row) <= 1:
                # A line without any comma marks the end of the data.
                break
            ratings.append([int(row[0]), int(row[1]), float(row[2]), int(row[3])])

    print("data_read:", time.perf_counter() - t0)

    return ratings

def times_generate():
    """Build the year labels "1995".."2019" and the start second of each year.

    Returns:
        A pair ``(times, times_second)``: ``times`` holds the year labels as
        strings, ``times_second`` the integer result of ``time.mktime`` for
        January 1st, 00:00 of the matching year.
    """
    started = time.perf_counter()

    times = []
    times_second = []
    for year in range(1995, 2020):
        label = str(year)
        year_start = datetime.strptime(label, "%Y")
        times.append(label)
        # Whole seconds; a millisecond variant would scale this value by 1000.
        times_second.append(int(time.mktime(year_start.timetuple())))

    print("times_generate:", time.perf_counter() - started)
    return times, times_second

def ratings_year_generate(ratings, times_second):
    """Bucket rating rows by the time interval their timestamp falls into.

    Bucket ``i`` receives the rows with
    ``times_second[i] <= row[3] < times_second[i + 1]``; the last bucket
    receives every row with ``row[3] >= times_second[-1]``. Rows older than
    ``times_second[0]`` are dropped. Row order inside a bucket follows the
    order of ``ratings``.

    Args:
        ratings: Rows whose element at index 3 is the timestamp.
        times_second: Interval start values, sorted in ascending order.

    Returns:
        A list with one bucket (list of rows) per entry of ``times_second``;
        an empty list when ``times_second`` is empty.
    """
    # Local import keeps the block self-contained.
    from bisect import bisect_right

    t0 = time.perf_counter()
    ratings_year = [[] for _ in times_second]
    # One pass over the rows with a binary search per row, instead of
    # rescanning the full list once for every interval.
    for row in ratings:
        idx = bisect_right(times_second, row[3]) - 1
        if idx >= 0:  # idx == -1 means the row predates the first boundary
            ratings_year[idx].append(row)
    print("ratings_year_generate:", time.perf_counter() - t0)
    return ratings_year

#存不完
def data_output_xlsx(path, res, res_name):
    """Write each group of rating rows to its own ``.xlsx`` workbook.

    For every index ``i`` the rows ``res[i]`` are written to
    ``path + "\\" + res_name[i] + ".xlsx"`` on a sheet named ``res_name[i]``
    with the columns userId, movieId, rating, timestamp.

    NOTE(review): the original author marked this exporter as unable to store
    everything -- presumably because a worksheet cannot hold the larger
    yearly groups; confirm before relying on it.

    Args:
        path: Existing output directory.
        res: List of row groups; each row has four values.
        res_name: File/sheet name for each group, aligned with ``res``.
    """
    t0 = time.perf_counter()
    columns = ["userId", "movieId", "rating", "timestamp"]
    for i, rows in enumerate(res):
        name = res_name[i]
        # Passing the columns to the constructor also works for an empty group.
        df = pd.DataFrame(rows, columns=columns)
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # no longer exists in current pandas releases.
        with pd.ExcelWriter(path + "\\" + name + ".xlsx") as writer:
            df.to_excel(writer, sheet_name=name)
    print("data_output:", time.perf_counter() - t0)

def data_output_csv(path, res, res_name):
    """Write each group of rating rows to its own CSV file.

    For every index ``i`` the rows ``res[i]`` are written to
    ``path + "\\" + res_name[i] + ".csv"``, preceded by the header line
    ``userId,movieId,rating,timestamp``.

    Args:
        path: Existing output directory.
        res: List of row groups; each row is an iterable of values that are
            written via ``str``.
        res_name: File name (without extension) for each group, aligned with
            ``res``.
    """
    t0 = time.perf_counter()
    header = ",".join(["userId", "movieId", "rating", "timestamp"]) + "\n"
    for i, rows in enumerate(res):
        # ``with`` guarantees the handle is closed even if a write fails.
        with open(path + "\\" + res_name[i] + ".csv", "w") as f:
            f.write(header)
            f.writelines(",".join([str(s) for s in row]) + "\n" for row in rows)
    print("data_output:", time.perf_counter() - t0)

if __name__ == '__main__':

    # Pipeline: load all ratings, derive the yearly boundaries, bucket the
    # rows by year and write one CSV per year label.
    output_dir = "D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year"

    ratings = data_read()
    times, times_second = times_generate()
    ratings_year = ratings_year_generate(ratings, times_second)
    data_output_csv(output_dir, ratings_year, times)

2、数据录入

import time
import re

class Edge:
    """A single rating record linking a user to an item with a score."""

    def __init__(self, user_id, item_id, score):
        self.user_id, self.item_id, self.score = user_id, item_id, score

    def __repr__(self):
        # Rendered as "[user_id, item_id, score]".
        return "[{}, {}, {}]".format(self.user_id, self.item_id, self.score)

def _read_edge_file(csv_path):
    """Parse one ratings CSV (header line first) into a list of Edge objects."""
    edges = []
    # The file is large, so it is consumed one line at a time.
    with open(csv_path) as handle:
        handle.readline()  # discard the header
        while True:
            fields = re.split(',', handle.readline())
            if len(fields) <= 1:
                # No comma on the line: end of file or an empty line.
                break
            edges.append(Edge(int(fields[0]), int(fields[1]), float(fields[2])))
    return edges


def Edge_read(ratings_train_path, ratings_test_path):
    """Load the training and the test ratings as lists of Edge objects.

    Only the first three columns of every row are used (converted to int,
    int, float and passed to ``Edge``); any further columns are ignored.

    Args:
        ratings_train_path: CSV file with the training ratings.
        ratings_test_path: CSV file with the test ratings.

    Returns:
        ``(ratings_train, ratings_test)``, each a list of Edge in file order.
    """
    started = time.perf_counter()

    ratings_train = _read_edge_file(ratings_train_path)
    ratings_test = _read_edge_file(ratings_test_path)

    print("time_data_read:", time.perf_counter() - started)

    return ratings_train, ratings_test


if __name__ == '__main__':

    # Smoke test: load one training year and one test year, then show the
    # size and the first ten edges of each.
    train_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv"
    test_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv"

    ratings_train, ratings_test = Edge_read(ratings_train_path=train_csv,
                                            ratings_test_path=test_csv)

    print("len(ratings_train):", len(ratings_train))
    print("ratings_train:", ratings_train[:10])

    print("len(ratings_test):", len(ratings_test))
    print("ratings_test:", ratings_test[:10])

3、贪心、随机、UserCF、ItemCF

from ml_25m.Data_read import *
import random as rd
import math

class parameter_items_popular:
    """Model of the popularity baseline: one ranking shared by every user."""

    def __init__(self):
        # Ranked item list; filled in by train_items_popular.
        self.L = []

    def __repr__(self):
        return str(self.L)

    def TopK(self, p, K):
        """Return the first K entries of the ranking.

        ``p`` (the user) is accepted for interface compatibility with the
        other models and is not used.
        """
        cutoff = min(len(self.L), K)
        return self.L[:cutoff]

def train_items_popular(ratings_train, score_is_avg = False):
    """Rank all training items by their total (or average) score.

    Args:
        ratings_train: Iterable of edges exposing ``item_id`` and ``score``.
        score_is_avg: When true, rank by the mean score per item instead of
            the summed score.

    Returns:
        A ``parameter_items_popular`` whose ``L`` lists the item ids from the
        highest to the lowest score; ties keep first-seen order.
    """
    started = time.perf_counter()

    totals = {}
    counts = {}
    for edge in ratings_train:
        item = edge.item_id
        totals[item] = totals.get(item, 0) + edge.score
        counts[item] = counts.get(item, 0) + 1

    if score_is_avg:
        scores = {item: totals[item] / counts[item] for item in totals}
    else:
        scores = totals

    P = parameter_items_popular()
    # sorted() is stable, so equally scored items stay in insertion order.
    P.L = sorted(scores, key=scores.get, reverse=True)

    print("\ntime_train_items_popular:", time.perf_counter() - started)

    return P

class parameter_random:
    """Model of the random baseline: recommendations are sampled from L."""

    def __init__(self):
        # Candidate pool; filled in by train_random.
        self.L = []

    def __repr__(self):
        return str(self.L)

    def TopK(self, p, K):
        """Return up to K entries drawn from the pool without replacement.

        ``p`` (the user) is accepted for interface compatibility and is not
        used.
        """
        sample_size = min(len(self.L), K)
        return rd.sample(self.L, sample_size)

def train_random(ratings_train):
    """Collect the distinct training items as the pool of the random baseline.

    Args:
        ratings_train: Iterable of edges exposing ``item_id``.

    Returns:
        A ``parameter_random`` whose ``L`` is a shuffled list of the distinct
        item ids.
    """
    t0 = time.perf_counter()

    items = list({edge.item_id for edge in ratings_train})
    # random.shuffle works in place and returns None, so its result must not
    # be assigned -- shuffle the list first, then store the list itself.
    rd.shuffle(items)

    P = parameter_random()
    P.L = items

    print("\ntime_train_random:", time.perf_counter() - t0)

    return P

class parameter_UserCF:
    """Model of the user-based CF recommender."""

    def __init__(self):
        # Global fallback ranking, used for users missing from D.
        self.L = []
        # Per-user ranking, keyed by user.
        self.D = {}

    def __repr__(self):
        return str(self.L)

    def TopK(self, p, K):
        """Return the first K entries of user p's ranking, or of the
        fallback ranking when p has no personal ranking."""
        ranking = self.D[p] if p in self.D else self.L
        return ranking[:min(len(ranking), K)]

def train_UserCF(ratings_train, is_absolute):
    """Train the user-based collaborative-filtering recommender.

    For every training user the most similar users are selected, their
    scores are summed per item, and the items the user has not rated yet are
    ranked by that sum. A global ranking by total score is stored as the
    fallback for users without a personal ranking.

    Args:
        ratings_train: Iterable of edges exposing ``user_id``, ``item_id``
            and ``score``.
        is_absolute: When true, similarity is the number of items both users
            rated; otherwise that number divided by the number of items the
            other user rated.

    Returns:
        A ``parameter_UserCF`` with ``D`` (per-user ranking) and ``L``
        (global ranking) filled in.
    """
    started = time.perf_counter()

    user_scores = {}   # user -> {item: first score seen for that pair}
    user_items = {}    # user -> set of rated items
    item_totals = {}   # item -> sum of every score it received
    for edge in ratings_train:
        user, item = edge.user_id, edge.item_id
        user_scores.setdefault(user, {}).setdefault(item, edge.score)
        user_items.setdefault(user, set()).add(item)
        item_totals[item] = item_totals.get(item, 0) + edge.score

    D = {}
    for user, own_items in user_items.items():
        similarity = {}
        for other, other_items in user_items.items():
            shared = len(own_items & other_items)
            similarity[other] = shared if is_absolute else shared / len(other_items)

        # Keep the 11 most similar users. The original note speaks of the ten
        # nearest neighbours; the user itself is among the candidates, which
        # presumably explains the extra slot -- TODO confirm.
        neighbours = sorted(similarity, key=similarity.get, reverse=True)[:11]

        votes = {}
        for other in neighbours:
            for item, score in user_scores[other].items():
                votes[item] = votes.get(item, 0) + score

        # Highest summed score first, skipping what the user already rated.
        D[user] = [item
                   for item in sorted(votes, key=votes.get, reverse=True)
                   if item not in own_items]

    P = parameter_UserCF()
    P.L = sorted(item_totals, key=item_totals.get, reverse=True)
    P.D = D

    print("\ntrain_UserCF:", time.perf_counter() - started)

    return P

class parameter_ItemCF:
    """Model of the item-based CF recommender."""

    def __init__(self):
        # Global fallback ranking, used for users missing from D.
        self.L = []
        # Per-user ranking, keyed by user.
        self.D = {}

    def __repr__(self):
        return str(self.L)

    def TopK(self, p, K):
        """Return the first K entries of user p's ranking, or of the
        fallback ranking when p has no personal ranking."""
        ranking = self.D[p] if p in self.D else self.L
        return ranking[:min(len(ranking), K)]

def train_ItemCF(ratings_train):
    """Train the item-based collaborative-filtering recommender.

    Item-to-item similarity is the Jaccard index of the sets of users that
    rated each item. For every training user the similarities of all items to
    the items the user rated are summed, and the items are ranked by that
    sum. Items the user already rated get the score -1, so they end up at the
    tail of the ranking (they are not removed). A global ranking by total
    score is stored as the fallback for users without a personal ranking.

    Args:
        ratings_train: Iterable of edges exposing ``user_id``, ``item_id``
            and ``score``.

    Returns:
        A ``parameter_ItemCF`` with ``D`` (per-user ranking) and ``L``
        (global ranking) filled in.
    """
    t0 = time.perf_counter()

    F_IU = {}  # item -> set of users that rated it
    F_UI = {}  # user -> set of items the user rated
    D1 = {}    # item -> sum of every score it received
    for edge in ratings_train:
        F_IU.setdefault(edge.item_id, set()).add(edge.user_id)
        F_UI.setdefault(edge.user_id, set()).add(edge.item_id)
        D1[edge.item_id] = D1.get(edge.item_id, 0) + edge.score

    # Full item x item Jaccard table (includes each item with itself).
    C = {item: {item1: len(F_IU[item] & F_IU[item1]) / len(F_IU[item] | F_IU[item1])
                for item1 in F_IU}
         for item in F_IU}

    D = {}
    for p, rated in F_UI.items():
        count = {item: 0 for item in C}
        for item in rated:
            for item1, similarity in C[item].items():
                count[item1] += similarity
        for item in rated:
            count[item] = -1  # push already-rated items to the end
        pairs = [[k, v] for k, v in count.items()]
        pairs.sort(key=lambda x: x[1], reverse=True)
        # A distinct loop name avoids shadowing the user variable ``p``.
        D[p] = [pair[0] for pair in pairs]

    L = [[k, v] for k, v in D1.items()]
    L.sort(key=lambda x: x[1], reverse=True)

    # Return the ItemCF parameter class (the UserCF class was instantiated
    # here before; both expose the same L / D / TopK interface).
    P = parameter_ItemCF()
    P.L = [row[0] for row in L]
    P.D = D

    print("\ntrain_ItemCF:", time.perf_counter() - t0)

    return P


if __name__ == '__main__':

    train_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv"
    test_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv"

    ratings_train, ratings_test = Edge_read(ratings_train_path=train_csv,
                                            ratings_test_path=test_csv)

    # Disabled alternatives -- uncomment one to try another model.
    #
    # Popularity baseline:
    # P = train_items_popular(ratings_train, score_is_avg = False)
    # print(P.L)
    #
    # Random baseline (two draws to show that the samples differ):
    # P = train_random(ratings_train)
    # print(P.L)
    # print(P.TopK(p=1, K=10))
    # print(P.TopK(p=1, K=10))
    #
    # UserCF (author's timing note: 2058s):
    # P = train_UserCF(ratings_train, is_absolute=False)

    P = train_ItemCF(ratings_train)

4、一些常用的评价指标

from ml_25m.TopN.Data_train import *
import math

def DCG(r, i):
    """Return the discounted gain ``(2**r - 1) / log2(i + 1)`` of a single
    item with relevance ``r`` at 1-based position ``i``."""
    gain = 2 ** r - 1
    discount = math.log2(i + 1)
    return gain / discount

def F_generate(ratings):
    """Index edges as ``{user_id: {item_id: score}}``.

    When the same (user, item) pair occurs more than once, the score of its
    first occurrence is kept.
    """
    F = {}
    for edge in ratings:
        per_user = F.setdefault(edge.user_id, {})
        per_user.setdefault(edge.item_id, edge.score)
    return F

def L_personalized_generation(F, P, p, K, is_sup):
    """Return the list of at most K items to evaluate for user p.

    With ``is_sup`` false the list comes from the model (``P.TopK(p, K)``).
    With ``is_sup`` true it is built from the user's own entries in ``F``,
    ordered from the highest to the lowest score -- an upper bound for the
    score-based metrics.
    """
    if not is_sup:
        return P.TopK(p, K)

    own_scores = F[p]
    ranked = sorted(own_scores, key=own_scores.get, reverse=True)
    return ranked[:min(len(ranked), K)]

#1-CG:cumulative gain
def test_count_score_CG(ratings_test, P, K, is_print=False, is_sup=False):
    """Return the cumulative gain averaged over the test users.

    For each test user, the test scores of the evaluated items that the user
    actually rated in the test set are summed.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best items instead of the model's
            list (see ``L_personalized_generation``).
    """
    started = time.perf_counter()
    F = F_generate(ratings_test)

    score_tot = 0
    for p, truth in F.items():
        recommended = L_personalized_generation(F, P, p, K, is_sup)
        score_tot += sum(truth[item] for item in recommended if item in truth)

    average = score_tot/len(F)
    if is_print:
        print("\nscore_CG:", average)
        print("time_test_count_score_CG:", time.perf_counter() - started)
    return average

#2-DCG:discounted CG
def test_count_score_DCG(ratings_test, P, K, is_print=False, is_sup=False):
    """Return the discounted cumulative gain averaged over the test users.

    Every evaluated item that the user rated in the test set contributes
    ``DCG(score, position)``, where position is its 1-based rank in the list.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best items instead of the model's
            list (see ``L_personalized_generation``).
    """
    started = time.perf_counter()
    F = F_generate(ratings_test)

    score_tot = 0
    for p, truth in F.items():
        recommended = L_personalized_generation(F, P, p, K, is_sup)
        score_tot += sum(DCG(truth[item], rank + 1)
                         for rank, item in enumerate(recommended)
                         if item in truth)

    average = score_tot/len(F)
    if is_print:
        print("\nscore_DCG:", average)
        print("time_test_count_score_DCG:", time.perf_counter() - started)
    return average

#IDCG:ideal DCG
#3-NDCG:normalized DCG
def test_count_score_NDCG(ratings_test, P, K, is_print=False, is_sup=False):
    """Return the normalized DCG averaged over the test users.

    Per user, the DCG of the evaluated list (items the user did not rate in
    the test set count as relevance 0) is divided by the DCG of the same
    relevance values sorted in descending order. Users whose ideal DCG is 0
    contribute nothing to the sum but still count in the average.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best items instead of the model's
            list (see ``L_personalized_generation``).
    """
    started = time.perf_counter()
    F = F_generate(ratings_test)

    score_tot = 0
    for p, truth in F.items():
        recommended = L_personalized_generation(F, P, p, K, is_sup)
        gains = [truth.get(item, 0) for item in recommended]
        actual = sum(DCG(gain, pos + 1) for pos, gain in enumerate(gains))
        # The ideal order is a re-sort of the gains of this very list.
        best_order = sorted(gains, reverse=True)
        ideal = sum(DCG(gain, pos + 1) for pos, gain in enumerate(best_order))
        if ideal > 0:
            score_tot += actual / ideal

    average = score_tot/len(F)
    if is_print:
        print("\nscore_NDCG:", average)
        print("time_test_count_score_NDCG:", time.perf_counter() - started)
    return average

#下面两个指标与同时出现有关，与分数无关
#4-precision
def test_count_score_precision(ratings_test, P, K, is_print=False, is_sup=False):
    """Return precision@K averaged over the test users.

    Per user: the number of evaluated items that the user rated in the test
    set, divided by K (by K, not by the actual list length).

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best items instead of the model's
            list (see ``L_personalized_generation``).
    """
    started = time.perf_counter()
    F = F_generate(ratings_test)

    score_tot = 0
    for p, truth in F.items():
        recommended = L_personalized_generation(F, P, p, K, is_sup)
        hits = sum(1 for item in recommended if item in truth)
        score_tot += hits / K

    average = score_tot/len(F)
    if is_print:
        print("\nscore_precision:", average)
        print("time_test_count_score_precision:", time.perf_counter() - started)
    return average

#5-recall
def test_count_score_recall(ratings_test, P, K, is_print=False, is_sup=False):
    """Return recall@K averaged over the test users.

    Per user: the number of evaluated items that the user rated in the test
    set, divided by the number of items the user rated in the test set.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best items instead of the model's
            list (see ``L_personalized_generation``).
    """
    started = time.perf_counter()
    F = F_generate(ratings_test)

    score_tot = 0
    for p, truth in F.items():
        recommended = L_personalized_generation(F, P, p, K, is_sup)
        hits = sum(1 for item in recommended if item in truth)
        score_tot += hits / len(truth)

    average = score_tot/len(F)
    if is_print:
        print("\nscore_recall:", average)
        print("time_test_count_score_recall:", time.perf_counter() - started)
    return average

#6-K_Call
def test_count_score_K_Call(ratings_test, P, K, is_print=False, is_sup=False, K_K_Call=10):
    """Return the share of test users with at least ``K_K_Call`` hits.

    A hit is an evaluated item that the user rated in the test set.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the evaluated list.
        is_print: Print the score and the elapsed time.
        is_sup: Evaluate the user's own best items instead of the model's
            list (see ``L_personalized_generation``).
        K_K_Call: Minimum number of hits a user needs to count as a success.
            Defaults to 10, the value that used to be hard-coded; note that
            no user can succeed when ``K`` is smaller than this threshold.
    """
    t0 = time.perf_counter()
    F = F_generate(ratings_test)
    score_tot = 0
    for p in F:
        res_P = L_personalized_generation(F, P, p, K, is_sup)
        hits = sum(1 for item in res_P if item in F[p])
        if hits >= K_K_Call:
            score_tot += 1
    score = score_tot / len(F)
    if is_print:
        print("\nscore_K_Call:", score)
        print("\ntime_test_count_score_K_Call:", time.perf_counter() - t0)
    return score

def items_generate(ratings):
    """Index edges as ``{item_id: set of user_ids that rated the item}``."""
    items = {}
    for edge in ratings:
        raters = items.setdefault(edge.item_id, set())
        raters.add(edge.user_id)
    return items

def correlation_generate(items):
    """Return the Jaccard similarity of every ordered pair of items.

    Args:
        items: Mapping ``item -> set of users``.

    Returns:
        A dict keyed by ``(i, j)`` holding ``|users_i & users_j| /
        |users_i | users_j|``. Both orientations of a pair and the pair of an
        item with itself are present.
    """
    keys = list(items)
    C = {}
    for pos, i in enumerate(keys):
        # Pairs with earlier items were already stored symmetrically.
        for j in keys[pos:]:
            union_size = len(items[i] | items[j])
            common_size = len(items[i] & items[j])
            C[i, j] = C[j, i] = common_size / union_size
    return C

#7-IntraSim
def test_count_diversity_IntraSim(ratings_test, P, K, is_print=False):
    """Return the intra-list similarity averaged over the test users.

    Item similarity is the Jaccard index of the sets of test users that rated
    each item. Per user the similarities of all ordered pairs of distinct
    positions in the recommended list are summed and divided by
    ``2 * K * (K - 1)``.

    NOTE(review): there are ``K * (K - 1)`` ordered pairs, so the extra
    division by 2 makes the result half of the mean pairwise similarity --
    confirm this scaling is intended.

    Compared with a lookup in a precomputed all-pairs table, this version
    * only uses the positions actually present, so a list shorter than K no
      longer raises IndexError;
    * treats items without any rater in the test set as having similarity 0
      to everything, instead of raising KeyError;
    * computes similarities on demand for the pairs that occur in some list.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the recommended list.
        is_print: Print the score and the elapsed time.
    """
    t0 = time.perf_counter()
    items = items_generate(ratings_test)
    F = F_generate(ratings_test)

    no_raters = frozenset()
    cache = {}

    def jaccard(a, b):
        # Memoised Jaccard index of the test raters of items a and b.
        if (a, b) not in cache:
            raters_a = items.get(a, no_raters)
            raters_b = items.get(b, no_raters)
            union_size = len(raters_a | raters_b)
            value = len(raters_a & raters_b) / union_size if union_size else 0.0
            cache[a, b] = cache[b, a] = value
        return cache[a, b]

    score_tot = 0
    for p in F:
        res_P = P.TopK(p, K)
        if K > 1:
            n = len(res_P)
            pair_sum = sum([jaccard(res_P[i], res_P[j])
                            for i in range(n) for j in range(n) if i != j])
            score_tot += pair_sum / 2 / K / (K - 1)

    score = score_tot / len(F)
    if is_print:
        print("\nscore_test_count_diversity_IntraSim:", score)
        print("time_test_count_diversity_IntraSim:", time.perf_counter() - t0)
    return score

#8-InterDiv
def test_count_diversity_InterDiv(ratings_test, P, K, is_print=False):
    """Return the inter-user diversity of the recommendation lists.

    For every unordered pair of test users the share of the K slots that the
    two lists do not have in common, ``(K - overlap) / K``, is summed; the
    sum is divided by ``n * (n - 1)`` with ``n`` the number of test users.

    NOTE(review): only ``n * (n - 1) / 2`` pairs are summed, so the result is
    half of the mean pairwise value (at most 0.5) -- confirm this scaling is
    intended.

    With fewer than two test users there is no pair to compare and 0.0 is
    returned (this case used to raise ZeroDivisionError).

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the recommended list.
        is_print: Print the score and the elapsed time.
    """
    t0 = time.perf_counter()
    F = F_generate(ratings_test)
    L_res = [set(P.TopK(p, K)) for p in F]
    n_users = len(L_res)

    score_tot = 0.0
    if n_users > 1:
        for i in range(n_users - 1):
            for j in range(i + 1, n_users):
                score_tot += (K - len(L_res[i] & L_res[j])) / K
        score_tot = score_tot / n_users / (n_users - 1)

    if is_print:
        print("\nscore_test_count_diversity_InterDiv:", score_tot)
        print("time_test_count_diversity_InterDiv:", time.perf_counter() - t0)
    return score_tot

#9-COV
def test_count_diversity_COV(ratings_test, P, K, is_print=False):
    """Return the coverage of the recommendations.

    Coverage is the number of distinct items recommended to any test user
    divided by the number of distinct items in the test set.

    Args:
        ratings_test: Test edges.
        P: Trained model exposing ``TopK(p, K)``.
        K: Length of the recommended list.
        is_print: Print the score and the elapsed time.
    """
    started = time.perf_counter()
    items = items_generate(ratings_test)
    F = F_generate(ratings_test)

    recommended = set()
    for p in F:
        recommended.update(P.TopK(p, K))

    score_tot = len(recommended) / len(items)
    if is_print:
        print("\nscore_test_count_diversity_COV:", score_tot)
        print("time_test_count_diversity_COV:", time.perf_counter() - started)
    return score_tot

if __name__ == '__main__':

    train_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv"
    test_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv"

    ratings_train, ratings_test = Edge_read(ratings_train_path=train_csv,
                                            ratings_test_path=test_csv)
    P = train_items_popular(ratings_train, score_is_avg = False)

    # Model score first, then the upper bound obtained from each user's own
    # best test items (is_sup=True).
    score_CG = test_count_score_CG(ratings_test, P, K=10, is_print=True)
    score_sup_CG = test_count_score_CG(ratings_test, P, K=10, is_print=False, is_sup=True)
    print("score_sup_CG:", score_sup_CG)

    # The remaining metrics follow the same pattern and are left disabled.
    #
    # score_DCG = test_count_score_DCG(ratings_test, P, K=100, is_print=True)
    # score_sup_DCG = test_count_score_DCG(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_DCG:", score_sup_DCG)
    #
    # score_NDCG = test_count_score_NDCG(ratings_test, P, K=100, is_print=True)
    # score_sup_NDCG = test_count_score_NDCG(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_NDCG:", score_sup_NDCG)
    #
    # score_precision = test_count_score_precision(ratings_test, P, K=100, is_print=True)
    # score_sup_precision = test_count_score_precision(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_precision:", score_sup_precision)
    #
    # score_recall = test_count_score_recall(ratings_test, P, K=100, is_print=True)
    # score_sup_recall = test_count_score_recall(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_recall:", score_sup_recall)
    #
    # score_K_Call = test_count_score_K_Call(ratings_test, P, K=100, is_print=True)
    # score_sup_K_Call = test_count_score_K_Call(ratings_test, P, K=100, is_print=False, is_sup=True)
    # print("score_sup_K_Call:", score_sup_K_Call)
    #
    # score_intraSim = test_count_diversity_IntraSim(ratings_test, P, K=100, is_print=True)
    # score_InterDiv = test_count_diversity_InterDiv(ratings_test, P, K=100, is_print=True)
    # score_COV = test_count_diversity_COV(ratings_test, P, K=100, is_print=True)

5、评估贪心、随机、UserCF、ItemCF四种方法在不同K取值下的表现，这里评价指标采用准确率、召回率和覆盖度

from ml_25m.TopN.Data_test import *

def normal_num(num, n):
    """Round ``num`` to ``n`` decimal places by scaling with ``10**n``,
    rounding to an integer and scaling back."""
    scale = int(10 ** n)
    scaled = round(num * scale)
    return scaled / scale

def test_and_print(P, ratings_test):
    """Evaluate a model at several list lengths and print the results.

    For K in 5, 10, 20, 40, 80, 160 the precision, recall and coverage are
    computed; precision and recall are rounded to 2 decimals, coverage to 4,
    and the three result lists are printed.

    Args:
        P: Trained model exposing ``TopK(p, K)``.
        ratings_test: Test edges.
    """
    cutoffs = [5, 10, 20, 40, 80, 160]

    L_precision, L_recall, L_COV = [], [], []
    for K in cutoffs:
        L_precision.append(normal_num(test_count_score_precision(ratings_test, P, K=K), 2))
        L_recall.append(normal_num(test_count_score_recall(ratings_test, P, K=K), 2))
        L_COV.append(normal_num(test_count_diversity_COV(ratings_test, P, K=K), 4))

    print("\nL_precision: \n", L_precision)
    print("\nL_recall: \n", L_recall)
    print("\nL_COV: \n", L_COV)

if __name__ == '__main__':

    train_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1996.csv"
    test_csv = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings_year\\1997.csv"

    ratings_train, ratings_test = Edge_read(ratings_train_path=train_csv,
                                            ratings_test_path=test_csv)

    # 1 - popularity baseline
    P = train_items_popular(ratings_train, score_is_avg = False)
    test_and_print(P, ratings_test)

    # The other models are left disabled; uncomment a pair to evaluate it.
    #
    # 2 - random baseline
    # P = train_random(ratings_train)
    # test_and_print(P, ratings_test)
    #
    # 3 - UserCF, with absolute and with relative similarity
    # P = train_UserCF(ratings_train, is_absolute=True)
    # test_and_print(P, ratings_test)
    # P = train_UserCF(ratings_train, is_absolute=False)
    # test_and_print(P, ratings_test)
    #
    # 4 - ItemCF
    # P = train_ItemCF(ratings_train)
    # test_and_print(P, ratings_test)

"""
	        K	            5	    10	    20	    40	    80	    160
precision	items_popular	0.17	0.16	0.14	0.12	0.16	0.13
	        random	        0.04	0.04	0.04	0.04	0.04	0.04
	        UserCF_abs	    0.19	0.17	0.15	0.13	0.16	0.13
	        UserCF          0.18,   0.17,   0.14,   0.12,   0.15,   0.13
	        itemCF          0.18,   0.17,   0.14,   0.13,   0.16,   0.14
recall	    items_popular	0.01	0.02	0.03	0.05	0.22	0.39
	        random	        0	    0.01	0.01	0.03	0.05	0.11
	        UserCF_abs	    0.01	0.02	0.04	0.07	0.23	0.39
	        UserCF          0.01    0.02    0.03    0.06    0.21    0.35
	        itemCF          0.01,   0.02,   0.04,   0.06,   0.23,   0.4
COV	        items_popular	0.003	0.006	0.012	0.0239	0.0478	0.0957
	        random	        0.8272	0.8272	0.8272	0.8272	0.8272	0.8272
	        UserCF_abs	    0.1818	0.2428	0.326	0.4234	0.5167	0.6136
	        UserCF          0.1519  0.2339  0.3278  0.4276  0.4725  0.4791
	        itemCF          0.128,  0.1669, 0.2279, 0.314,  0.4061, 0.5042
"""

由于准确率反映的是推荐列表中用户感兴趣的商品比例，其与K成负相关，这里平均准确率：UserCF_abs > itemCF > UserCF > items_popular > random

召回率反映的是推荐列表中命中的商品占用户全部感兴趣商品的比例，其与K成正相关，这里平均召回率：UserCF_abs > itemCF > items_popular > UserCF > random

覆盖度反映的是所有推荐结果占总商品的比例，其与K成正相关，这里平均覆盖度：random > UserCF_abs > UserCF > itemCF > items_popular

chaoyuzhang

关注

0
点赞
踩
9

收藏

觉得还不错? 一键收藏
1
评论
65、Top-N推荐系统的协同过滤方法

数据为movielen数据，这里仅用其中的ratings数据1、数据按年份切分import refrom datetime import datetimeimport timeimport pandas as pddef data_read(): t0 = time.perf_counter() ratings_path = r"D:\\!data\\3-recommender\\27-ml-25m\\ml-25m\\ratings.csv" # 文件较大，
复制链接

扫一扫

专栏目录