推荐系统实战(十)基于时间上下文

1.数据集:delicious-2k

2.Code:
封装性很好,代码写得很到位
github地址

# 导入包
import random
import math
import time
import codecs
from tqdm import tqdm


class Dataset():
    '''
    Loads the delicious-2k bookmark events and splits them per user by time.

    For every user the (item, timestamp) events are ordered from newest to
    oldest; the newest event becomes the test item, the rest the training set.
    '''

    def __init__(self, site=None):
        '''
        :params: site, value of the last column of bookmarks.dat to keep;
                 None keeps the bookmarks of every site
        '''
        self.bookmark_path = '../data/delicious-2k/bookmarks.dat'
        self.user_bookmark_path = '../data/delicious-2k/user_taggedbookmarks-timestamps.dat'
        self.site = site
        self.loadData()

    @staticmethod
    def _readRecords(path):
        '''
        Returns the stripped, non-empty lines of a data file without its header.

        :params: path, file to read (decoded as ISO-8859-1)
        :return: list of record lines
        '''
        # The context manager closes the handle even if decoding fails.
        with codecs.open(path, 'r', encoding="ISO-8859-1") as f:
            lines = [line.strip() for line in f.readlines()][1:]
        # Blank lines (e.g. a trailing newline) carry no record; skip them.
        return [line for line in lines if line]

    def loadData(self):
        '''
        Fills self.data with {user: [(item, timestamp), ...]}, newest first.

        Fields are tab-separated. In the bookmark file the first field is used
        as the item id and the last one as the site. In the user file fields
        0, 1 and 3 are used as user, item and timestamp.
        '''
        # site -> ids of the bookmarks that belong to it
        site_ids = {}
        for b in self._readRecords(self.bookmark_path):
            b = b.split('\t')
            site_ids.setdefault(b[-1], set()).add(b[0])

        # None means "no filtering"; an unknown site keeps nothing.
        wanted = None if self.site is None else site_ids.get(self.site, set())

        data = {}
        for ub in self._readRecords(self.user_bookmark_path):
            ub = ub.split('\t')
            if wanted is None or ub[1] in wanted:
                # The last three characters of the timestamp are dropped
                # (presumably milliseconds -> seconds; verify against the data).
                # A set removes duplicate (item, timestamp) pairs.
                data.setdefault(ub[0], set()).add((ub[1], int(ub[3][:-3])))

        self.data = {user: sorted(events, key=lambda x: x[1], reverse=True)
                     for user, events in data.items()}

    def splitData(self):
        '''
        :return: train, test; both map user -> list of (item, timestamp).
                 test holds each user's newest event, train the older ones
        '''
        train, test = {}, {}
        for user, events in self.data.items():
            train[user] = list(events[1:])
            # Slicing keeps this safe for a user without any event.
            test[user] = list(events[:1])
        return train, test


class Metric():
    '''
    Scores a recommender on the held-out items with precision and recall.
    '''

    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, training data
        :params: test, test data, user -> list of (item, timestamp)
        :params: GetRecommendation, callable returning the ranked
                 (item, score) list for a user
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        # Recommendations are computed once and shared by both metrics.
        self.recs = self.getRec()

    def getRec(self):
        '''
        :return: dict user -> recommendation list, for every user in test
        '''
        return {user: self.GetRecommendation(user) for user in self.test}

    def precision(self):
        '''
        :return: hits / recommended items, as a percentage with 2 decimals
        '''
        hits = 0
        recommended = 0
        for user, held_out in self.test.items():
            targets = {entry[0] for entry in held_out}
            ranked = self.recs[user]
            hits += sum(1 for item, _score in ranked if item in targets)
            recommended += len(ranked)
        if recommended > 0:
            return round(hits / recommended * 100, 2)
        return 0.0

    def recall(self):
        '''
        :return: hits / held-out items, as a percentage with 2 decimals
        '''
        hits = 0
        relevant = 0
        for user, held_out in self.test.items():
            targets = {entry[0] for entry in held_out}
            ranked = self.recs[user]
            hits += sum(1 for item, _score in ranked if item in targets)
            relevant += len(targets)
        if relevant > 0:
            return round(hits / relevant * 100, 2)
        return 0.0

    def eval(self):
        '''
        :return: dict with the 'Precision' and 'Recall' values
        '''
        return {'Precision': self.precision(),
                'Recall': self.recall()}


# 1. 给用户推荐近期最热门的物品
def RecentPopular(train, K, N, alpha=1.0, t0=int(time.time())):
    '''
    Recommends the items that were most popular recently.

    Every training event adds 1 / (1 + alpha * (t0 - t)) to its item, so
    events closer to t0 weigh more.

    :params: train, training data, user -> list of (item, timestamp)
    :params: K, unused; accepted so all algorithms share the same call
    :params: N, number of items to recommend
    :params: alpha, time-decay factor
    :params: t0, current timestamp; the default is evaluated once, when the
             function is defined
    :return: GetRecommendation, the recommendation function
    '''

    item_score = {}
    for user in train:
        for item, t in train[user]:
            # The "1 +" keeps the weight finite when t == t0 or alpha == 0.
            weight = 1.0 / (1.0 + alpha * (t0 - t))
            item_score[item] = item_score.get(item, 0) + weight

    ranked = sorted(item_score.items(), key=lambda x: x[1], reverse=True)

    def GetRecommendation(user):
        # Top-N items by decayed popularity that the user has not seen yet.
        # train stores (item, timestamp) pairs, so extract the item ids
        # before testing membership.
        seen_items = {item for item, _ in train.get(user, ())}
        unseen = [x for x in ranked if x[0] not in seen_items]
        return unseen[:N]

    return GetRecommendation


# 2. 时间上下文相关的ItemCF算法
def TItemCF(train, K, N, alpha=1.0, beta=1.0, t0=int(time.time())):
    '''
    Item-based collaborative filtering with time context.

    Two items of one user are more similar the closer in time they were
    used, and recent events of the user weigh more when scoring.

    :params: train, training data, user -> list of (item, timestamp)
    :params: K, number of most similar items used per history item
    :params: N, number of items to recommend
    :params: alpha, time-decay factor of the item similarity
    :params: beta, time-decay factor of the recommendation score
    :params: t0, current timestamp; the default is evaluated once, when the
             function is defined
    :return: GetRecommendation, the recommendation function
    '''
    # Item similarity matrix
    sim = {}
    num = {}
    for user in train:
        events = train[user]
        for i, (u, t1) in enumerate(events):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, (v, t2) in enumerate(events):
                if j == i:
                    continue
                # Equals the old 1 / (alpha * (|dt| + 1)) at alpha == 1, but
                # alpha now changes the decay and alpha == 0 is allowed.
                row[v] = row.get(v, 0) + 1.0 / (1 + alpha * abs(t1 - t2))
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every item, most similar first
    sorted_item_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        history = train.get(user, ())
        # train stores (item, timestamp) pairs, so extract the item ids
        # before testing membership.
        seen_items = {item for item, _ in history}
        items = {}
        for item, t in history:
            for u, weight in sorted_item_sim[item][:K]:
                if u not in seen_items:
                    items[u] = items.get(u, 0) + weight / (1 + beta * (t0 - t))
        return sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 3.时间上下文相关的UserCF算法


def TUserCF(train, K, N, alpha=1.0, beta=1.0, t0=int(time.time())):
    '''
    User-based collaborative filtering with time context.

    Two users are more similar the closer in time they used the same item,
    and recent events of the neighbours weigh more when scoring.

    :params: train, training data, user -> list of (item, timestamp)
    :params: K, number of most similar users to use
    :params: N, number of items to recommend
    :params: alpha, time-decay factor of the user similarity
    :params: beta, time-decay factor of the recommendation score
    :params: t0, current timestamp; the default is evaluated once, when the
             function is defined
    :return: GetRecommendation, the recommendation function
    '''
    # Inverted index item -> [(user, timestamp), ...]
    item_users = {}
    for user in train:
        for item, t in train[user]:
            item_users.setdefault(item, []).append((user, t))

    # User similarity matrix
    sim = {}
    num = {}
    for users in item_users.values():
        for i, (u, t1) in enumerate(users):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, (v, t2) in enumerate(users):
                if j == i:
                    continue
                # Equals the old 1 / (alpha * (|dt| + 1)) at alpha == 1, but
                # alpha now changes the decay and alpha == 0 is allowed.
                row[v] = row.get(v, 0) + 1.0 / (1 + alpha * abs(t1 - t2))
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every user, most similar first
    sorted_user_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        # train stores (item, timestamp) pairs, so extract the item ids
        # before testing membership.
        seen_items = {item for item, _ in train.get(user, ())}
        items = {}
        for u, weight in sorted_user_sim.get(user, [])[:K]:
            # Use the timestamp of the neighbour's own event; reading a name
            # from the enclosing scope would apply one stale timestamp to
            # every score.
            for item, t in train[u]:
                if item not in seen_items:
                    items[item] = items.get(item, 0) + weight / (1 + beta * (t0 - t))
        return sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 4. ItemCF算法
def ItemCF(train, K, N):
    '''
    Plain item-based collaborative filtering (timestamps are ignored).

    :params: train, training data, user -> list of (item, timestamp)
    :params: K, number of most similar items used per history item
    :params: N, number of items to recommend
    :return: GetRecommendation, the recommendation function
    '''
    # Item similarity matrix: co-occurrence counts, normalised below
    sim = {}
    num = {}
    for user in train:
        events = train[user]
        for i, (u, _) in enumerate(events):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, (v, _) in enumerate(events):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every item, most similar first
    sorted_item_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        history = train.get(user, ())
        # train stores (item, timestamp) pairs, so extract the item ids
        # before testing membership.
        seen_items = {item for item, _ in history}
        items = {}
        for item, _ in history:
            for u, weight in sorted_item_sim[item][:K]:
                if u not in seen_items:
                    items[u] = items.get(u, 0) + weight
        return sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


# 5. UserCF算法
def UserCF(train, K, N):
    '''
    Plain user-based collaborative filtering (timestamps are ignored).

    :params: train, training data, user -> list of (item, timestamp)
    :params: K, number of most similar users to use
    :params: N, number of items to recommend
    :return: GetRecommendation, the recommendation function
    '''
    # Inverted index item -> [user, ...]
    item_users = {}
    for user in train:
        for item, _ in train[user]:
            item_users.setdefault(item, []).append(user)

    # User similarity matrix: co-occurrence counts, normalised below
    sim = {}
    num = {}
    for users in item_users.values():
        for i, u in enumerate(users):
            num[u] = num.get(u, 0) + 1
            row = sim.setdefault(u, {})
            for j, v in enumerate(users):
                if j == i:
                    continue
                row[v] = row.get(v, 0) + 1
    for u in sim:
        for v in sim[u]:
            sim[u][v] /= math.sqrt(num[u] * num[v])

    # Neighbours of every user, most similar first
    sorted_user_sim = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)
                       for k, v in sim.items()}

    def GetRecommendation(user):
        # train stores (item, timestamp) pairs, so extract the item ids
        # before testing membership.
        seen_items = {item for item, _ in train.get(user, ())}
        items = {}
        for u, weight in sorted_user_sim.get(user, [])[:K]:
            for item, _ in train[u]:
                # Skip what the user has already seen
                if item not in seen_items:
                    items[item] = items.get(item, 0) + weight
        return sorted(items.items(), key=lambda x: x[1], reverse=True)[:N]

    return GetRecommendation


class Experiment():
    '''
    Runs one recommendation algorithm on the dataset and prints its metrics.
    '''

    def __init__(self, K, N, site=None, rt='RecentPopular'):
        '''
        :params: K, number of similar neighbours (TopK)
        :params: N, number of recommended items (TopN)
        :params: site, only the records of this site are used
        :params: rt, name of the recommendation algorithm
        '''
        self.K, self.N = K, N
        self.site = site
        self.rt = rt
        # Algorithm name -> factory returning a recommendation function
        self.alg = {
            'RecentPopular': RecentPopular,
            'TItemCF': TItemCF,
            'TUserCF': TUserCF,
            'ItemCF': ItemCF,
            'UserCF': UserCF,
        }

    def worker(self, train, test):
        '''
        Single experiment.

        :params: train, training data
        :params: test, test data
        :return: the metric values
        '''
        build = self.alg[self.rt]
        recommender = build(train, self.K, self.N)
        return Metric(train, test, recommender).eval()

    def run(self):
        '''
        Loads and splits the data, evaluates the algorithm, prints the result.
        '''
        train, test = Dataset(self.site).splitData()
        result = self.worker(train, test)
        template = 'Result (site={}, K={}, N={}): {}'
        print(template.format(self.site, self.K, self.N, result))


if __name__ == '__main__':

    # 1. RecentPopular experiment
    # RecentPopular does not use K; any value keeps the call uniform.
    K = 0
    for site in ('www.nytimes.com', 'en.wikipedia.org'):
        for N in range(10, 101, 10):
            exp = Experiment(K, N, site=site, rt='RecentPopular')
            exp.run()

    # The remaining experiments are disabled; uncomment one to run it.
    #
    # # 2. TItemCF experiment
    # K = 10
    # for site in ['www.nytimes.com', 'en.wikipedia.org']:
    #     for N in range(10, 110, 10):
    #         exp = Experiment(K, N, site=site, rt='TItemCF')
    #         exp.run()
    #
    # # 3. TUserCF experiment
    # K = 10
    # for site in ['www.nytimes.com', 'en.wikipedia.org']:
    #     for N in range(10, 110, 10):
    #         exp = Experiment(K, N, site=site, rt='TUserCF')
    #         exp.run()
    #
    # # 4. ItemCF experiment
    # K = 10
    # for site in ['www.nytimes.com', 'en.wikipedia.org']:
    #     for N in range(10, 110, 10):
    #         exp = Experiment(K, N, site=site, rt='ItemCF')
    #         exp.run()
    #
    # # 5. UserCF experiment
    # K = 10
    # for site in ['www.nytimes.com', 'en.wikipedia.org']:
    #     for N in range(10, 110, 10):
    #         exp = Experiment(K, N, site=site, rt='UserCF')
    #         exp.run()
  • 2
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值