MovieTaster-使用Item2Vec做电影推荐代码解析

在做推荐的时候了解到,可以把电影训练成向量,再根据向量之间的相似度来做推荐。于是下载了 MovieTaster 的代码(源代码地址:https://github.com/lujiaying/MovieTaster-Open ),并用代码自带的数据把程序运行了一遍。

1. 首先是 process.py。注意:原代码是 Python 2 写法,其中的 iteritems() 在 Python 3 中应改为 items(),xrange() 应改为 range()。

import json

# Input data files: the doulist (movie list) dump and the movie dump.
DoulistFile = './datas/doulist_0804_09.json'
MovieFile = './datas/movie_0804_09.json'
# Output corpus paths, derived from the doulist path by swapping the 'json'
# substring for a corpus-specific suffix (ids / names).
DoulistCorpusIdFile, DoulistCorpusNameFile = (
    DoulistFile.replace('json', suffix) for suffix in ('movie_id', 'movie_name')
)

def get_movie_name_id_dict(doulist_file=DoulistFile, min_word_freq=0):
    """Build a movie-name -> integer-id vocabulary from a doulist file.

    Each non-blank line of ``doulist_file`` is parsed as a JSON object whose
    ``'movie_names'`` entry is an iterable of movie names. Names are counted
    across all lines, names seen fewer than ``min_word_freq`` times are
    dropped, and the rest are numbered from 0 in order of descending count
    (ties broken by name, ascending). A final ``'<unk>'`` entry receives the
    next free id.

    Example result: {'战狼2': 0, '12回合3:致命禁闭': 1, ..., '<unk>': N}

    Args:
        doulist_file: Path to the line-delimited JSON doulist file.
        min_word_freq: Minimum number of occurrences a name needs to be kept.

    Returns:
        dict mapping movie name to id; always contains ``'<unk>'``.
    """
    from collections import Counter

    movie_counter = Counter()
    # JSON text is UTF-8; be explicit so the platform locale cannot change it.
    with open(doulist_file, encoding='utf-8') as fopen:
        for line in fopen:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            movie_counter.update(json.loads(line)['movie_names'])

    kept = [
        (name, count)
        for name, count in movie_counter.items()
        if count >= min_word_freq
    ]
    # Most frequent first; alphabetical among equals for a stable numbering.
    kept.sort(key=lambda item: (-item[1], item[0]))

    # Built with enumerate so an empty vocabulary yields just {'<unk>': 0}
    # instead of failing on tuple unpacking.
    movie_name_id_dict = {name: idx for idx, (name, _) in enumerate(kept)}
    movie_name_id_dict['<unk>'] = len(kept)
    print('movie_name_id_dict is %d from [%s]' % (len(movie_name_id_dict), doulist_file))
    return movie_name_id_dict

# Reverse lookup (id -> movie name) built from get_movie_name_id_dict.
# According to the original note, this function is called from the eval code.
def get_movie_id_name_dict(doulist_file=DoulistFile):
    """Return the inverse of ``get_movie_name_id_dict``: id -> movie name.

    Args:
        doulist_file: Path forwarded to ``get_movie_name_id_dict``.

    Returns:
        dict mapping each integer id to its movie name (including the id of
        the ``'<unk>'`` entry).
    """
    movie_name_id_dict = get_movie_name_id_dict(doulist_file)
    # .items() works on both Python 2 and 3; iteritems() is Python 2 only.
    movie_id_name_dict = {
        movie_id: name for name, movie_id in movie_name_id_dict.items()
    }
    print('movie_id_name_dict is %d from [%s]' % (len(movie_id_name_dict), doulist_file))
    return movie_id_name_dict

# Main entry point of the text-processing step.
def process2corpus():
    """Convert the doulist file into two line-aligned training corpora.

    For every non-blank line of ``DoulistFile`` (one JSON object with a
    ``'movie_names'`` list), writes:

    * one line of tab-separated movie names to ``DoulistCorpusNameFile``;
    * one line of space-separated movie ids to ``DoulistCorpusIdFile``.

    Ids come from ``get_movie_name_id_dict()``. A name missing from that
    vocabulary is written with the id of the ``'<unk>'`` entry rather than
    raising ``KeyError``.
    """
    movie_name_id_dict = get_movie_name_id_dict()
    print('total movie is %d from [%s], [%s]' % (len(movie_name_id_dict), DoulistFile, MovieFile))
    # Fallback id for names that are not in the vocabulary.
    unk_id = movie_name_id_dict['<unk>']
    # Explicit UTF-8 so movie names round-trip regardless of platform locale.
    with open(DoulistFile, encoding='utf-8') as fopen, \
            open(DoulistCorpusNameFile, 'w', encoding='utf-8') as fwrite, \
            open(DoulistCorpusIdFile, 'w', encoding='utf-8') as fwrite_1:
        for line in fopen:
            line = line.strip()
            if not line:
                # Skip blank lines instead of failing in json.loads.
                continue
            doulist_movies = list(json.loads(line)['movie_names'])
            doulist_movie_ids = [
                str(movie_name_id_dict.get(name, unk_id)) for name in doulist_movies
            ]
            fwrite.write('%s\n' % ('\t'.join(doulist_movies)))  # movie names
            fwrite_1.write('%s\n' % (' '.join(doulist_movie_ids)))  # matching ids

2. 用 fasttext 对上一步生成的电影 id 语料(doulist_movie_ids,即 .movie_id 文件)进行训练,生成每部电影对应的向量。
因为之前安装的是 fasttext 的 Python 包,所以没有使用原项目链接的 fasttext 程序,而是直接在 Python 中调用,参数说明参见 fasttext 的 PyPI 页面。

import fasttext
# Train skip-gram vectors on the movie-id corpus with the fasttext Python
# package; the first argument is the corpus file, the second the model path.
# NOTE(review): this corpus name (doulist_0803_23) differs from the file that
# process.py derives from DoulistFile (doulist_0804_09) -- confirm which one
# is intended before running.
model = fasttext.skipgram(
    './datas/doulist_0803_23.movie_id',
    './models/fasttext_model_0803_23',
    min_count=5,
    epoch=50,
    neg=100,
)

之后即生成向量模型。

3.生成向量后,根据夹角余弦计算相似度


def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (|v1| * |v2|)`` where ``|v|`` is the
    Euclidean norm, i.e. ``sqrt(sum(v ** 2))``.

    Args:
        v1: First vector (anything ``numpy`` accepts as a 1-D array).
        v2: Second vector of the same length.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero length (the angle is undefined there, and dividing
        would otherwise produce NaN and break ranking by similarity).
    """
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0
    return np.dot(v1, v2) / n1 / n2


def topk_like(cur_movie_name, k=5, print_log=False):
    """Return the ``k`` movies whose vectors are most similar to a movie.

    Reads the module-level ``movie_name_id_dict`` (name -> id),
    ``movie_id_name_dict`` (id -> name, used only for logging) and
    ``vectors`` (mapping of movie id -> vector).

    Args:
        cur_movie_name: Name of the query movie.
        k: Maximum number of similar movies to return.
        print_log: When true, log the query and each result via ``logger``.

    Returns:
        A list of ``(movie_id, similarity)`` tuples sorted by similarity,
        highest first, with at most ``k`` entries. The query movie itself is
        excluded. Returns ``[]`` when the name is unknown or has no vector.
    """
    import heapq

    if cur_movie_name not in movie_name_id_dict:
        return []
    cur_movie_id = movie_name_id_dict[cur_movie_name]
    if cur_movie_id not in vectors:
        return []
    cur_vec = vectors[cur_movie_id]
    if print_log:
        logger.info('[%d]%s top %d likes:' % (cur_movie_id, cur_movie_name, k))

    # Score lazily and let heapq keep only the best k, so memory stays O(k)
    # and the result is sorted exactly once.
    scored = (
        (movie_id, similarity(cur_vec, vec))
        for movie_id, vec in vectors.items()
        if movie_id != cur_movie_id
    )
    top_likes = heapq.nlargest(k, scored, key=lambda pair: pair[1])

    if print_log:
        for movie_id, sim in top_likes:
            logger.info('[%d]%s %f' % (movie_id, movie_id_name_dict[movie_id], sim))
    return top_likes
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值