在做推荐的时候了解到,可以先把电影训练成向量,再根据向量之间的相似度来做推荐。于是下载了 MovieTaster 的代码(源代码地址:https://github.com/lujiaying/MovieTaster-Open ),并用代码自带的数据把该程序运行了一遍。
1. 首先是 process.py。原代码是按 Python 2 写的:其中的 iteritems() 在 Python 3 中要改为 items(),xrange() 要改为 range()。
import json
# Input data files. DoulistFile is read line by line as JSON records that
# carry a 'movie_names' list; MovieFile is only mentioned in a log message
# in the code shown here.
MovieFile = './datas/movie_0804_09.json'
DoulistFile = './datas/doulist_0804_09.json'

# Output corpus files, derived from DoulistFile by swapping the 'json'
# substring for a new suffix: one file holds integer movie ids, the other
# the movie names.
DoulistCorpusIdFile, DoulistCorpusNameFile = (
    DoulistFile.replace('json', suffix) for suffix in ('movie_id', 'movie_name')
)
def get_movie_name_id_dict(doulist_file=DoulistFile, min_word_freq=0):
    """Build a movie-name -> integer-id vocabulary from a doulist file.

    Every non-blank line of ``doulist_file`` is parsed as a JSON object and
    the entries of its ``'movie_names'`` list are counted. Names that occur
    at least ``min_word_freq`` times are numbered from 0 upwards, most
    frequent first (ties broken by name). A final ``'<unk>'`` entry is added
    with id ``len(movies)``.

    Example result: ``{'Movie A': 0, 'Movie B': 1, ..., '<unk>': N}``.

    Args:
        doulist_file: Path of the line-delimited JSON doulist file.
        min_word_freq: Minimum occurrence count for a name to be kept.

    Returns:
        dict mapping each kept movie name (plus ``'<unk>'``) to its id.
    """
    movie_counter = {}
    with open(doulist_file) as fopen:
        for line in fopen:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing in json.loads.
                continue
            doulist_dict = json.loads(line)
            for movie_name in doulist_dict['movie_names']:
                # Count how many doulists mention each movie name.
                movie_counter[movie_name] = movie_counter.get(movie_name, 0) + 1

    # dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    movie_freq = [item for item in movie_counter.items() if item[1] >= min_word_freq]
    movie_counter_sorted = sorted(movie_freq, key=lambda x: (-x[1], x[0]))
    # A comprehension, unlike unpacking zip(*...), also handles an empty list.
    movies = [name for name, _ in movie_counter_sorted]

    movie_name_id_dict = {name: idx for idx, name in enumerate(movies)}
    movie_name_id_dict['<unk>'] = len(movies)
    print('movie_name_id_dict is %d from [%s]' % (len(movie_name_id_dict), doulist_file))
    return movie_name_id_dict
def get_movie_id_name_dict(doulist_file=DoulistFile):
    """Return the inverse vocabulary: integer id -> movie name.

    The ids come from ``get_movie_name_id_dict(doulist_file)``; this function
    simply swaps keys and values. According to the original note it is
    called from the evaluation script.

    Args:
        doulist_file: Path of the line-delimited JSON doulist file.

    Returns:
        dict mapping each id to its movie name (including the ``'<unk>'`` id).
    """
    movie_name_id_dict = get_movie_name_id_dict(doulist_file)
    # dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    movie_id_name_dict = {
        movie_id: movie_name
        for movie_name, movie_id in movie_name_id_dict.items()
    }
    print('movie_id_name_dict is %d from [%s]' % (len(movie_id_name_dict), doulist_file))
    return movie_id_name_dict
def process2corpus():
    """Convert the doulist JSON file into two training corpus files.

    For every non-blank line of ``DoulistFile`` this writes one line to each
    output file:

    * ``DoulistCorpusNameFile`` - the movie names, tab-separated.
    * ``DoulistCorpusIdFile`` - the matching integer ids, space-separated.

    Ids are looked up in the vocabulary from ``get_movie_name_id_dict()``.
    A name missing from that vocabulary is written as the ``'<unk>'`` id
    rather than raising ``KeyError``.
    """
    movie_name_id_dict = get_movie_name_id_dict()
    print('total movie is %d from [%s], [%s]' % (len(movie_name_id_dict), DoulistFile, MovieFile))
    # Fallback id for names that are not in the vocabulary.
    unk_id = movie_name_id_dict['<unk>']
    with open(DoulistFile) as fopen, \
            open(DoulistCorpusNameFile, 'w') as fwrite, \
            open(DoulistCorpusIdFile, 'w') as fwrite_1:
        for line in fopen:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing in json.loads.
                continue
            # Each input line is one JSON record describing a doulist.
            doulist_dict = json.loads(line)
            doulist_movies = list(doulist_dict['movie_names'])
            doulist_movie_ids = [
                str(movie_name_id_dict.get(movie_name, unk_id))
                for movie_name in doulist_movies
            ]
            fwrite.write('%s\n' % ('\t'.join(doulist_movies)))  # movie names
            fwrite_1.write('%s\n' % (' '.join(doulist_movie_ids)))  # matching ids
2.将doulist_movie_ids进行fasttext训练,生成对应的向量。
因为之前安装的是 Python 版的 fasttext(PyPI 上的 fasttext 包),所以没有使用 fasttext 的命令行可执行文件,而是直接在 Python 中调用,各参数的含义参见 fasttext 在 PyPI 上的说明。
import fasttext
# Train a skip-gram embedding on the movie-id corpus with the fasttext
# Python package. The first argument is the input corpus and the second the
# output model path.
# NOTE(review): this corpus is dated 0803_23, whereas the paths configured in
# process.py are dated 0804_09 - confirm which file is meant.
model = fasttext.skipgram(
    './datas/doulist_0803_23.movie_id',
    './models/fasttext_model_0803_23',
    min_count=5,  # presumably the minimum occurrence count - see fasttext docs
    epoch=50,     # presumably the number of training epochs
    neg=100,      # presumably the number of negative samples
)
之后即生成向量模型。
3.生成向量后,根据夹角余弦计算相似度
def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (|v1| * |v2|)``, where ``|v|`` is the
    Euclidean norm from ``np.linalg.norm``.

    Args:
        v1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        v2: Second vector of the same length.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero norm (the angle is undefined, and dividing would
        yield ``nan`` plus a runtime warning).
    """
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        # Avoid 0/0 -> nan, which would break ranking by similarity.
        return 0.0
    return np.dot(v1, v2) / n1 / n2
def topk_like(cur_movie_name, k=5, print_log=False):
    """Return the ``k`` movies whose vectors are most similar to a movie.

    Reads the module-level ``movie_name_id_dict``, ``movie_id_name_dict``
    and ``vectors`` (a mapping of movie id -> vector), and scores every
    other movie with ``similarity``.

    Args:
        cur_movie_name: Name of the query movie.
        k: Number of neighbours to return.
        print_log: When true, log the query and each neighbour through
            ``logger.info``.

    Returns:
        A list of ``(movie_id, similarity)`` tuples sorted by decreasing
        similarity, at most ``k`` long. An empty list is returned when the
        name is not in ``movie_name_id_dict`` or its id has no vector.
    """
    import heapq

    if cur_movie_name not in movie_name_id_dict:
        return []
    cur_movie_id = movie_name_id_dict[cur_movie_name]
    if cur_movie_id not in vectors:
        return []
    cur_vec = vectors[cur_movie_id]

    if print_log:
        logger.info('[%d]%s top %d likes:' % (cur_movie_id, cur_movie_name, k))

    # Score every other movie lazily. heapq.nlargest keeps only the k best,
    # and is equivalent to sorted(..., reverse=True)[:k] on the full list.
    scored = (
        (movie_id, similarity(cur_vec, vec))
        for movie_id, vec in vectors.items()
        if movie_id != cur_movie_id
    )
    top_likes = heapq.nlargest(k, scored, key=lambda pair: pair[1])

    if print_log:
        for movie_id, sim in top_likes:
            logger.info('[%d]%s %f' % (movie_id, movie_id_name_dict[movie_id], sim))
    return top_likes