数据和工具
1. GloVe:https://nlp.stanford.edu/projects/glove/
2. 书单数据:待审核
代码
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from scipy import stats
import sys
import os
import math
import json
import heapq
# Alternative embedding file (a fastText CBOW .vec model), kept for reference:
#VecFile = './models/fasttext_model_0804_09_cbow.vec'
# Path of the embedding text file; used as the default for load_vectors()
# and passed to it when the module-level `vectors` table is built.
VecFile = '../glove/vectors.txt'
class minHeap(object):
    """Bounded min-heap that retains the ``k`` largest items offered to it.

    The smallest retained item is kept at the root, so deciding whether a
    new item belongs in the current top-k is a single comparison against
    ``get_min()``.
    """

    def __init__(self, k):
        """Create an empty heap that will hold at most ``k`` items.

        A capacity of ``k <= 0`` is allowed and yields a heap that never
        stores anything.
        """
        self._k = k
        self._heap = []

    def add(self, item):
        """Offer ``item`` to the heap.

        While fewer than ``k`` items are stored the item is always kept.
        Once the heap is full, the item replaces the current minimum only
        when it is strictly greater; otherwise it is discarded.
        """
        if len(self._heap) < self._k:
            # O(log k) sift-up instead of re-heapifying the whole list.
            heapq.heappush(self._heap, item)
        elif self._heap and item > self._heap[0]:
            # Pop the minimum and push the new item in one O(log k) step.
            # The `self._heap` guard keeps k <= 0 from indexing an empty list.
            heapq.heapreplace(self._heap, item)

    def get_min(self):
        """Return the smallest retained item, or ``-2`` when empty.

        ``-2`` is a sentinel; it lies below every cosine similarity
        (range [-1, 1]), which is what ``topk_like`` stores in this heap.
        """
        if self._heap:
            return self._heap[0]
        return -2

    def get_all(self):
        """Return the internal list of retained items (heap order, not sorted)."""
        return self._heap
def similarity(v1, v2):
    """Return the cosine similarity between ``v1`` and ``v2``.

    Args:
        v1: sequence or numpy array of numbers.
        v2: sequence or numpy array of the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero norm
        the cosine is undefined; ``0.0`` is returned in that case so that
        callers ranking by similarity never have to compare against nan.
    """
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        # Avoid 0/0 -> nan (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(v1, v2) / n1 / n2
def load_vectors(input_file=VecFile):
    """Load a whitespace-separated embedding text file into a dict.

    Every data line has the form ``<token> <float> <float> ...``. When a
    token occurs more than once, the first occurrence is kept.

    Args:
        input_file: path of the vector file to read; defaults to ``VecFile``.

    Returns:
        dict mapping each token (text string) to a 1-D float numpy array.

    Raises:
        IOError / OSError: the file cannot be opened.
        ValueError: a vector component is not a valid float, or the file
            is not valid UTF-8.
    """
    # Local import keeps this block self-contained; io.open decodes to text
    # on both Python 2 and Python 3, so keys compare equal to unicode
    # lookup strings such as u'...'.
    import io

    vectors = {}
    with io.open(input_file, encoding='utf-8') as fopen:
        # Iterate the file lazily. The previous readlines() call consumed
        # the whole file up front, leaving nothing for the loop to parse.
        for line_no, line in enumerate(fopen):
            line_list = line.split()
            if len(line_list) < 2:
                # Blank line, or a token with no components: nothing usable.
                continue
            if (line_no == 0 and len(line_list) == 2
                    and line_list[0].isdigit() and line_list[1].isdigit()):
                # word2vec/fastText text files start with a
                # "<vocab_size> <dim>" header; GloVe files do not.
                continue
            movie_name = line_list[0]
            if movie_name in vectors:
                continue
            vectors[movie_name] = np.array(
                [float(_) for _ in line_list[1:]], dtype=float)
    return vectors
def topk_like(cur_movie_name, k=5, print_log=False):
    """Return the ``k`` entries most similar to ``cur_movie_name``.

    Similarity is the cosine between the query's vector and every other
    vector in the module-level ``vectors`` table.

    Args:
        cur_movie_name: key to look up in ``vectors``.
        k: maximum number of neighbours to return; ``k <= 0`` yields ``[]``.
        print_log: when true, also print the query header and one line per
            neighbour.

    Returns:
        List of ``(name, similarity)`` tuples sorted by descending
        similarity, at most ``k`` long. An empty list (after printing a
        notice) when ``cur_movie_name`` is not in ``vectors``.
    """
    # Membership test on the dict itself: O(1), no key list built on Python 2.
    if cur_movie_name not in vectors:
        print(cur_movie_name+" not in vector")
        return []
    cur_vec = vectors[cur_movie_name]
    if print_log:
        print('%s top %d likes:' % (cur_movie_name, k))

    # Lazily score every other entry; nlargest keeps only k of them in memory.
    scored = (
        (movie_name, similarity(cur_vec, vec))
        for movie_name, vec in vectors.items()
        if movie_name != cur_movie_name
    )
    # Equivalent to sorted(scored, key=..., reverse=True)[:k], without
    # sorting the full candidate list.
    top = heapq.nlargest(k, scored, key=lambda pair: pair[1])

    if print_log:
        for t in top:
            print('\t%s %f' % (t[0],t[1]))
    return top
# Embedding table shared by topk_like(). It is built at import time, so
# importing this module reads the file named by VecFile.
vectors = load_vectors(VecFile)

if __name__ == '__main__':
    # Demo: print the top-5 neighbours for a handful of sample titles.
    for sample_title in (u'追风筝的人', u'看见', u'长尾理论', u'从0到1'):
        topk_like(sample_title, print_log=True)
    #generate_movie_topk_like_result('./output/leancloud_movie_fasttext.json', k=25)
效果
追风筝的人 top 5 likes:
HTML 0.433426
陆小凤传奇 0.423955
百年孤独 0.408164
战胜华尔街 0.403829
沉默的大多数 0.390571
看见 top 5 likes:
雪山飞狐 0.418104
鹿鼎记(全五册) 0.407155
概念力 0.405503
梦里花落知多少 0.377425
股票大作手操盘术 0.353401
长尾理论 top 5 likes:
怪诞行为学 0.664615
全球32位顶尖广告文案的写作之道 0.460249
人人都是产品经理 0.429592
傲慢与偏见 0.412715
七堂极简物理课 0.357686
从0到1 top 5 likes:
创业维艰 0.696322
浪潮之巅 0.477051
诛仙 0.358522
Scikit-Learn 0.354489
Book 0.334162