前言
根据 Unsplash 网站图片的 tags,计算目标图片的 tags 与其所在类别中其他图片的 tags 之间的余弦相似度,并取相似度最高的若干张图片作为推荐结果。
提示:爬取到的 tags 为 JSON 格式,数据库使用 MongoDB(通过 pymongo 连接)。
一、根据tags,在同一类别和作者中推荐
代码如下:
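# Compute cosine similarity only among images in the same category, sort by similarity, take the top 6, then drop the query image itself to leave the top 5.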
import pymongo
import math
import re
import time
#starttime=time.time()
def compute_cosine(text_a, text_b):
# 找单词及词频
words1 = text_a.split(' ')
words2 = text_b.split(' ')
# print(words1)
words1_dict = {
}
words2_dict = {
}
for word in words1:
# word = word.strip(",.?!;")
word = re.sub('[^a-zA-Z]', '', word)
word = word.lower()
# print(word)
if word != '' and word in words1_dict:
num = words1_dict[word]
words1_dict[word] = num + 1
elif word != '':
words1_dict[word] = 1
else:
continue
for word in words2:
# word = word.strip(",.?!;")
word = re.sub('[^a-zA-Z]', '', word)
word = word.lower()
if word != '' and word in words2_dict:
num = words2_dict[word]
words2_dict[word] = num + 1
elif word != '':
words2_dict[word] = 1
else:
continue
#print(words1_dict)
#print(words2_dict)
# 排序
dic1 = sorted(words1_dict.items(), key=lambda asd: asd[1], reverse=True)
dic2 = sorted(words2_dict.items(), key=lambda asd: asd[1], reverse=True)
#print(dic1)
#print(dic2)
# 得到词向量
words_key = []
for i in range(len(dic1)):
words_key.append(dic1[i][0]) # 向数组中添加元素
for i in range(len(dic2)):
if dic2[i][0] in words_key:
# print 'has_key', dic2[i][0]
pass
else: # 合并
words_key.append(dic2[i][0])
# print(words_key)
vect1 = []
vect2 = []
for word in words_key:
if word in words1_dict:
vect1.append(words1_dict[word])
else:
vect1.append(0)