embedding模型研究
嵌入(Embeddings)是机器学习领域中的一个概念,主要用于将高维的数据转化为低维空间,以便于算法更好地处理和理解数据。嵌入通常用于将离散的、高维的特征转换为连续的、低维的向量表示。
本周学习item2vec
2.实践
用word2vec实现item2vec。
import pandas as pd
import numpy as np

# Load the MovieLens 20M data set (movie metadata + user ratings).
df_movies = pd.read_csv('./ml-20m/movies.csv')
df_ratings = pd.read_csv('./ml-20m/ratings.csv')

# Bidirectional lookup tables between movieId and movie title.
movieId_to_name = pd.Series(df_movies.title.values, index=df_movies.movieId.values).to_dict()
name_to_movieId = pd.Series(df_movies.movieId.values, index=df_movies.title).to_dict()

# Randomly display 5 records from each dataframe.
# NOTE: the loop body had lost its indentation in the original paste; restored here.
for df in (df_movies, df_ratings):
    rand_idx = np.random.choice(len(df), 5, replace=False)
    display(df.iloc[rand_idx, :])  # `display` is an IPython/Jupyter builtin
    print('Displaying 5 of the total ' + str(len(df)) + ' data points')
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
plt.figure(figsize=(8,6))
ax = plt.subplot(111)
ax.set_title('Distribution of Movie Ratings', fontsize=16)
# Hide the top/right spines for a cleaner plot frame.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Movie Rating', fontsize=14)
plt.ylabel('Count', fontsize=14)
# Histogram of all rating values in the ratings frame.
plt.hist(df_ratings['rating'], color='#3F5D7D')
plt.show()
from sklearn.model_selection import train_test_split

# 70/30 split, stratified per user so every user appears in both sets
# (required later when comparing train vs. test liked-movie lists per user).
df_ratings_train, df_rating_test = train_test_split(
    df_ratings,
    stratify=df_ratings['userId'],
    random_state=15688,
    test_size=0.30,
)
print(f'Number of training data:{len(df_ratings_train)}')
print(f'Number of test data:{len(df_rating_test)}')
def rating_splitter(df):
    """Split each user's rated movies into "liked" and "disliked" baskets.

    A movie is "liked" when its rating is >= 4. Each (liked, userId) group
    becomes one list of movieId strings — these lists are the "sentences"
    fed to word2vec for item2vec training.

    NOTE: mutates ``df`` in place (adds a 'liked' column and casts
    'movieId' to str), matching the original notebook behaviour.

    Parameters
    ----------
    df : pandas.DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns
    -------
    list[list[str]] : one movieId list per (liked, userId) group,
        in groupby key order.
    """
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    df['movieId'] = df['movieId'].astype(str)
    gp_user_like = df.groupby(['liked', 'userId'])
    return [gp_user_like.get_group(gp)['movieId'].tolist() for gp in gp_user_like.groups]
# Suppress SettingWithCopyWarning: rating_splitter mutates the frame on purpose.
pd.options.mode.chained_assignment = None
splitted_movies = rating_splitter(df_ratings_train)
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
# Make sure the fast (C-compiled) word2vec training routines are available.
assert gensim.models.word2vec.FAST_VERSION > -1

import random

# Shuffle each user's movie basket in place: item2vec treats the lists as
# unordered sets, so position should carry no signal.
# NOTE: the loop body had lost its indentation in the original paste; restored here.
for movie_list in splitted_movies:
    random.shuffle(movie_list)
from gensim.models import Word2Vec
import datetime
start = datetime.datetime.now()
# item2vec = word2vec skip-gram (sg=1) with negative sampling (hs=0, negative=5).
# NOTE: `iter`/`size` are the gensim < 4.0 names for epochs / vector_size —
# this code targets the pre-4.0 gensim API (it also uses model.wv.vocab later).
# window=9999999 makes every item in a basket a context of every other item,
# i.e. order-free co-occurrence, which is the item2vec formulation.
model = Word2Vec(sentences=splitted_movies, iter=2, min_count=10, size=30, workers=4, sg=1, hs=0, negative=5, window=9999999)
print('Time passed:' + str(datetime.datetime.now()-start))
model.save('item2vec_20200823')
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
# Reload the saved item2vec model from disk.
from gensim.models import Word2Vec
model = Word2Vec.load('item2vec_20200823')
word_vectors = model.wv  # keyed vectors: one embedding per movieId string
接下来将训练好的模型应用到推荐中,主要使用 most_similar_cosmul 方法在向量空间中检索相似电影作为候选。
import requests
import re
from bs4 import BeautifulSoup

def refine_search(search_term):
    """Search IMDb for ``search_term`` and return the title reformatted to
    MovieLens style (leading article moved to the end, year appended,
    e.g. 'The Matrix' -> 'Matrix, The (1999)').

    Returns
    -------
    str | None : the refined title, or None when no search result matched
        (the original fell through with an implicit/undefined return;
        made explicit here).
    """
    target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q="+"+".join(search_term.split())+"&s=tt"
    html = requests.get(target_url).content
    parsed_html = BeautifulSoup(html, 'html.parser')
    for tag in parsed_html.find_all('td', class_="result_text"):
        # NOTE(review): this regex is tied to one specific IMDb page layout
        # and will silently stop matching if IMDb changes its markup.
        search_result = re.findall('fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
        if not search_result:
            continue
        title = search_result[0][0]
        year = search_result[0][1].strip()
        words = title.split()
        if words[0] == "The":
            # MovieLens stores leading articles at the end: 'Matrix, The'.
            title = " ".join(words[1:]) + ", " + words[0]
        return title + " " + year
    return None
def product_list_of_movieId(list_of_movieName, useRefineSearch=False, name_to_id=None):
    """Map a list of movie titles to their movieId strings.

    Parameters
    ----------
    list_of_movieName : list[str] of MovieLens-style titles.
    useRefineSearch : bool — when True, normalise each title through the
        IMDb lookup (``refine_search``) before the dictionary lookup.
    name_to_id : dict[str, int] mapping title -> movieId. Defaults to the
        module-level ``name_to_movieId`` table; exposed as a parameter so
        the function is testable and reusable with other catalogues.

    Returns
    -------
    list[str] : movieIds (as strings) for the titles that were found;
        unknown titles are skipped silently.
    """
    if name_to_id is None:
        name_to_id = name_to_movieId
    list_of_movie_id = []
    for movieName in list_of_movieName:
        print(movieName)
        if useRefineSearch:
            movieName = refine_search(movieName)
            print("Refined Name: " + movieName)
        if movieName in name_to_id:  # direct membership test — no .keys() needed
            list_of_movie_id.append(str(name_to_id[movieName]))
            print(str(name_to_id[movieName]))
    return list_of_movie_id
def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
    """Recommend movieIds from the item2vec space via most_similar_cosmul.

    Parameters
    ----------
    positive_list / negative_list : list[str] | None — movie titles the
        recommendations should be similar / dissimilar to.
    useRefineSearch : bool — normalise titles through IMDb search first.
    topn : int — number of recommendations to return.

    Returns
    -------
    list[str] : recommended movieIds, most similar first.

    Relies on the module-level ``model`` (a trained gensim Word2Vec).
    """
    if positive_list:
        positive_list = product_list_of_movieId(positive_list, useRefineSearch)
    if negative_list:
        negative_list = product_list_of_movieId(negative_list, useRefineSearch)
    print('positive_list:', positive_list)
    print('negative_list:', negative_list)
    # most_similar_cosmul combines cosine similarities multiplicatively,
    # so one distant negative item cannot dominate the ranking.
    similar = model.wv.most_similar_cosmul(
        positive=positive_list, negative=negative_list, topn=topn)
    return [movieId for movieId, _prob in similar]
ls = recommender(positive_list=['Light It Up (1999)'], useRefineSearch=False, topn=5)
# Fixed message: the original said "Up", but the query is 'Light It Up (1999)'.
print('Recommendation Result based on "Light It Up (1999)":')
display(df_movies[df_movies['movieId'].isin(ls)])

# Vector arithmetic demo: similar to The Matrix, dissimilar to Django Unchained.
ls = recommender(positive_list=['Matrix, The (1999)'], negative_list=['Django Unchained (2012)'], useRefineSearch=False, topn=7)
print('Recommendation Result based on "The Matrix(1999)" minus "Django Unchained(2012)":')
display(df_movies[df_movies['movieId'].isin(ls)])
评估模型的效果
def user_liked_movies_builder(model, df, for_prediction=False):
    """Build {userId: [movieId, ...]} of the movies each user liked.

    A movie counts as "liked" when its rating is >= 4. When
    ``for_prediction`` is True, liked movies are additionally restricted
    to the model vocabulary, because most_similar_cosmul only accepts
    in-vocabulary items.

    NOTE: mutates ``df`` in place (adds 'liked', casts 'movieId' to str).
    NOTE: ``model.wv.vocab`` is the gensim < 4.0 vocabulary attribute.

    BUG FIX: the original for_prediction branch re-filtered ``df`` (all
    ratings) instead of ``df_liked``, silently dropping the rating >= 4
    condition; it now filters the liked subset.
    """
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    df['movieId'] = df['movieId'].astype('str')
    df_liked = df[df['liked'] == 1]
    if for_prediction:
        df_liked = df_liked[df_liked['movieId'].isin(model.wv.vocab.keys())]
    user_liked_movies = df_liked.groupby('userId').agg({'movieId': lambda x: x.tolist()})['movieId'].to_dict()
    return user_liked_movies
def scores_at_m(model, user_liked_movies_test, user_liked_movies_trainings, topn=10):
    """Precision@m, recall@m and F1 for the item2vec recommender.

    For every user present in both dicts, recommend ``topn`` movies from
    the user's liked training movies and count hits against the user's
    liked test movies.

    Returns
    -------
    list[float] : [precision_at_m, recall_at_m, f1].

    BUG FIXES vs original: 'mode' -> 'model' typo; the loop referenced an
    undefined name 'user_liked_movies_training' (the parameter is
    'user_liked_movies_trainings'); removed the unused 'sum_total'.
    """
    sum_liked = 0
    sum_correct = 0
    common_users = set(user_liked_movies_test).intersection(user_liked_movies_trainings)
    for userid in common_users:
        current_test_set = set(user_liked_movies_test[userid])
        pred = [pred_result[0] for pred_result in model.wv.most_similar_cosmul(
            positive=user_liked_movies_trainings[userid], topn=topn)]
        sum_correct += len(set(pred).intersection(current_test_set))
        sum_liked += len(current_test_set)
    precision_at_m = sum_correct / (topn * len(common_users))
    recall_at_m = sum_correct / sum_liked
    # Harmonic mean; guard the all-miss case (the original 2/((1/p)+(1/r))
    # would raise ZeroDivisionError when precision or recall is 0).
    denom = precision_at_m + recall_at_m
    f1 = 2 * precision_at_m * recall_at_m / denom if denom else 0.0
    return [precision_at_m, recall_at_m, f1]
# Silence pandas chained-assignment warnings: the builder mutates frames on purpose.
pd.options.mode.chained_assignment = None

model = Word2Vec.load('item2vec_20200823')
user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
user_liked_movies_test = user_liked_movies_builder(model, df_rating_test)
# BUG FIX: the original called the undefined name 'score_at_m'.
model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
del model  # free the embedding matrices once scoring is done
print('Respectively, the [precision, recall, F-1 score] at 10 for our model are:')
print(model_score_sg1)