数据结构:
数据集:在参考链接里面,因为预训练时间太长,所以数据集换成ml-latest-small,取出里面的ratings.csv和movies.csv。
链接:
https://github.com/rexrex9/kb4recMovielensDataProcess
注:每个文件夹下的orginal下面是 原始数据。
代码:
index_2.py
# Reference: https://blog.csdn.net/fuzi2012/article/details/91345164
import pandas as pd
import numpy as np

# Load the movie catalogue and the user ratings.
df_movies = pd.read_csv('../data2/movies.csv')
df_ratings = pd.read_csv('../data2/ratings.csv')

# Lookup tables in both directions: movieId -> title and title -> movieId.
Id_title = dict(zip(df_movies['movieId'].tolist(), df_movies['title'].tolist()))
Title_id = dict(zip(df_movies['title'].tolist(), df_movies['movieId'].tolist()))
# print(type(Title_id))
# print(Title_id)

# Pick five distinct random row positions from each frame; they are only
# consumed by the (disabled) preview print below.
for df in (df_movies, df_ratings):
    rand_idx = np.random.choice(len(df), 5, replace=False)
    # print(df.iloc[rand_idx,:])
import matplotlib.pyplot as plt
# import plotly.plotly as py
# Histogram of the `rating` column of df_ratings on a single 8x6 figure.
plt.figure(figsize=(8, 6))
ax = plt.subplot(111)
ax.set_title("Distribution of Movie Ratings", fontsize=16)
# Hide the top and right borders of the axes frame.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Movie Rating", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.hist(df_ratings['rating'], color="#3F5D7D")
# The figure is built but neither shown nor saved while the call below stays
# commented out.
# plt.show()
# Split the ratings into a training part (70%) and a test part (30%),
# stratified on userId and with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

df_ratings_train, df_ratings_test = train_test_split(
    df_ratings,
    test_size=0.30,
    random_state=15688,
    stratify=df_ratings['userId'],
)
# print("Number of training data: "+str(len(df_ratings_train)))
# print("Number of test data: "+str(len(df_ratings_test)))
#评分>=4,设为1,否则为0
def rating_splitter(df):
    """Collect the movie ids of every (liked, userId) group as a list.

    A rating of 4 or higher is flagged as liked (1), anything else as 0.
    ``df`` is modified in place: a ``liked`` column is added and ``movieId``
    is converted to strings.

    Args:
        df: Ratings frame with ``userId``, ``movieId`` and ``rating`` columns.

    Returns:
        A list of lists of movie-id strings, one inner list for each
        ``(liked, userId)`` combination that occurs in ``df``.
    """
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    # Movie ids become strings so they can serve as word2vec tokens.
    df['movieId'] = df['movieId'].astype('str')

    grouped = df.groupby(['liked', 'userId'])
    movie_lists = []
    for group_key in grouped.groups:
        group_rows = grouped.get_group(group_key)
        movie_lists.append(group_rows['movieId'].tolist())
    return movie_lists
# rating_splitter assigns new columns on the training frame; switch off
# pandas' chained-assignment warning before calling it.
pd.options.mode.chained_assignment = None
splitted_movies = rating_splitter(df_ratings_train)
print(splitted_movies)

# Prepare for word2vec training.
import random
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
# NOTE(review): presumably guards against gensim running without its compiled
# training routines (FAST_VERSION == -1) - confirm against the gensim docs.
assert gensim.models.word2vec.FAST_VERSION > -1

# Shuffle every movie list in place before it is used as a training sentence.
for movie_list in splitted_movies:
    random.shuffle(movie_list)
# Feed the per-user movie lists to the model and train it.
from gensim.models import Word2Vec
import datetime
start = datetime.datetime.now()
# The original author notes that this model had to be given a different name
# to avoid an error, so this first configuration is left commented out.
# model = Word2Vec(sentences = splitted_movies, # We will supply the pre-processed list of moive lists to this parameter
# iter = 5, # epoch
# min_count = 10, # a movie has to appear more than 10 times to be keeped
# # size = 200, # size of the hidden layer
# workers = 4, # specify the number of threads to be used for training
# sg = 1, # Defines the training algorithm. We will use skip-gram so 1 is chosen.
# # hs = 0, # Set to 0, as we are applying negative sampling.
# # negative = 5, # If > 0, negative sampling will be used. We will use a value of 5.
# window = 20)
#
# print("Time passed: " + str(datetime.datetime.now()-start))
# Word2Vec.save('item2vec_2021.h5')
# Skip-gram model with negative sampling; this is the model that model_2.py
# imports as `model_w2v_sg`.
# NOTE(review): `iter` and `size` look like the gensim 3.x keyword names
# (gensim 4 is understood to use `epochs` / `vector_size`) - confirm the
# installed gensim version.
model_w2v_sg = Word2Vec(sentences = splitted_movies,
iter = 10, # number of training epochs
min_count = 5, # NOTE(review): per the gensim docs, ids seen fewer than 5 times are dropped - confirm
size = 300, # size of the hidden layer
workers = 4, # specify the number of threads to be used for training
sg = 1, # 1 selects skip-gram
hs = 0, # 0 because negative sampling is used instead
negative = 5, # > 0 enables negative sampling; 5 is the value used here
window = 20)
print("Time passed: " + str(datetime.datetime.now()-start))
# Persist the trained model next to the script (path is relative to the
# current working directory).
model_w2v_sg.save('item2vec_word2vecSg_2021')
# del model_w2v_sg
# Load the saved model back and dump its vocabulary.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models import Word2Vec

model = Word2Vec.load('item2vec_word2vecSg_2021')
# word_vectors = model.wv
for key, vocab_entry in model.wv.vocab.items():
    print(key)  # the token, i.e. a movie id string
    print(vocab_entry)
model_2.py
from index_2 import Title_id,model_w2v_sg,df_movies,df_ratings_train, df_ratings_test
import requests
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
# Raw string: the Windows path contains backslash sequences (\E, \i, \c, ...)
# that are not valid escapes and trigger a warning in a normal string literal.
# The resulting path is unchanged.
# NOTE(review): this is an absolute path on the original author's machine;
# the same model is saved and reloaded elsewhere under the relative name
# 'item2vec_word2vecSg_2021' - consider using that here as well.
model = Word2Vec.load(r'E:\推荐系统\Embedding\item2vec\code\item2vec_word2vecSg_2021')
# Re-read the raw CSVs; these assignments replace the `df_movies` imported
# from index_2 above.
df_movies = pd.read_csv('../data2/movies.csv')
df_ratings = pd.read_csv('../data2/ratings.csv')
def refine_search(search_term):
    """
    Refine the movie name to be recognized by the recommender.

    Looks the term up through IMDb's title search and rebuilds the first
    matching result as "<title> <trailing text>", moving a leading "The" to
    the end ("The Matrix" -> "Matrix, The"). The trailing text is whatever
    follows the result link inside the table cell (presumably the release
    year, e.g. "(1999)").

    Args:
        search_term (string): Search Term
    Returns:
        refined_term (string or None): a name that can be searched in the
            dataset, or None when the result page contains no usable match.
    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    from urllib.parse import quote_plus

    # Collapse whitespace, then percent-encode so characters such as "&" or
    # "#" in a title cannot break the query string.
    query = quote_plus(" ".join(search_term.split()))
    target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + query + "&s=tt"
    # Bounded wait instead of hanging forever on an unresponsive server.
    html = requests.get(target_url, timeout=10).content
    parsed_html = BeautifulSoup(html, 'html.parser')
    for tag in parsed_html.find_all('td', class_="result_text"):
        search_result = re.findall(r'fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
        if not search_result:
            continue
        title, trailing = search_result[0]
        words = title.split()
        # `words` may be empty when the link text is blank; leave it as is then.
        if words and words[0] == "The":
            title = " ".join(words[1:]) + ", " + words[0]
        return title + " " + trailing.strip()
    return None
def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
    """
    Turn a list of movie name into a list of movie ids. The movie names has to be exactly the same as they are in the dataset.
    Ambiguous movie names can be supplied if useRefineSearch is set to True.

    Names that are not keys of ``Title_id`` are skipped. If refining any name
    fails (network error, no search hit, ...), a notice is printed and the
    whole list is looked up with the names exactly as given.

    Args:
        list_of_movieName (List): A list of movie names.
        useRefineSearch (boolean): Ambiguous movie names can be supplied if useRefineSearch is set to True
    Returns:
        list_of_movie_id (List of strings): A list of movie ids.
    """
    lookup_names = list(list_of_movieName)
    if useRefineSearch:
        try:
            refined_names = []
            for movie_name in lookup_names:
                refined_name = refine_search(movie_name)
                # A non-string result (no hit) fails here and is handled
                # like any other refinement failure.
                print("Refined Name: " + refined_name)
                refined_names.append(refined_name)
        except Exception as err:
            # Refinement is best-effort: it depends on the network and on
            # third-party page markup, so any failure falls back to the
            # names as given instead of aborting or returning a partial list.
            print("Refine search failed (" + repr(err) + "); using the names as given.")
        else:
            lookup_names = refined_names
    return [str(Title_id[name]) for name in lookup_names if name in Title_id]
def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
    """
    Recommend movies similar to ``positive_list`` and dissimilar to ``negative_list``.

    Movie names are converted to ids with ``produce_list_of_movieId``. Ids
    that are not in the vocabulary of ``model_w2v_sg`` are dropped before
    querying, because the model raises on unknown keys.

    Args:
        positive_list (List or None): Names of movies the result should resemble.
        negative_list (List or None): Names of movies the result should avoid.
        useRefineSearch (boolean): Passed through to ``produce_list_of_movieId``.
        topn (int): Maximum number of recommendations.
    Returns:
        recommend_movie_ls (List of strings): Recommended movie ids; empty
            when none of the supplied movies is known to the model.
    """
    word_vectors = model_w2v_sg.wv

    def _known_ids(movie_names):
        # Resolve names to ids and keep only ids the model has a vector for.
        if not movie_names:
            return []
        movie_ids = produce_list_of_movieId(movie_names, useRefineSearch)
        return [movie_id for movie_id in movie_ids if movie_id in word_vectors]

    positive_ids = _known_ids(positive_list)
    negative_ids = _known_ids(negative_list)
    if not positive_ids and not negative_ids:
        # Nothing to query with; the model would raise on empty input.
        return []

    similar = word_vectors.most_similar_cosmul(positive=positive_ids, negative=negative_ids, topn=topn)
    return [hit[0] for hit in similar]
ls = recommender(positive_list=["Sabrina (1995)"], useRefineSearch=False, topn=5)
# print('Recommendation Result based on "Sabrina (1995)":')
# The recommender returns movie ids as strings, whereas pandas parses the
# movieId column of movies.csv as numbers; compare on the string form so the
# recommended rows are actually selected.
print(df_movies[df_movies['movieId'].astype(str).isin(ls)])
#评估模型
def user_liked_movies_builder(model, df, for_prediction=False):
    """
    Map every user to the list of movie ids they liked (rating >= 4).

    ``df`` is modified in place: a 0/1 ``liked`` column is added and
    ``movieId`` is converted to strings.

    Args:
        model: Object exposing ``model.wv.vocab``; only read when
            ``for_prediction`` is True.
        df: Ratings frame with ``userId``, ``movieId`` and ``rating`` columns.
        for_prediction (boolean): If True, additionally keep only movies that
            are in the model vocabulary, so the lists can be passed to the
            model as query input.
    Returns:
        user_liked_movies (dict): ``{userId: [movieId, ...]}``; users without
            any qualifying movie are absent.
    """
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    df['movieId'] = df['movieId'].astype('str')
    df_liked = df[df['liked'] == 1]
    if for_prediction:
        # Narrow the liked rows further. Filtering `df` here instead would
        # silently bring the not-liked movies back.
        df_liked = df_liked[df_liked['movieId'].isin(list(model.wv.vocab))]
    user_liked_movies = df_liked.groupby('userId')['movieId'].apply(list).to_dict()
    return user_liked_movies
def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
    """
    Compute precision, recall and F1 of the top-``topn`` recommendations.

    For every user present in both dictionaries, the model is queried with
    the user's training movies and the returned ids are compared with the
    user's test movies. Counts are pooled over all users before the ratios
    are taken.

    Args:
        model: Object exposing ``model.wv.most_similar_cosmul(positive=..., topn=...)``
            that returns an iterable of ``(movieId, score)`` pairs.
        user_liked_movies_test (dict): ``{userId: [movieId, ...]}`` ground truth.
        user_liked_movies_training (dict): ``{userId: [movieId, ...]}`` query input.
        topn (int): Number of recommendations requested per user.
    Returns:
        list: ``[precision, recall, f1]``. All three are 0.0 when there is no
            correct recommendation at all (including no common users).
    """
    common_users = set(user_liked_movies_test).intersection(user_liked_movies_training)
    sum_liked = 0
    sum_correct = 0
    for userid in common_users:
        current_test_set = set(user_liked_movies_test[userid])
        similar = model.wv.most_similar_cosmul(positive=user_liked_movies_training[userid], topn=topn)
        predicted = {hit[0] for hit in similar}
        sum_correct += len(predicted & current_test_set)
        sum_liked += len(current_test_set)

    if sum_correct == 0:
        # Covers every zero-denominator case: no common users, no liked test
        # movies, topn == 0, or simply no hit (precision and recall both 0).
        return [0.0, 0.0, 0.0]

    precision_at_m = sum_correct / (topn * len(common_users))
    recall_at_m = sum_correct / sum_liked
    f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
    return [precision_at_m, recall_at_m, f1]
# user_liked_movies_builder assigns new columns on the frames it receives;
# switch off pandas' chained-assignment warning first.
pd.options.mode.chained_assignment = None
# Build {userId: [movieId, ...]} for both splits. The training side is
# restricted to the model vocabulary so it can be used as query input.
# `model` here is the one bound earlier in this script.
user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)
# NOTE(review): the model is loaded a second time, now from a path relative
# to the working directory - confirm it is the same file as the one loaded
# earlier.
model = Word2Vec.load('item2vec_word2vecSg_2021')
model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
# Drop the reference to the model once the scores are computed.
del model
print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
print(model_score_sg1)
结果: