Item2vec

数据结构:

 

数据集:参考链接中使用的数据集预训练时间太长,因此这里换成 ml-latest-small,只取其中的 ratings.csv 和 movies.csv。 

链接:

https://github.com/rexrex9/kb4recMovielensDataProcess

注:每个文件夹下的orginal下面是 原始数据。

代码: 

index_2.py

# 参考链接: https://blog.csdn.net/fuzi2012/article/details/91345164
import  pandas as pd
import  numpy as np
# Load the two MovieLens tables used by the rest of the script.
df_ratings = pd.read_csv('../data2/ratings.csv')
df_movies = pd.read_csv('../data2/movies.csv')

# Lookup tables between movie ids and titles (movieId -> title, title -> movieId).
Title_id = df_movies.set_index('title')['movieId'].to_dict()
Id_title = df_movies.set_index('movieId')['title'].to_dict()
# print(type(Title_id))
# print(Title_id)

# Draw five distinct random row positions from each table; the print that
# would display the sampled rows is left disabled below.
for df in (df_movies, df_ratings):
    rand_idx = np.random.choice(len(df), size=5, replace=False)
# print(df.iloc[rand_idx,:])

import matplotlib.pyplot as plt
# import plotly.plotly as py

# Histogram of the raw rating values.
plt.figure(figsize=(8, 6))
ax = plt.subplot(111)
ax.set_title("Distribution of Movie Ratings", fontsize=16)
# Hide the top and right borders of the plot area.
for hidden_side in ("top", "right"):
    ax.spines[hidden_side].set_visible(False)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.xlabel("Movie Rating", fontsize=14)
plt.ylabel("Count", fontsize=14)

plt.hist(df_ratings['rating'], color="#3F5D7D")

# plt.show()

# Split the ratings into a training set and a test set.
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on userId and seeded so the split is reproducible.
df_ratings_train, df_ratings_test = train_test_split(
    df_ratings,
    test_size=0.30,
    random_state=15688,
    stratify=df_ratings['userId'],
)

# print("Number of training data: "+str(len(df_ratings_train)))
# print("Number of test data: "+str(len(df_ratings_test)))
# A rating of 4 or higher counts as "liked" (1); anything lower is 0.
def rating_splitter(df):
    """
    Group the rated movies into one id list per (liked, userId) pair.

    Every user contributes up to two lists: the movies rated >= 4 and the
    movies rated below 4. Movie ids are returned as strings so the lists can
    be used directly as word2vec "sentences".

    The input frame is left untouched; the derived columns live on a copy.

    Args:
        df (DataFrame): ratings with 'userId', 'movieId' and 'rating' columns.

    Returns:
        list of lists of strings, ordered by (liked, userId).
    """
    labelled = df.assign(
        liked=np.where(df['rating'] >= 4, 1, 0),
        movieId=df['movieId'].astype('str'),
    )
    return [group['movieId'].tolist()
            for _, group in labelled.groupby(['liked', 'userId'])]
# Disable pandas' chained-assignment warning for the rest of the script.
pd.options.mode.chained_assignment = None
# rating_splitter returns one list of movie-id strings per (liked, userId)
# group of the training ratings; these lists are the word2vec input.
splitted_movies =rating_splitter(df_ratings_train)
# Debug output: prints every list in full.
print(splitted_movies)

# Train a word2vec model on the per-user movie lists
import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
import gensim
# NOTE(review): presumably checks that gensim's compiled (fast) training
# routines are available -- confirm against the installed gensim version.
assert gensim.models.word2vec.FAST_VERSION>-1
import random
# Shuffle every movie list in place so the order inside a list is random
for movie_list in splitted_movies:
    random.shuffle(movie_list)

# Feed the lists to the model and train it
from gensim.models import Word2Vec
import datetime
start = datetime.datetime.now()
# This first model would need a different variable name, otherwise it raises
# an error, so it is left commented out
# model = Word2Vec(sentences = splitted_movies, # We will supply the pre-processed list of moive lists to this parameter
#                  iter = 5, # epoch
#                  min_count = 10, # a movie has to appear more than 10 times to be keeped
#                  # size = 200, # size of the hidden layer
#                  workers = 4, # specify the number of threads to be used for training
#                  sg = 1, # Defines the training algorithm. We will use skip-gram so 1 is chosen.
#                  # hs = 0, # Set to 0, as we are applying negative sampling.
#                  # negative = 5, # If > 0, negative sampling will be used. We will use a value of 5.
#                  window = 20)
#
# print("Time passed: " + str(datetime.datetime.now()-start))
# Word2Vec.save('item2vec_2021.h5')
model_w2v_sg = Word2Vec(sentences = splitted_movies,
                        iter = 10, # number of training epochs
                        min_count = 5, # a movie has to appear at least 5 times to be kept
                        size = 300, # size of the hidden layer
                        workers = 4, # specify the number of threads to be used for training
                        sg = 1, # training algorithm: 1 selects skip-gram
                        hs = 0, # set to 0 because negative sampling is used
                        negative = 5, # > 0 enables negative sampling; 5 is used here
                        window = 20)

# Elapsed time is measured from `start`, i.e. it covers the training above.
print("Time passed: " + str(datetime.datetime.now()-start))
# Persist the trained model in the current working directory.
model_w2v_sg.save('item2vec_word2vecSg_2021')
# del model_w2v_sg

# Reload the saved model from disk
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec
model = Word2Vec.load('item2vec_word2vecSg_2021')
# word_vectors = model.wv
# Dump the vocabulary: each key is a token (a movie id string) followed by
# its vocabulary entry.
for key, vocab_entry in model.wv.vocab.items():
    print(key)
    print(vocab_entry)

model_2.py

from index_2 import Title_id,model_w2v_sg,df_movies,df_ratings_train, df_ratings_test
import requests
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
# Raw string: the Windows path is full of backslashes (\E, \i, \c ...) that
# are invalid escape sequences in a normal literal. The runtime value is
# unchanged, but the literal no longer triggers invalid-escape warnings.
model = Word2Vec.load(r'E:\推荐系统\Embedding\item2vec\code\item2vec_word2vecSg_2021')
# Re-read the CSVs; this rebinds the df_movies name imported from index_2.
df_movies=pd.read_csv('../data2/movies.csv')
df_ratings=pd.read_csv('../data2/ratings.csv')

def refine_search(search_term):
    """
    Refine the movie name to be recognized by the recommender.

    Queries the IMDb title search page and rebuilds the first parsed hit as
    "<title> <suffix>". A leading "The" is moved to the end of the title
    ("The Matrix" -> "Matrix, The") before the suffix is appended.

    Args:
        search_term (string): Search Term

    Returns:
        refined_term (string): a name that can be search in the dataset.
        When no result can be parsed from the page, search_term is returned
        unchanged instead of failing on an unbound variable.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    from urllib.parse import quote_plus

    # Collapse whitespace, then URL-encode so characters such as '&' or
    # non-ASCII text cannot corrupt the query string.
    query = quote_plus(" ".join(search_term.split()))
    target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + query + "&s=tt"
    # A timeout keeps an unresponsive server from blocking forever.
    html = requests.get(target_url, timeout=10).content
    parsed_html = BeautifulSoup(html, 'html.parser')
    for tag in parsed_html.find_all('td', class_="result_text"):
        search_result = re.findall('fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
        if not search_result:
            continue
        title, suffix = search_result[0]
        words = title.split()
        # Guard against an empty title before looking at its first word.
        if words and words[0] == "The":
            title = " ".join(words[1:]) + ", " + words[0]
        return title + " " + suffix.strip()
    # Nothing matched: fall back to the caller's own term.
    return search_term


def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
    """
    Turn a list of movie name into a list of movie ids. The movie names has to be exactly the same as they are in the dataset.
    Ambiguous movie names can be supplied if useRefineSearch is set to True

    Names that are not found in Title_id are skipped. When the refine search
    fails for a name, that name is looked up as given and the remaining names
    are still processed.

    Args:
        list_of_movieName (List): A list of movie names.
        useRefineSearch (boolean): Ambiguous movie names can be supplied if useRefineSearch is set to True

    Returns:
        list_of_movie_id (List of strings): A list of movie ids.
    """
    list_of_movie_id = []

    for movieName in list_of_movieName:
        if useRefineSearch:
            try:
                refined_name = refine_search(movieName)
            except Exception as err:
                # Best effort: the refine step depends on the network and on
                # page parsing, so any failure falls back to the raw name.
                print("Refine search failed for " + str(movieName) + ": " + str(err))
            else:
                movieName = refined_name
                print("Refined Name: " + movieName)
        if movieName in Title_id:
            list_of_movie_id.append(str(Title_id[movieName]))
    return list_of_movie_id


def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
    """
    Recommend movies similar to the positive titles and unlike the negative ones.

    Args:
        positive_list (List): movie names the recommendations should resemble.
        negative_list (List): movie names the recommendations should avoid.
        useRefineSearch (boolean): forwarded to produce_list_of_movieId.
        topn (int): number of recommendations requested from the model.

    Returns:
        recommend_movie_ls (List of strings): recommended movie ids; an empty
        list when none of the supplied names resolves to a movie id.
    """
    if positive_list:
        positive_list = produce_list_of_movieId(positive_list, useRefineSearch)
    if negative_list:
        negative_list = produce_list_of_movieId(negative_list, useRefineSearch)
    if not positive_list and not negative_list:
        # With no input ids at all the gensim similarity query raises
        # ValueError; an empty recommendation list is the useful answer.
        return []
    similar = model_w2v_sg.wv.most_similar_cosmul(
        positive=positive_list, negative=negative_list, topn=topn)
    # Each result is a (movieId, score) pair; only the ids are returned.
    return [movieId for movieId, _score in similar]

ls = recommender(positive_list=["Sabrina (1995)"], useRefineSearch=False, topn=5)
# print('Recommendation Result based on "Sabrina (1995)":')
# The recommender returns movie ids as strings (the model was trained on
# string ids), while movies.csv is read with integer ids. Compare as strings
# so the lookup does not depend on how pandas treats int-vs-str in isin.
print(df_movies[df_movies['movieId'].astype(str).isin(ls)])
# Model evaluation
def user_liked_movies_builder(model, df, for_prediction=False):
    """
    Map every user id to the list of movie ids (as strings) that user liked.

    A movie counts as liked when its rating is >= 4. The input frame is not
    modified; the derived columns are computed on a copy.

    Args:
        model: trained word2vec model; only model.wv.vocab is read, and only
            when for_prediction is True.
        df (DataFrame): ratings with 'userId', 'movieId' and 'rating' columns.
        for_prediction (boolean): when True, liked movies that are missing
            from the model vocabulary are dropped, so every returned id can
            be used as model input.

    Returns:
        user_liked_movies (dict): userId -> list of liked movie id strings.
        Users without any qualifying movie are absent from the dict.
    """
    labelled = df.assign(
        liked=np.where(df['rating'] >= 4, 1, 0),
        movieId=df['movieId'].astype('str'),
    )
    df_liked = labelled[labelled['liked'] == 1]
    if for_prediction:
        # Filter the liked rows (not the full frame): filtering `df` here
        # would silently bring the disliked movies back in.
        df_liked = df_liked[df_liked['movieId'].isin(list(model.wv.vocab))]

    user_liked_movies = {
        user_id: movie_ids.tolist()
        for user_id, movie_ids in df_liked.groupby('userId')['movieId']
    }

    return user_liked_movies


def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
    """
    Compute precision, recall and F1 of the top-`topn` recommendations.

    For every user present in both dicts, the model is queried with the
    user's training movies and the `topn` predictions are compared with the
    movies that user liked in the test set.

    Args:
        model: object exposing model.wv.most_similar_cosmul(positive=, topn=).
        user_liked_movies_test (dict): userId -> liked movie ids (test set).
        user_liked_movies_training (dict): userId -> liked movie ids (training set).
        topn (int): number of recommendations requested per user.

    Returns:
        [precision_at_m, recall_at_m, f1]. A metric whose denominator is
        zero (no common users, no liked test movies, or no correct
        prediction) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    common_users = set(user_liked_movies_test).intersection(user_liked_movies_training)

    sum_liked = 0
    sum_correct = 0
    for userid in common_users:
        current_test_set = set(user_liked_movies_test[userid])
        predictions = model.wv.most_similar_cosmul(
            positive=user_liked_movies_training[userid], topn=topn)
        pred = {movie_id for movie_id, _score in predictions}
        sum_correct += len(pred & current_test_set)
        sum_liked += len(current_test_set)

    recommended_total = topn * len(common_users)
    precision_at_m = sum_correct / recommended_total if recommended_total else 0.0
    recall_at_m = sum_correct / sum_liked if sum_liked else 0.0
    if precision_at_m and recall_at_m:
        # Harmonic mean of precision and recall.
        f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
    else:
        f1 = 0.0
    return [precision_at_m, recall_at_m, f1]

# Disable pandas' chained-assignment warning before building the per-user lists.
pd.options.mode.chained_assignment = None
# NOTE(review): `model` at this point is the one loaded near the top of this
# script; it is only reloaded from the relative path further down.
# for_prediction=True is passed for the training lists only; see
# user_liked_movies_builder for what that flag filters.
user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)

# Reload the saved model, score it with the default topn of scores_at_m (10),
# then drop the reference.
model = Word2Vec.load('item2vec_word2vecSg_2021')
model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
del model

print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
print(model_score_sg1)

结果:

 

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值