Item2vec

最新推荐文章于 2024-01-14 10:25:40 发布

weixin_54096215

最新推荐文章于 2024-01-14 10:25:40 发布

阅读量355

点赞数 1

分类专栏： Embedding 文章标签： python

本文链接：https://blog.csdn.net/weixin_54096215/article/details/120778935

版权

Embedding 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

数据结构：

数据集：在参考链接里面，因为预训练时间太长，所以数据集换成ml-latest-small，取出里面的ratings.csv和movies.csv。

链接：

https://github.com/rexrex9/kb4recMovielensDataProcess

注：每个文件夹下的orginal下面是原始数据。

代码：

index_2.py

# 参考链接： https://blog.csdn.net/fuzi2012/article/details/91345164
import  pandas as pd
import  numpy as np
# Load the two MovieLens tables used throughout the script.
df_movies = pd.read_csv('../data2/movies.csv')
df_ratings = pd.read_csv('../data2/ratings.csv')

# Lookup tables in both directions: movie id -> title and title -> movie id.
Id_title = pd.Series(df_movies.title.values, index=df_movies.movieId.values).to_dict()
Title_id = pd.Series(df_movies.movieId.values, index=df_movies.title).to_dict()
# print(type(Title_id))
# print(Title_id)

# index = ['Bob', 'Steve', 'Jeff', 'Ryan', 'Jeff', 'Ryan']
# obj = pd.Series([4, 7, -5, 3, 7, np.nan],index = index)
# print(obj)

# Draw five distinct random row positions from each table; the print that
# would display the sampled rows is left commented out below.
for df in (df_movies, df_ratings):
    rand_idx = np.random.choice(len(df), 5, replace=False)
# print(df.iloc[rand_idx,:])

import matplotlib.pyplot as plt
# import plotly.plotly as py

# Histogram of every rating value in the ratings table.
plt.figure(figsize=(8, 6))
ax = plt.subplot(111)
ax.set_title("Distribution of Movie Ratings", fontsize=16)

# Hide the top and right borders of the plot frame.
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# `ax` is the current axes, so the Axes methods below act on the same plot
# the pyplot-level helpers would.
ax.set_xlabel("Movie Rating", fontsize=14)
ax.set_ylabel("Count", fontsize=14)

ax.hist(df_ratings['rating'], color="#3F5D7D")

# plt.show()

# Split the ratings into a training set and a test set.
from sklearn.model_selection import train_test_split

# Hold out 30% of the ratings for testing. The split is stratified on userId
# and uses a fixed seed so that it is reproducible.
df_ratings_train, df_ratings_test = train_test_split(
    df_ratings,
    test_size=0.30,
    random_state=15688,
    stratify=df_ratings['userId'],
)

# print("Number of training data: "+str(len(df_ratings_train)))
# print("Number of test data: "+str(len(df_ratings_test)))
# Ratings of 4 or higher are labelled 1 (liked); anything lower is labelled 0
def rating_splitter(df):
    """Split every user's rated movies into a liked list and a not-liked list.

    A rating of 4 or higher counts as liked. The frame is modified in place:
    a 0/1 ``liked`` column is added and ``movieId`` is converted to strings.

    Args:
        df: ratings frame with ``userId``, ``movieId`` and ``rating`` columns.

    Returns:
        A list of lists of movie-id strings, one inner list per
        ``(liked, userId)`` group, in the order the groupby yields them.
    """
    is_liked = df['rating'] >= 4
    df['liked'] = np.where(is_liked, 1, 0)
    df['movieId'] = df['movieId'].astype('str')

    movie_lists = []
    for _, user_rows in df.groupby(['liked', 'userId']):
        movie_lists.append(user_rows['movieId'].tolist())
    return movie_lists
# Turn off pandas' chained-assignment warning before rating_splitter assigns
# new columns onto the training split.
pd.options.mode.chained_assignment = None
# One list of movie-id strings per (liked, userId) group.
splitted_movies =rating_splitter(df_ratings_train)
print(splitted_movies)

# Prepare the movie lists for word2vec training.
import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
import gensim
# NOTE(review): presumably verifies that gensim's compiled (fast) training
# routines are available — confirm against the gensim documentation.
assert gensim.models.word2vec.FAST_VERSION>-1
import random
# Shuffle each movie list in place.
for movie_list in splitted_movies:
    random.shuffle(movie_list)

# Feed the movie lists to the model and train it.
from gensim.models import Word2Vec
import datetime
# Start of the timing window reported after training.
start = datetime.datetime.now()
# This first model would need a different variable name, otherwise an error
# occurs, so it is commented out.
# model = Word2Vec(sentences = splitted_movies, # We will supply the pre-processed list of moive lists to this parameter
#                  iter = 5, # epoch
#                  min_count = 10, # a movie has to appear more than 10 times to be keeped
#                  # size = 200, # size of the hidden layer
#                  workers = 4, # specify the number of threads to be used for training
#                  sg = 1, # Defines the training algorithm. We will use skip-gram so 1 is chosen.
#                  # hs = 0, # Set to 0, as we are applying negative sampling.
#                  # negative = 5, # If > 0, negative sampling will be used. We will use a value of 5.
#                  window = 20)
#
# print("Time passed: " + str(datetime.datetime.now()-start))
# Word2Vec.save('item2vec_2021.h5')

# Skip-gram model with negative sampling, trained on the shuffled movie lists.
# NOTE(review): `iter` and `size` look like gensim 3.x keyword names (later
# releases appear to use `epochs` / `vector_size`) — confirm against the
# installed gensim version.
model_w2v_sg = Word2Vec(sentences = splitted_movies,
                        iter = 10, # number of training epochs
                        min_count = 5, # movies seen fewer than 5 times are dropped
                        size = 300, # size of the hidden layer (embedding dimension)
                        workers = 4, # specify the number of threads to be used for training
                        sg = 1, # 1 selects the skip-gram training algorithm
                        hs = 0, # 0 disables hierarchical softmax
                        negative = 5, # number of negative samples per positive example
                        window = 20) # context window size

print("Time passed: " + str(datetime.datetime.now()-start))
# Persist the trained model to the current working directory.
model_w2v_sg.save('item2vec_word2vecSg_2021')
# del model_w2v_sg

# Load the saved model back from disk.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import Word2Vec
model = Word2Vec.load('item2vec_word2vecSg_2021')
# word_vectors = model.wv
# Print every vocabulary entry together with its vocabulary record.
# NOTE(review): `wv.vocab` looks like the gensim 3.x vocabulary mapping —
# confirm against the installed gensim version.
for key in model.wv.vocab:
    print(key) # the vocabulary key (a movie id string)
    print(model.wv.vocab[key])

model_2.py

from index_2 import Title_id,model_w2v_sg,df_movies,df_ratings_train, df_ratings_test
import requests
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
# Load the trained item2vec model from an absolute Windows path. The path is
# written as a raw string because sequences such as "\E", "\i" and "\c" are
# invalid escape sequences in a normal string literal; the resulting path
# text is unchanged.
model = Word2Vec.load(r'E:\推荐系统\Embedding\item2vec\code\item2vec_word2vecSg_2021')
# Re-read the raw tables; this rebinds the df_movies name imported from index_2.
df_movies=pd.read_csv('../data2/movies.csv')
df_ratings=pd.read_csv('../data2/ratings.csv')

def refine_search(search_term):
    """
    Refine the movie name to be recognized by the recommender.

    The term is submitted to IMDb's title search and the matching result cell
    is reformatted as "<title> <trailing text>". A leading "The" is moved to
    the end of the title ("Matrix, The ..."), presumably to match how titles
    are written in the dataset.

    Args:
        search_term (string): Search Term

    Returns:
        refined_term (string): a name that can be searched in the dataset.
        ``search_term`` is returned unchanged when the result page contains
        no usable hit.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    target_url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + "+".join(search_term.split()) + "&s=tt"
    # Without a timeout a stalled connection would block forever.
    html = requests.get(target_url, timeout=10).content
    parsed_html = BeautifulSoup(html, 'html.parser')

    # Fall back to the caller's own term so the name is always bound, even
    # when no result cell matches the pattern below.
    refined_name = search_term
    for tag in parsed_html.find_all('td', class_="result_text"):
        search_result = re.findall('fn_tt_tt_1">(.*)</a>(.*)</td>', str(tag))
        if not search_result:
            continue
        title, trailing_text = search_result[0]
        words = title.split()
        # `words` may be empty if the link text is blank; guard before indexing.
        if words and words[0] == "The":
            title = " ".join(words[1:]) + ", " + words[0]
        refined_name = title + " " + trailing_text.strip()
    return refined_name


def produce_list_of_movieId(list_of_movieName, useRefineSearch=False):
    """
    Turn a list of movie names into a list of movie ids.

    Names must match the dataset titles exactly unless useRefineSearch is
    True, in which case each name is first passed through refine_search. If
    refine_search fails for any name, the whole list is looked up again with
    the exact names and that result is returned.

    Args:
        list_of_movieName (List): A list of movie names.
        useRefineSearch (boolean): Ambiguous movie names can be supplied if useRefineSearch is set to True

    Returns:
        list_of_movie_id (List of strings): A list of movie ids. Names that
        are not found in Title_id are skipped.
    """
    list_of_movie_id = []

    for movieName in list_of_movieName:
        if useRefineSearch:
            try:
                movieName = refine_search(movieName)
            except Exception as err:
                # Boundary around a network scraper: any failure there
                # (HTTP error, unexpected page layout, ...) falls back to the
                # exact-name lookup. The fallback's result is returned rather
                # than discarded, and because it runs with
                # useRefineSearch=False it cannot recurse again.
                print("Refine search failed (" + str(err) + "); falling back to exact names")
                return produce_list_of_movieId(list_of_movieName, useRefineSearch=False)
            print("Refined Name: " + movieName)
        if movieName in Title_id:
            list_of_movie_id.append(str(Title_id[movieName]))
    return list_of_movie_id


def recommender(positive_list=None, negative_list=None, useRefineSearch=False, topn=20):
    """
    Recommend movies similar to the liked ones and unlike the disliked ones.

    Args:
        positive_list (List): names of movies the user likes.
        negative_list (List): names of movies the user dislikes.
        useRefineSearch (boolean): passed through to produce_list_of_movieId.
        topn (int): number of recommendations requested from the model.

    Returns:
        recommend_movie_ls (List of strings): recommended movie ids, in the
        order the model returns them. Empty when none of the positive names
        resolves to a movie id known to the model.
    """
    keyed_vectors = model_w2v_sg.wv

    # Resolve names to ids and keep only ids the model holds a vector for;
    # an unknown id would otherwise make the similarity query raise.
    if positive_list:
        positive_list = [movie_id
                         for movie_id in produce_list_of_movieId(positive_list, useRefineSearch)
                         if movie_id in keyed_vectors]
    if negative_list:
        negative_list = [movie_id
                         for movie_id in produce_list_of_movieId(negative_list, useRefineSearch)
                         if movie_id in keyed_vectors]

    # The similarity query cannot run without at least one positive example.
    if not positive_list:
        return []

    similar = keyed_vectors.most_similar_cosmul(positive=positive_list, negative=negative_list, topn=topn)
    return [movieId for movieId, _score in similar]

ls = recommender(positive_list=["Sabrina (1995)"], useRefineSearch=False, topn=5)
# print('Recommendation Result based on "Up (2009)":')
# The recommended ids are strings, while the movieId column read from
# movies.csv is presumably numeric; isin() never matches '1' against 1, so
# compare on the string form of the column.
print(df_movies[df_movies['movieId'].astype(str).isin(ls)])
#评估模型
def user_liked_movies_builder(model, df, for_prediction=False):
    """
    Map each user id to the list of movie ids that user liked.

    A rating of 4 or higher counts as liked. ``df`` is modified in place: a
    0/1 ``liked`` column is added and ``movieId`` is converted to strings.

    Args:
        model: trained model; only ``model.wv.vocab`` is read, and only when
            for_prediction is True.
        df: ratings frame with ``userId``, ``movieId`` and ``rating`` columns.
        for_prediction (boolean): when True, liked movies that are missing
            from the model vocabulary are dropped as well.

    Returns:
        user_liked_movies (dict): user id -> list of liked movie-id strings.
        Users with no remaining movie are absent from the dict.
    """
    df['liked'] = np.where(df['rating'] >= 4, 1, 0)
    df['movieId'] = df['movieId'].astype('str')
    df_liked = df[df['liked'] == 1]
    if for_prediction:
        # Narrow the liked rows further. Filtering `df` here instead would
        # throw the liked filter away and let disliked movies through.
        df_liked = df_liked[df_liked['movieId'].isin(model.wv.vocab.keys())]

    user_liked_movies = df_liked.groupby('userId').agg({'movieId': lambda x: x.tolist()})['movieId'].to_dict()

    return user_liked_movies


def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
    """
    Compute precision, recall and F1 of the top-``topn`` recommendations.

    For every user present in both dicts, the model is queried with the
    user's training movies and the returned ids are compared with the movies
    that user liked in the test set.

    Args:
        model: object exposing ``wv.most_similar_cosmul(positive=..., topn=...)``.
        user_liked_movies_test (dict): user id -> liked movie ids (test set).
        user_liked_movies_training (dict): user id -> movie ids used as the query.
        topn (int): number of recommendations requested per user.

    Returns:
        list: ``[precision_at_m, recall_at_m, f1]``. A metric whose
        denominator would be zero (no common users, no liked test movies, no
        hits) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    common_users = user_liked_movies_test.keys() & user_liked_movies_training.keys()

    sum_liked = 0
    sum_correct = 0
    for userid in common_users:
        current_test_set = set(user_liked_movies_test[userid])
        recommended = model.wv.most_similar_cosmul(positive=user_liked_movies_training[userid], topn=topn)
        pred = {movie_id for movie_id, _score in recommended}
        sum_correct += len(pred & current_test_set)
        sum_liked += len(current_test_set)

    total_recommended = topn * len(common_users)
    precision_at_m = sum_correct / total_recommended if total_recommended else 0.0
    recall_at_m = sum_correct / sum_liked if sum_liked else 0.0
    # The harmonic mean is undefined when either component is zero.
    if precision_at_m and recall_at_m:
        f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
    else:
        f1 = 0.0
    return [precision_at_m, recall_at_m, f1]

# Turn off pandas' chained-assignment warning; user_liked_movies_builder
# assigns new columns onto the train/test splits in place.
pd.options.mode.chained_assignment = None
# for_prediction=True keeps only movies present in the model's vocabulary.
user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)

# Reload the model from the working-directory copy before scoring, then
# drop the reference once the scores are computed.
model = Word2Vec.load('item2vec_word2vecSg_2021')
model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
del model

print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
print(model_score_sg1)

结果：

weixin_54096215

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
Item2vec

数据结构：数据集：在参考链接里面，因为预训练时间太长，所以数据集换成ml-latest-small，取出里面的rating.csv和movie.csv。链接：https://github.com/rexrex9/kb4recMovielensDataProcess注：每个文件夹下的orginal下面是原始数据。代码：index_2.py# 参考链接： https://blog.csdn.net/fuzi2012/article/details/91345164i...
复制链接

扫一扫