A content-based movie recommendation system: tf-idf and doc2vec text-similarity recommendation, a hot rating chart, and CF model recommendation

This post walks through a content-based movie recommendation system built on the MovieLens 10M dataset. It covers: 1) computing a hot-movie ranking chart; 2) tf-idf similarity recommendation with CountVectorizer and TfidfVectorizer; 3) text-based movie recommendation with doc2vec. Text is tokenized and wrapped in the TaggedDocument format to produce personalized recommendations.

The data used is MovieLens (the 10M release, roughly 10 million user ratings):
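Both .dat files use :: as the field separator. For reference, ratings.dat rows have the layout UserID::MovieID::Rating::Timestamp, e.g. (sample values for illustration):

1::122::5::838985046
1::185::5::838983525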

1. Hot movie ranking chart

import pandas as pd

column_names = ['user_id', 'item_id', 'rating', 'timestamp']
links = pd.read_csv('/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/ratings.dat', sep="::", names=column_names, engine="python")  # engine="python" because "::" is a multi-character separator


column_names1 = ['item_id', 'title', 'movietype']
movies = pd.read_csv('/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/movies.dat', sep="::", names=column_names1, engine="python")
movies.head(5)

df = pd.merge(links,movies, on="item_id")
df.head(5)


df = df.drop(columns=['timestamp'])
df = df.dropna()  # dropna() returns a copy, so reassign for the cleaning to take effect
df.shape


# # genres and their count

genre_labels = set()
for gen in df['movietype'].str.split('|').values:
    genre_labels = genre_labels.union(set(gen))
    
for x in genre_labels:
    print(x, len(df[df['movietype'].str.contains(x)].index))
    


# # top movies

top = df.groupby(['title'])['rating'].mean().sort_values(ascending=False)[:20]  # top 20 movies based on ratings


df.groupby(['title'])['rating'].mean()
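
Ranking by the raw mean lets titles with only a handful of ratings dominate the top of the chart. A common fix is to require a minimum rating count before sorting; a minimal sketch (the 1000-rating threshold is an arbitrary choice):

stats = df.groupby('title')['rating'].agg(['mean', 'count'])
top_filtered = stats[stats['count'] >= 1000].sort_values('mean', ascending=False)[:20]
print(top_filtered)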


2. TF-IDF similarity recommendation

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

movie_types = set()

kk = []
with open('/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/movies.dat') as a:
    for i in a:
        movieid, title, movietype = i.strip().split('::')
        mm = ''
        for j in movietype.split('|'):
            movie_types.add(j)
            mm += j + " "  # genres joined by spaces so the vectorizers can tokenize them
        kk.append({"movieid": movieid, "title": title, "movietype": mm})


movies = pd.DataFrame(kk)
def combine(x):
    return x['title'] + " " + x['movietype']
movies['Combined_Data'] = movies.apply(lambda x: combine(x),axis=1)


CountVectorizer and TfidfVectorizer are two common ways to turn text into numeric features. For each document, CountVectorizer only counts how often each term occurs in that document, while TfidfVectorizer additionally weights that in-document frequency by the inverse of the number of documents containing the term (IDF), so terms that appear everywhere are down-weighted.
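
A toy comparison of the two (the three-document corpus is made up for illustration):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = ["Toy Story Animation Comedy",
          "Heat Action Crime Thriller",
          "Toy Soldiers Action Drama"]
print(CountVectorizer().fit_transform(corpus).toarray())  # raw per-document term counts
print(TfidfVectorizer().fit_transform(corpus).toarray())  # counts reweighted by IDF

Here 'toy' and 'action' each appear in two of the three documents, so TfidfVectorizer weights them lower than terms unique to a single movie.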

# Using TF-IDF
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(movies["Combined_Data"])
cosine_sim_tf = cosine_similarity(tfidf_matrix)

user_movie = input('Pls enter your movie on which you want similar movies: ')
user_index = movies[movies.title == user_movie].index.values[0]  # raises IndexError if the title is not an exact match
similar_movies = list(enumerate(cosine_sim_tf[user_index]))
sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:]  # [1:] skips the movie itself

print("\nTop 10 similar movies to " + user_movie + " are:\n")
for i in range(10):
    print(movies['title'].iloc[sorted_similar_movies[i][0]])
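
Incidentally, linear_kernel is imported above but never used. TfidfVectorizer L2-normalizes its rows by default, so the plain dot product already equals cosine similarity, and linear_kernel is a cheaper drop-in here:

cosine_sim_tf = linear_kernel(tfidf_matrix)  # same values as cosine_similarity(tfidf_matrix) for L2-normalized tf-idf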


# Using CountVectorizer (raw term counts instead of tf-idf weights)
cv = CountVectorizer()
count_matrix = cv.fit_transform(movies["Combined_Data"])
cosine_sim = cosine_similarity(count_matrix)

user_movie = input('Pls enter your movie on which you want similar movies: ')
user_index = movies[movies.title == user_movie].index.values[0]
similar_movies = list(enumerate(cosine_sim[user_index]))
sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:]

print("\nTop 10 similar movies to " + user_movie + " are:\n")
for i in range(10):
    print(movies['title'].iloc[sorted_similar_movies[i][0]])


3. doc2vec text-based movie recommendation
Note the format that TaggedDocument expects for the tokenized text.
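
For reference, each training sample is a TaggedDocument: words holds the token list and tags identifies the document (this script uses the line number). A hypothetical example:

from gensim.models.doc2vec import TaggedDocument
# a simplified movies.dat line '1::Toy Story (1995)::Animation|Comedy' might become:
TaggedDocument(words=['Toy', 'Story', '1995', 'Animation', 'Comedy'], tags=[0])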
 

# coding:utf-8
import jieba
import gensim
from gensim.models.doc2vec import Doc2Vec
import pandas as pd
TaggededDocument = gensim.models.doc2vec.TaggedDocument


def get_datasest():
    x_train = []
    df_title = []
    with open('/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/movies.dat') as a:
        for num, i in enumerate(a):
            movieid, title, movietype = i.strip().split('::')
            df_title.append(title)
            mm = ''
            for j in movietype.split('|'):
                mm += j + " "
            text = (title + " " + mm).replace("(", "").replace(")", "")
            # joining jieba's tokens with ' ' and re-splitting leaves empty
            # tokens in the list, which Doc2Vec tolerates
            word_list = ' '.join(jieba.cut(text)).split(' ')
            word_list[-1] = word_list[-1].strip()
            document = TaggedDocument(word_list, tags=[num])
            x_train.append(document)
    return x_train, pd.DataFrame({'Title': df_title})
        
def train(x_train, size=100, epoch_num=1):  ## size is the dimensionality of the trained document vectors; tune it as needed (epoch_num is unused, epochs is hard-coded below)

    # passing the corpus to the constructor already runs an initial training pass;
    # the explicit train() call retrains for 70 epochs
    model_dm = Doc2Vec(x_train, min_count=1, window=5, vector_size=size, sample=1e-3, negative=5, workers=4)  # vector_size was called size in gensim < 4
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)
    model_dm.save('model_dm_wangyi1')  ## where the model is saved

    return model_dm

def test():
    model_dm = Doc2Vec.load("model_dm_wangyi1")
    # a tokenized query in the same format as the training data (empty tokens included);
    # it appears to correspond to "Divide and Conquer (Why We Fight, 3) (1943)" Documentary|War
    test_text = ['Divide', '', '', 'and', '', '', 'Conquer', '', '', 'Why', '', '', 'We', '', '', 'Fight', ',', '', '', '3', '', '', '1943', '', '', 'Documentary', '', '', 'War']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    sims = model_dm.dv.most_similar([inferred_vector_dm], topn=10)  # model_dm.docvecs in gensim < 4

    return sims

if __name__ == '__main__':
    x_train, df1 = get_datasest()
    model_dm = train(x_train)
    sims = test()
    for count, sim in sims:
        print(df1.loc[int(count), "Title"].strip(), sim)

4. CF model: ALS recommendation
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pandas as pd

import os
os.environ["PYSPARK_PYTHON"]="/Users/lonng/opt/anaconda3/python.app/Contents/MacOS/python"

spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()


column_names = ['user_id', 'item_id', 'rating', 'timestamp']
links = pd.read_csv('/Users/lonng/Desktop/推荐学习/movie_rec/ml-10M100K/ratings.dat', sep="::", names=column_names, engine="python")
links.head(5)
ratings = spark.createDataFrame(links.iloc[:100000, :])  # only the first 100k ratings, to keep the demo fast
(training, test) = ratings.randomSplit([0.8, 0.2])
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id", ratingCol="rating",
          coldStartStrategy="drop")  # drop NaN predictions for users/items unseen in the training split
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

############ Two options: predict directly for a user subset, or precompute recommendations for all users and then filter; the latter is faster
user_subset = ratings.where(ratings.user_id == 10)
user_subset_recs = model.recommendForUserSubset(user_subset, 10)
user_subset_recs.select("recommendations.item_id", "recommendations.rating").first()

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

userRecs.where(userRecs.user_id == 10).select("recommendations.item_id", "recommendations.rating").collect()
#################### the same trade-off on the item side: direct subset prediction vs. precomputing for all items; again the latter is faster
item_subset = ratings.where(ratings.item_id == 2)
item_subset_recs = model.recommendForItemSubset(item_subset, 3)
item_subset_recs.select("recommendations.user_id", "recommendations.rating").first()

# Generate top 3 user recommendations for each movie
item_recs = model.recommendForAllItems(3)
item_recs.where(item_recs.item_id == 2)\
        .select("recommendations.user_id", "recommendations.rating").collect()

 
