Kaggle 经典：电影推荐算法

yun呐

已于 2024-03-04 16:31:46 修改

阅读量378

点赞数 1

文章标签：推荐算法 python numpy

于 2024-03-04 16:30:55 首次发布

本文链接：https://blog.csdn.net/qq_42645279/article/details/136455918

版权

文章介绍了如何使用TF-IDF向量和余弦相似度构建一个电影推荐系统，包括数据预处理、计算相似度矩阵以及使用Surprise库进行协同过滤。还展示了如何结合导演、演员和类型等元数据进行个性化推荐。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

下面的代码整理自 Kaggle 上的经典电影推荐方案，其中有许多做法值得学习。

import pandas as pd
import numpy as np
from pandas.io import json
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Source notebook (Kaggle):
# https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system
# Load the credits table (df1) and the movies table (df2) from local CSV files.
df1 = pd.read_csv('D:\\python\\dataset\\电影数据分析\\archive\\tmdb_5000_credits.csv')
df2 = pd.read_csv('D:\\python\\dataset\\电影数据分析\\archive\\tmdb_5000_movies.csv')

# Rename the credits columns so its key is called 'id' and can be joined on.
# NOTE(review): the 'tittle' spelling looks load-bearing -- later code reads
# df2['title'] after the merge, so presumably this keeps the credits title from
# colliding with the movies 'title' column (which would produce
# title_x / title_y). Confirm before "fixing" the typo.
df1.columns = ['id', 'tittle', 'cast', 'crew']

# Join on 'id' (pandas' default join type), storing the result back in df2.
df2 = pd.merge(df2, df1, on='id')

# print(df2.head(10))
#
# C= df2['vote_average'].mean()
# print(C)
#
# m= df2['vote_count'].quantile(0.9)
# print(m)
#
# q_movies = df2.copy().loc[df2['vote_count'] >= m]
# print(q_movies.shape)
#
# def weighted_rating(x, m=m, C=C):
#     v = x['vote_count']
#     R = x['vote_average']
#     # Calculation based on the IMDB formula
#     return (v/(v+m) * R) + (m/(m+v) * C)
#
# q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
# q_movies = q_movies.sort_values(by=['score'],ascending=False)
# good = q_movies.head(10)['title']
# print(good.tolist())
# print(good)

# pop = df2.sort_values('popularity', ascending=False)
# plt.figure(figsize=(12, 4))
#
# plt.barh(pop['title'].head(6), pop['popularity'].head(6), align='center',
#          color='skyblue')
# # Invert the y-axis
# plt.gca().invert_yaxis()
# plt.xlabel("Popularity")
# plt.title("Popular Movies")
# plt.show()

# Peek at the first few plot overviews before any cleaning.
ov = df2['overview'].head(5)
print(ov)

# Define a TF-IDF vectorizer that removes English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string so the vectorizer only sees text.
df2['overview'] = df2['overview'].fillna('')

# Construct the TF-IDF matrix by fitting and transforming the overviews.
tfidf_matrix = tfidf.fit_transform(df2['overview'])

# Output the shape of tfidf_matrix.
print(tfidf_matrix.shape)

# Pairwise dot products of the TF-IDF rows. TfidfVectorizer L2-normalises rows
# by default, so the linear kernel equals the cosine similarity here.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from movie title to row number.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first row for each title, so that indices[title]
# always yields a single integer.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]


# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Movie title to look up in the module-level ``indices`` map.
        cosine_sim: Pairwise similarity matrix; row ``i`` is assumed to hold
            the similarities of the movie at position ``i`` in ``df2``.
        top_n: Number of recommendations to return (10 by default, matching
            the previous fixed behaviour).

    Returns:
        A slice of ``df2['title']`` with up to ``top_n`` titles, most similar
        first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series
    # instead of a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity score. The query movie is
    # excluded explicitly rather than by dropping the first sorted entry,
    # which is only correct when the query happens to sort first (not the
    # case for ties or an all-zero feature vector).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the movie indices of the top_n most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return df2['title'].iloc[movie_indices]


# Sample overview-based recommendations (default similarity matrix).
ts = get_recommendations('The Dark Knight Rises')
ta = get_recommendations('The Avengers')

# These columns are read as strings; literal_eval turns each cell into the
# Python object it spells out (later code iterates them as lists of dicts).
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    parsed_column = df2[feature].map(literal_eval)
    df2[feature] = parsed_column


# Extract the director's name from a crew list; NaN when no director is listed.
def get_director(x):
    """Return the ``'name'`` of the first crew entry whose ``'job'`` is 'Director'.

    Args:
        x: Iterable of crew dicts, each carrying ``'job'`` and ``'name'`` keys.

    Returns:
        The director's name, or ``np.nan`` if no entry has the 'Director' job.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)


# Returns the first three names, or the entire list when it has three or fewer.
def get_list(x):
    """Return up to the first three ``'name'`` values from a list of dicts.

    Args:
        x: A list of dicts that each carry a ``'name'`` key; any non-list
            value is treated as missing/malformed data.

    Returns:
        A list with at most three names, or an empty list when ``x`` is not
        a list.
    """
    # Missing/malformed data: nothing to extract
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep only the leading three
    all_names = [entry['name'] for entry in x]
    return all_names[:3]


# Derive a 'director' column from the crew data.
df2['director'] = df2['crew'].map(get_director)

# Reduce cast, keywords and genres to short lists of names.
features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].map(get_list)

# Show the new features for the first 3 films.
preview = df2[['title', 'cast', 'director', 'keywords', 'genres']]
h3 = preview.head(3)
print(h3)


# Normalise names: lower-case them and remove spaces.
def clean_data(x):
    """Lower-case ``x`` and strip its spaces.

    Args:
        x: A list of strings, a single string, or anything else (e.g. NaN
            for a missing director).

    Returns:
        A list of cleaned strings for list input, a cleaned string for string
        input, and an empty string for any other value.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]

    # Scalar case: only real strings are cleaned; a missing value becomes ''
    if isinstance(x, str):
        return x.replace(" ", "").lower()

    return ''


# Run clean_data over every metadata column that feeds the "soup" below.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    cleaned_column = df2[feature].map(clean_data)
    df2[feature] = cleaned_column


def create_soup(x):
    """Build one space-separated string from a row's metadata.

    Args:
        x: Mapping (e.g. a DataFrame row) with list-of-string values under
            ``'keywords'``, ``'cast'`` and ``'genres'`` and a string under
            ``'director'``.

    Returns:
        Keywords, cast, director and genres, in that order, joined by single
        spaces. Empty fields still contribute their separator.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)


# Build the metadata "soup" string for every movie.
df2['soup'] = df2.apply(create_soup, axis=1)

# Plain term counts (not TF-IDF) over the soup.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])
print(count_matrix)

# Compute the cosine similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of our main DataFrame and construct reverse mapping as before
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['title'])
# Keep only the first row for each title. Without this, a title shared by
# several movies makes indices[title] return a Series rather than one row
# number, which breaks the row lookup in get_recommendations.
indices = indices[~indices.index.duplicated(keep='first')]

# Metadata-based recommendations using the second similarity matrix.
re2 = get_recommendations('The Dark Knight Rises', cosine_sim2)
print(re2)
re3 = get_recommendations('The Godfather', cosine_sim2)
print(re3)

# --- Collaborative filtering with Surprise's SVD ---
# Load the user ratings table.
ratings = pd.read_csv('D:\\python\\dataset\\电影数据分析\\archive\\ratings_small.csv')
# Leftover from the notebook: the result is discarded when run as a script.
ratings.head()

# NOTE(review): Reader() is built with its default rating scale; confirm that
# it matches the range of the 'rating' column in this dataset.
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
# Newer Surprise versions deprecated `evaluate` (and the old
# `data.split(n_folds=5)` step); cross_validate is used instead.
cv = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print(cv)

# Refit on the full dataset before making individual predictions.
trainSet = data.build_full_trainset()
svd.fit(trainSet)

# Show every rating given by user 1.
userId1 = ratings.loc[ratings['userId'] == 1]
print(userId1)

# Predict user 1's rating for item 302; r_ui=3 is passed as the true rating.
result = svd.predict(uid=1, iid=302, r_ui=3)
print(result)