"""
电影推荐系统以及词云图绘制
(Movie recommendation system and word-cloud drawing.)

字体下载网址 (font download sites):
https://ziyouziti.com/index-index-all.html
https://ziyouziti.com/index-index-list-type-1.html
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler # 标准化
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import tkinter
import jieba
import jieba.analyse
import jieba.posseg as pseg
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
"""
实现功能: 爬虫获得数据 数据预处理 分类 过滤
"""
def data_clear_movieinfo(movieinfo_data_path):
    """Clean the movie-info CSV and demo two equivalent TF-IDF pipelines.

    :param movieinfo_data_path: path to the movie-info CSV file
    :return: the cleaned DataFrame (``detailUrl`` column dropped)
    """
    data = pd.read_csv(movieinfo_data_path)
    # Drop the detail-page URL column; it carries no textual signal.
    datas = data.drop(columns=['detailUrl'])
    print(datas)
    # corpus: a tiny mock corpus used only to demonstrate the pipelines below.
    corpus = ["second third document",
              "second second document"]
    # Pipeline 1: term-count matrix first, then a TF -> TF-IDF transform.
    tfvectorizer = CountVectorizer()
    count_vector = tfvectorizer.fit_transform(corpus)  # term-frequency matrix
    transformer = TfidfTransformer()  # converts the TF matrix
    tfidf = transformer.fit_transform(count_vector)  # TF -> TF-IDF
    arr = tfidf.toarray()
    print(arr)
    # Pipeline 2: one-step TF-IDF over the whole corpus.
    # max_df / min_df prune very common / very rare terms (dimension cut);
    # both parameters are optional.
    tfidf = TfidfVectorizer(max_df=0.5, min_df=0.0003)
    tfidf.fit(corpus)  # fit the vectorizer on the corpus
    corpus_vector = tfidf.transform(corpus).toarray()
    print(corpus_vector)
    # Bug fix: previously returned None even though main() assigns the
    # result of this call; return the cleaned frame so callers can use it.
    return datas
def data_clear_moviecomments(moviecomments_data_path):
    """Clean the comments CSV, run TF-IDF, and draw a word cloud.

    :param moviecomments_data_path: path to the comments CSV file
    :return: None (writes '电影评论词云图.png' as a side effect)
    """
    # NOTE: error_bad_lines is deprecated in pandas >= 1.3 (use
    # on_bad_lines='skip' there); kept for the pandas version this
    # script was written against.
    data = pd.read_csv(moviecomments_data_path, error_bad_lines=False)
    # Treat the raw comment column as the corpus.
    corpus = list(data['comments'])
    # TF-IDF: one-step vectorizer over the raw comments.
    tfidf2 = TfidfVectorizer()
    re = tfidf2.fit_transform(corpus)
    print(re)
    # jieba keyword extraction + word cloud.
    # Bug fix: use a context manager so the file handle is closed
    # instead of being leaked by open(...).read().
    with open('./movie', encoding='utf-8') as f:
        text = f.read()
    text = jieba.cut(text, cut_all=False, HMM=True)
    text = ' '.join(text)
    # sentence: text to analyse; topK: keep the 100 highest TF-IDF terms;
    # withWeight: also return each term's weight; allowPOS=(): no POS filter.
    keywords = jieba.analyse.extract_tags(sentence=text, topK=100, withWeight=True,
                                          allowPOS=())
    print(keywords)  # e.g. [('音轨', 0.66037652357), ('经典影片', 0.625719164545)]
    img = Image.open('./sky.jpg')
    print(img.size)  # e.g. (550, 406)
    # Bug fix: Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
    # same filter under its canonical name.
    img = img.resize((1000, 1000), Image.LANCZOS)
    mask = np.array(img)
    # contour_color / contour_width draw the outline of the mask shape.
    wc = WordCloud(font_path='./CELingDHJW.TTF', background_color="white", width=300, height=300, max_words=1000,
                   mask=mask, contour_color='steelblue', contour_width=2)
    # Build a term -> weight mapping for the cloud.
    md = dict()
    for key, value in keywords:
        md[key] = value
    wc.generate_from_frequencies(md)
    wc.to_file('电影评论词云图.png')
    # Optional: recolor the cloud to match the mask image's colors.
    # image_colors = ImageColorGenerator(mask)
    # wc.recolor(color_func=image_colors)
    # wc.to_file('电影评论词云图.png')
def data_clear_comments(moviecomments_data_path):
    """Clean comments, vectorize with TF-IDF, cluster users and movies,
    filter both datasets, and show simple per-user recommendations.

    :param moviecomments_data_path: path to the comments CSV file
    :return: None (writes CSVs/PNGs and opens a tkinter window)
    """
    # error_bad_lines is deprecated in pandas >= 1.3; kept for the pandas
    # version this script targets.
    data = pd.read_csv(moviecomments_data_path, error_bad_lines=False)
    comments_list = list(data['comments'])
    username_list = list(data['userName'])
    print(comments_list)
    for index, comment in enumerate(comments_list):
        # jieba segmentation of one comment.
        text = jieba.cut(comment, cut_all=False, HMM=True)
        text = ' '.join(text)
        # Per-comment TF-IDF keywords: top 5 terms, no weights, no POS filter.
        keywords = jieba.analyse.extract_tags(sentence=text, topK=5, withWeight=False,
                                              allowPOS=())
        tf_idf_result = username_list[index].split() + keywords
        print(tf_idf_result)
    tfidf2 = TfidfVectorizer()
    re = tfidf2.fit_transform(comments_list)
    print(re)
    arr = re.toarray()
    print(arr)
    # Standardize the comment feature vectors (z-score over the whole matrix).
    new_data1 = (arr - arr.mean()) / arr.std()
    print(new_data1)
    # Cluster the comment vectors.
    # Bug fix: KMeans(n_jobs=...) was removed in scikit-learn 1.0; dropping
    # it affects only parallelism, never the clustering result.
    kmodel = KMeans(n_clusters=4, max_iter=15, random_state=0)
    kmodel.fit(new_data1)
    # Cluster label per comment.
    category = kmodel.labels_
    print(category)
    # Visualize the cluster labels.
    plt.scatter(range(len(category)), list(category))
    plt.title('聚类结果类别可视化')
    plt.savefig('./聚类结果类别可视化.png')
    plt.show()
    # Append the cluster-label column and rename it.
    final_moviecomment = pd.concat([data, pd.DataFrame(category, index=data.index)], axis=1)
    final_moviecomment.columns = list(data.columns) + ['category']
    print(final_moviecomment)
    # Export (disabled):
    # final_movieinfo.to_csv('./final_moviecomment.csv', index=False,header=True,encoding='utf-8')
    # ---- movie-info processing ----
    movieinfo_data = pd.read_csv('./maoyan_movie.csv')
    movieinfo_datas = movieinfo_data.drop(columns=['detailUrl'])
    movieinfo_datas = movieinfo_datas.iloc[1:, :]  # skip the first data row
    print(movieinfo_datas)
    tfidf2 = TfidfVectorizer()
    re = tfidf2.fit_transform(list(movieinfo_datas['movieName'].to_list()))
    print(re)
    arr = re.toarray()
    print(arr)
    # Standardize the movie-name feature vectors.
    new_data2 = (arr - arr.mean()) / arr.std()
    # Bug fix: this previously printed new_data1 (the comment matrix)
    # instead of the matrix computed just above.
    print(new_data2)
    kmodel1 = KMeans(n_clusters=4, max_iter=15, random_state=0)
    kmodel1.fit(new_data2)
    # Cluster label per movie.
    category = kmodel1.labels_
    print(category)
    # Append the cluster-label column to the movie-info frame.
    final_movieinfo_datas = pd.concat([movieinfo_datas, pd.DataFrame(category, index=movieinfo_datas.index)], axis=1)
    final_movieinfo_datas.columns = list(movieinfo_datas.columns) + ['category']
    print(final_movieinfo_datas)
    # Export (disabled):
    # final_movieinfo_datas.to_csv('./final_movieinfo_datas.csv', index=False, header=True, encoding='utf-8')
    # ---- filtering ----
    # Filter out roughly 25% of users: keep the head-13 slice and report
    # what share of the total score it covers.
    total_score_count = sum(final_moviecomment.score)
    print(float(final_moviecomment.head(n=13).score.sum() / total_score_count) * 100)
    # Users / movies that survive the filter.
    user_subset = list(final_moviecomment.head(13).userName)
    movie_subset = list(final_movieinfo_datas.head(13).movieName)
    # Drop every other user and movie.
    # NOTE(review): these CSVs are the (currently disabled) exports above —
    # they must already exist on disk for this to work.
    moviecomment_dataset = pd.read_csv('./final_moviecomment.csv')
    print(moviecomment_dataset)
    User_dataset_sub = moviecomment_dataset[moviecomment_dataset.userName.isin(user_subset)]
    print(User_dataset_sub)
    del moviecomment_dataset
    User_dataset_sub.to_csv('./User_dataset_sub.csv', index=False)
    movieinfo_dataset = pd.read_csv('./final_movieinfo_datas.csv')
    print(movieinfo_dataset)
    Movieinfo_dataset_sub = movieinfo_dataset[movieinfo_dataset.movieName.isin(movie_subset)]
    print(Movieinfo_dataset_sub)
    del movieinfo_dataset
    Movieinfo_dataset_sub.to_csv('./Movieinfo_dataset_sub.csv', index=False)
    # ---- similarity-based recommendation ----
    # Bucket movie names by cluster label.
    category00 = []
    category01 = []
    category02 = []
    category03 = []
    for index, row in final_movieinfo_datas.iterrows():
        if row['category'] == 0:
            category00.append(row['movieName'])
        elif row['category'] == 1:
            category01.append(row['movieName'])
        elif row['category'] == 2:
            category02.append(row['movieName'])
        else:
            category03.append(row['movieName'])
    new_dict0 = {}
    new_dict1 = {}
    new_dict2 = {}
    new_dict3 = {}
    category000 = []
    category001 = []
    category002 = []
    category003 = []
    for index, row in final_moviecomment.iterrows():
        if row['category'] == 0:
            # Recommend 3 random movies from the user's cluster.
            new_dict0[row['userName']] = random.sample(category00, 3)
            category000.append(new_dict0)
        elif row['category'] == 1:
            new_dict1[row['userName']] = random.sample(category01, 3)
            category001.append(new_dict1)
        else:
            # Clusters 2 and 3 both land here: pad category03 to at least
            # 3 entries by repeating its own items (rewritten from a
            # mutate-while-iterating list comprehension), then add one
            # random fallback title.
            li = ['黑客帝国3:矩阵革命', '黑客帝国', 'V字仇杀队', '本杰明·巴顿奇事']
            i = 0
            while category03 and len(category03) < 3:
                category03.append(category03[i])
                i += 1
            category03 = category03 + random.choice(li).split()
            new_dict3[row['userName']] = random.sample(category03, 3)
            category003.append(new_dict3)
    # NOTE(review): raises IndexError if any of these lists is empty,
    # i.e. if some cluster received no users.
    category_dic = {**category000[0], **category001[0], **category003[0]}
    df = pd.DataFrame(category_dic).T
    df_head = df.head()
    df_tail = df.tail(8)
    new_df = pd.concat([df_head, df_tail], axis=0)
    new_df.columns = ['movie_1', 'movie_2', 'movie_3']
    # Show the recommendation table in a simple tkinter window.
    root = tkinter.Tk()
    label = tkinter.Label(root, text=new_df)  # create the label
    label.pack()  # add the label to the main window
    root.title('Movie recommendation system')
    root.geometry('550x500')
    root.mainloop()
def main():
    """Entry point: build the comment word cloud, then run the
    TF-IDF / clustering / recommendation pipeline.

    :return: None
    """
    # Movie-detail cleaning (disabled):
    # movieinfo_data_path = './maoyan_movie.csv'
    # movieinfo_data = data_clear_movieinfo(movieinfo_data_path)
    comments_path = './maoyan_comments.csv'
    # Comment cleaning + word-cloud generation.
    data_clear_moviecomments(comments_path)
    # Comment cleaning + TF-IDF application and clustering.
    data_clear_comments(comments_path)


if __name__ == '__main__':
    main()