"""
电影推荐系统以及词云图绘制
(Movie recommendation system and word-cloud drawing.)

字体下载网址 (font download sites):
https://ziyouziti.com/index-index-all.html
https://ziyouziti.com/index-index-list-type-1.html
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler # 标准化
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import tkinter
import jieba
import jieba.analyse
import jieba.posseg as pseg
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
"""
实现功能: 爬虫获得数据 数据预处理 分类 过滤
"""
def data_clear_movieinfo(movieinfo_data_path):
    """Clean the movie-info CSV and demo two equivalent TF-IDF pipelines.

    :param movieinfo_data_path: path to the movie-info CSV file
    :return: the cleaned DataFrame (``detailUrl`` column dropped)
    """
    data = pd.read_csv(movieinfo_data_path)
    # Drop the detail-page URL column; it carries no textual signal.
    datas = data.drop(columns=['detailUrl'])
    print(datas)
    # corpus: a tiny mock corpus used only to demonstrate the pipelines below.
    corpus = ["second third document",
              "second second document"]
    # Pipeline 1: term-count matrix first, then a TF -> TF-IDF transform.
    tfvectorizer = CountVectorizer()
    count_vector = tfvectorizer.fit_transform(corpus)  # term-frequency matrix
    transformer = TfidfTransformer()  # converts the TF matrix
    tfidf = transformer.fit_transform(count_vector)  # TF -> TF-IDF
    arr = tfidf.toarray()
    print(arr)
    # Pipeline 2: one-step TF-IDF over the whole corpus.
    # max_df / min_df prune very common / very rare terms (dimension cut);
    # both parameters are optional.
    tfidf = TfidfVectorizer(max_df=0.5, min_df=0.0003)
    tfidf.fit(corpus)  # fit the vectorizer on the corpus
    corpus_vector = tfidf.transform(corpus).toarray()
    print(corpus_vector)
    # Bug fix: previously returned None even though main() assigns the
    # result of this call; return the cleaned frame so callers can use it.
    return datas
def data_clear_moviecomments(moviecomments_data_path):
    """Clean the comments CSV, run TF-IDF, and draw a word cloud.

    :param moviecomments_data_path: path to the comments CSV file
    :return: None (writes '电影评论词云图.png' as a side effect)
    """
    # NOTE: error_bad_lines is deprecated in pandas >= 1.3 (use
    # on_bad_lines='skip' there); kept for the pandas version this
    # script was written against.
    data = pd.read_csv(moviecomments_data_path, error_bad_lines=False)
    # Treat the raw comment column as the corpus.
    corpus = list(data['comments'])
    # TF-IDF: one-step vectorizer over the raw comments.
    tfidf2 = TfidfVectorizer()
    re = tfidf2.fit_transform(corpus)
    print(re)
    # jieba keyword extraction + word cloud.
    # Bug fix: use a context manager so the file handle is closed
    # instead of being leaked by open(...).read().
    with open('./movie', encoding='utf-8') as f:
        text = f.read()
    text = jieba.cut(text, cut_all=False, HMM=True)
    text = ' '.join(text)
    # sentence: text to analyse; topK: keep the 100 highest TF-IDF terms;
    # withWeight: also return each term's weight; allowPOS=(): no POS filter.
    keywords = jieba.analyse.extract_tags(sentence=text, topK=100, withWeight=True,
                                          allowPOS=())
    print(keywords)  # e.g. [('音轨', 0.66037652357), ('经典影片', 0.625719164545)]
    img = Image.open('./sky.jpg')
    print(img.size)  # e.g. (550, 406)
    # Bug fix: Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
    # same filter under its canonical name.
    img = img.resize((1000, 1000), Image.LANCZOS)
    mask = np.array(img)
    # contour_color / contour_width draw the outline of the mask shape.
    wc = WordCloud(font_path='./CELingDHJW.TTF', background_color="white", width=300, height=300, max_words=1000,
                   mask=mask, contour_color='steelblue', contour_width=2)
    # Build a term -> weight mapping for the cloud.
    md = dict()
    for key, value in keywords:
        md[key] = value
    wc.generate_from_frequencies(md)
    wc.to_file('电影评论词云图.png')
    # Optional: recolor the cloud to match the mask image's colors.
    # image_colors = ImageColorGenerator(mask)
    # wc.recolor(color_func=image_colors)
    # wc.to_file('电影评论词云图.png')
def data_clear_comments(moviecomments_data_path):
    """Clean comments, vectorize with TF-IDF, cluster users and movies,
    filter both datasets, and show simple per-user recommendations.

    :param moviecomments_data_path: path to the comments CSV file
    :return: None (writes CSVs/PNGs and opens a tkinter window)
    """
    # error_bad_lines is deprecated in pandas >= 1.3; kept for the pandas
    # version this script targets.
    data = pd.read_csv(moviecomments_data_path, error_bad_lines=False)
    comments_list = list(data['comments'])
    username_list = list(data['userName'])
    print(comments_list)
    for index, comment in enumerate(comments_list):
        # jieba segmentation of one comment.
        text = jieba.cut(comment, cut_all=False, HMM=True)
        text = ' '.join(text)
        # Per-comment TF-IDF keywords: top 5 terms, no weights, no POS filter.
        keywords = jieba.analyse.extract_tags(sentence=text, topK=5, withWeight=False,
                                              allowPOS=())
        tf_idf_result = username_list[index].split() + keywords
        print(tf_idf_result)
    tfidf2 = TfidfVectorizer()
    re = tfidf2.fit_transform(comments_list)
    print(re)
    arr = re.toarray()
    print(arr)
    # Standardize the comment feature vectors (z-score over the whole matrix).
    new_data1 = (arr - arr.mean()) / arr.std()
    print(new_data1)
    # Cluster the comment vectors.
    # Bug fix: KMeans(n_jobs=...) was removed in scikit-learn 1.0; dropping
    # it affects only parallelism, never the clustering result.
    kmodel = KMeans(n_clusters=4, max_iter=15, random_state=0)
    kmodel.fit(new_data1)
    # Cluster label per comment.
    category = kmodel.labels_
    print(category)
    # Visualize the cluster labels.
    plt.scatter(range(len(category)), list(category))
    plt.title('聚类结果类别可视化')
    plt.savefig('./聚类结果类别可视化.png')
    plt.show()
    # Append the cluster-label column and rename it.
    final_moviecomment = pd.concat([data, pd.DataFrame(category, index=data.index)], axis=1)
    final_moviecomment.columns = list(data.columns) + ['category']
    print(final_moviecomment)
    # Export (disabled):
    # final_movieinfo.to_csv('./final_moviecomment.csv', index=False,header=True,encoding='utf-8')
    # ---- movie-info processing ----
    movieinfo_data = pd.read_csv('./maoyan_movie.csv')
    movieinfo_datas = movieinfo_data.drop(columns=['detailUrl'])
    movieinfo_datas = movieinfo_datas.iloc[1:, :]  # skip the first data row
    print(movieinfo_datas)
    tfidf2 = TfidfVectorizer()
    re = tfidf2.fit_transform(list(movieinfo_datas['movieName'].to_list()))
    print(re)
    arr = re.toarray()
    print(arr)
    # Standardize the movie-name feature vectors.
    new_data2 = (arr - arr.mean()) / arr.std()
    # Bug fix: this previously printed new_data1 (the comment matrix)
    # instead of the matrix computed just above.
    print(new_data2)
    kmodel1 = KMeans(n_clusters=4, max_iter=15, random_state=0)
    kmodel1.fit(new_data2)
    # Cluster label per movie.
    category = kmodel1.labels_
    print(category)
    # Append the cluster-label column to the movie-info frame.
    final_movieinfo_datas = pd.concat([movieinfo_datas, pd.DataFrame(category, index=movieinfo_datas.index)], axis=1)
    final_movieinfo_datas.columns = list(movieinfo_datas.columns) + ['category']
    print(final_movieinfo_datas)
    # Export (disabled):
    # final_movieinfo_datas.to_csv('./final_movieinfo_datas.csv', index=False, header=True, encoding='utf-8')
    # ---- filtering ----
    # Filter out roughly 25% of users: keep the head-13 slice and report
    # what share of the total score it covers.
    total_score_count = sum(final_moviecomment.score)
    print(float(final_moviecomment.head(n=13).score.sum() / total_score_count) * 100)
    # Users / movies that survive the filter.
    user_subset = list(final_moviecomment.head(13).userName)
    movie_subset = list(final_movieinfo_datas.head(13).movieName)
    # Drop every other user and movie.
    # NOTE(review): these CSVs are the (currently disabled) exports above —
    # they must already exist on disk for this to work.
    moviecomment_dataset = pd.read_csv('./final_moviecomment.csv')
    print(moviecomment_dataset)
    User_dataset_sub = moviecomment_dataset[moviecomment_dataset.userName.isin(user_subset)]
    print(User_dataset_sub)
    del moviecomment_dataset
    User_dataset_sub.to_csv('./User_dataset_sub.csv', index=False)
    movieinfo_dataset = pd.read_csv('./final_movieinfo_datas.csv')
    print(movieinfo_dataset)
    Movieinfo_dataset_sub = movieinfo_dataset[movieinfo_dataset.movieName.isin(movie_subset)]
    print(Movieinfo_dataset_sub)
    del movieinfo_dataset
    Movieinfo_dataset_sub.to_csv('./Movieinfo_dataset_sub.csv', index=False)
    # ---- similarity-based recommendation ----
    # Bucket movie names by cluster label.
    category00 = []
    category01 = []
    category02 = []
    category03 = []
    for index, row in final_movieinfo_datas.iterrows():
        if row['category'] == 0:
            category00.append(row['movieName'])
        elif row['category'] == 1:
            category01.append(row['movieName'])
        elif row['category'] == 2:
            category02.append(row['movieName'])
        else:
            category03.append(row['movieName'])
    new_dict0 = {}
    new_dict1 = {}
    new_dict2 = {}
    new_dict3 = {}
    category000 = []
    category001 = []
    category002 = []
    category003 = []
    for index, row in final_moviecomment.iterrows():
        if row['category'] == 0:
            # Recommend 3 random movies from the user's cluster.
            new_dict0[row['userName']] = random.sample(category00, 3)
            category000.append(new_dict0)
        elif row['category'] == 1:
            new_dict1[row['userName']] = random.sample(category01, 3)
            category001.append(new_dict1)
        else:
            # Clusters 2 and 3 both land here: pad category03 to at least
            # 3 entries by repeating its own items (rewritten from a
            # mutate-while-iterating list comprehension), then add one
            # random fallback title.
            li = ['黑客帝国3:矩阵革命', '黑客帝国', 'V字仇杀队', '本杰明·巴顿奇事']
            i = 0
            while category03 and len(category03) < 3:
                category03.append(category03[i])
                i += 1
            category03 = category03 + random.choice(li).split()
            new_dict3[row['userName']] = random.sample(category03, 3)
            category003.append(new_dict3)
    # NOTE(review): raises IndexError if any of these lists is empty,
    # i.e. if some cluster received no users.
    category_dic = {**category000[0], **category001[0], **category003[0]}
    df = pd.DataFrame(category_dic).T
    df_head = df.head()
    df_tail = df.tail(8)
    new_df = pd.concat([df_head, df_tail], axis=0)
    new_df.columns = ['movie_1', 'movie_2', 'movie_3']
    # Show the recommendation table in a simple tkinter window.
    root = tkinter.Tk()
    label = tkinter.Label(root, text=new_df)  # create the label
    label.pack()  # add the label to the main window
    root.title('Movie recommendation system')
    root.geometry('550x500')
    root.mainloop()
def main():
    """Entry point: build the comment word cloud, then run the
    TF-IDF / clustering / recommendation pipeline.

    :return: None
    """
    # Movie-detail cleaning (disabled):
    # movieinfo_data_path = './maoyan_movie.csv'
    # movieinfo_data = data_clear_movieinfo(movieinfo_data_path)
    comments_path = './maoyan_comments.csv'
    # Comment cleaning + word-cloud generation.
    data_clear_moviecomments(comments_path)
    # Comment cleaning + TF-IDF application and clustering.
    data_clear_comments(comments_path)


if __name__ == '__main__':
    main()