该程序是学完python之后的一个实战项目,通过分析网站的html,来爬取影评,并将其做成词云。
本程序主要分为3个过程。
1、抓取网页数据
使用Python爬虫技术获取豆瓣电影中最新上映电影的网页,其网址如下:
https://movie.douban.com/cinema/nowplaying/qingdao/
2、清理数据
首先将某部电影的影评信息存入eachCommentList列表中。为便于数据清理和词频统计,把eachCommentList列表拼接成字符串comments,将comments字符串中的“也”“太”“的”等虚词(停用词)清理掉后再进行词频统计。
3、用词云进行展示
最后使用词云包对影评信息进行词云展示。
# 分词包
import json
import jieba
import jieba.analyse
# numpy计算包
import numpy
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import requests_html
import requests
from bs4 import BeautifulSoup as bs
matplotlib.rcParams['figure.figsize']=(10.0, 5.0)
# 词云包
from wordcloud import WordCloud
import os
import tkinter as tk
from tkinter import *
from PIL import ImageTk, Image
#分词网页函数
def getNowPlayingMovie_list(url):
    """Fetch a Douban "now playing" page and extract movie ids and names.

    Parameters:
        url: the now-playing page URL, e.g.
             https://movie.douban.com/cinema/nowplaying/qingdao/

    Returns:
        A list of dicts, each with key 'id' (Douban subject id taken from
        the li's data-subject attribute) and, when an <img> is present,
        'name' (movie title from the img alt text).  Returns an empty
        list when the page has no recognizable 'nowplaying' section.
    """
    session = requests_html.HTMLSession()
    r = session.get(url)
    soup = bs(r.html.html, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    # Guard against layout changes / wrong URLs: the original code raised
    # IndexError on nowplaying_movie[0] when the div was missing.
    if not nowplaying_movie:
        return []
    nowplaying_list = []
    for item in nowplaying_movie[0].find_all('li', attrs={'class': 'list-item'}):
        nowplaying_dict = {'id': item['data-subject']}
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
# 爬取评论函数
# 参数为电影id号和要爬取评论的页码
def getCommentsById(movieId, pageNum):
    """Scrape one page of short comments for a Douban movie.

    Parameters:
        movieId: Douban subject id as a string.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        A list of comment strings (possibly empty).  An invalid page
        number (< 1) now yields an empty list instead of the original
        bare ``False``, so callers can always iterate the result.
    """
    eachCommentList = []
    if pageNum < 1:
        # Consistent return type: empty result rather than False.
        return eachCommentList
    start = (pageNum - 1) * 20
    requrl = ('https://movie.douban.com/subject/' + movieId +
              '/comments?start=' + str(start) + '&limit=20')
    session = requests_html.HTMLSession()
    response = session.get(requrl)
    soup = bs(response.html.html, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'comment'}):
        # Comment text lives in <p><span>...</span></p>; either tag may be
        # missing on placeholder entries, so check each step for None.
        p_tag = item.find('p')
        span = p_tag.find('span') if p_tag is not None else None
        if span is not None and span.string is not None:
            eachCommentList.append(span.string)
    return eachCommentList
def main():
    """GUI callback: scrape comments for every now-playing movie and
    render one word cloud per movie.

    Reads the now-playing page URL from ``inp1`` and the number of
    comment pages from ``inp2`` (module-level Tk Entry widgets).  For
    each movie: scrape comments -> keep only Chinese characters ->
    TextRank keyword extraction -> drop stop words -> draw, save and
    briefly display a word cloud named after the movie.

    Side effects: appends keyword JSON to out.txt, writes one
    '<movie>.png' per movie, shows each figure for ~4 seconds.
    """
    url = str(inp1.get())
    page_num = int(inp2.get())
    nowplaying_list = getNowPlayingMovie_list(url)

    # Loop-invariant resources, loaded once instead of once per movie.
    with open('stopword.txt', 'r', encoding='utf-8') as f:
        stop_words = [line.rstrip('\n') for line in f]
    # Mask image that shapes the word cloud.
    pic = np.array(Image.open('dol.jpg'))

    # NOTE: the original code also fetched each movie's detail page here
    # ('/?from=playing_poster'); the response was never used, so that
    # request has been removed.
    for t, movie in enumerate(nowplaying_list):
        movie_name = movie.get("name")

        # Collect the requested number of comment pages (pages are 1-based).
        commentList = []
        for i in range(page_num):
            commentList.append(getCommentsById(movie['id'], i + 1))

        # Flatten page lists into one string, then keep only CJK chars
        # (this also strips punctuation and the list's str() artifacts).
        comments = ''.join(str(page).strip() for page in commentList)
        pattern = re.compile(r'[\u4e00-\u9fa5]')
        cleaned_comments = ''.join(re.findall(pattern, comments))

        # TextRank keyword extraction: top 200 words with weights.
        result = jieba.analyse.textrank(
            cleaned_comments,
            topK=200,
            withWeight=True,
        )
        keywords = {word: weight for word, weight in result}

        # Log keywords before stop-word removal (single open, utf-8).
        with open("out.txt", "a", encoding="utf-8") as fObj:
            fObj.write("删除停用词前:")
            fObj.write(json.dumps(keywords, ensure_ascii=False))
            fObj.write('\n')

        keywords = {x: w for x, w in keywords.items() if x not in stop_words}

        # Log keywords after stop-word removal.
        with open("out.txt", "a", encoding="utf-8") as fObj:
            fObj.write("删除停用词后:")
            fObj.write(json.dumps(keywords, ensure_ascii=False))
            fObj.write('\n')

        # Build and render the word cloud from the weighted keywords.
        wordcloud = WordCloud(
            scale=8,
            font_path='simhei.ttf',
            background_color='white',
            max_font_size=80,
            mask=pic,
            stopwords=stop_words,
        )
        myword = wordcloud.fit_words(keywords)

        # SimHei so the Chinese title renders; keep minus signs intact.
        plt.rcParams["font.sans-serif"] = ["SimHei"]
        plt.rcParams["axes.unicode_minus"] = False
        fig = plt.figure(t)
        plt.imshow(myword)
        plt.axis('off')
        plt.title('电影:《' + movie_name + "》", size=26)
        plt.savefig(movie_name + '.png')
        plt.draw()
        plt.pause(4)  # show each cloud for 4 seconds
        plt.close(fig)
# if __name__ == '__main__':
# print(os.getcwd())
# main()
# Tk GUI: a fixed-size window with a background image, two input fields
# and a search button that triggers main().
root = Tk()
root.geometry('1000x600')
# Canvas used only to display the background picture.
canvas = tk.Canvas(root, width=1000, height=600, bd=0, highlightthickness=0)
# NOTE(review): hard-coded absolute path — assumes D:\123.jpg exists; confirm.
imgpath = 'D:\\123.jpg'
img = Image.open(imgpath)
# Kept in a module-level name so Tk's image is not garbage-collected.
photo = ImageTk.PhotoImage(img)
canvas.create_image(500, 240, image=photo)
canvas.pack()
entry = tk.Entry(root, insertbackground='blue', highlightthickness=2)
entry.pack()
root.title('电影词云搜索')
# Label + entry for the now-playing page URL (read by main() via inp1).
lb1 = Label(root, text='请输入要查找的链接:', bg = 'blue', font=('华文新魏', 15))
lb1.place(relx=0.23, rely=0.2, relwidth=0.25, relheight=0.05)
inp1 = Entry(root)
inp1.place(relx=0.25, rely=0.3, relwidth=0.2, relheight=0.07)
# Label + entry for the number of comment pages (read by main() via inp2).
lb2 = Label(root, text='请输入要查找的页数:', bg = 'blue', font=('华文新魏', 15))
lb2.place(relx=0.53, rely=0.2, relwidth=0.25, relheight=0.05)
inp2 = Entry(root)
inp2.place(relx=0.55, rely=0.3, relwidth=0.2, relheight=0.07)
# The search button runs main() synchronously on the Tk event loop,
# so the UI blocks while scraping and plotting.
btn1 = Button(root, text='搜索', fg='red', bg='blue', font=('华文新魏', 20), command=main)
btn1.place(relx=0.45, rely=0.7, relwidth=0.1, relheight=0.1)
root.mainloop()
效果图:(输入的链接是固定的,如果想输入其他链接,需要再分析对应网页的html)这是爬取到的某一条影评:(词云模板图片有点小,可以再换一个模板)