该程序是学完python之后的一个实战项目,通过分析网站的html,来爬取影评,并将其做成词云。
本程序主要分为3个过程。
1、抓取网页数据
使用Python爬虫技术获取豆瓣电影中最新上映电影的网页,其网址如下:
https://movie.douban.com/cinema/nowplaying/qingdao/
2、清理数据
首先将某部电影的影评信息存入eachCommentList列表中。为便于数据清理和词频统计,把eachCommentList列表拼接成字符串comments,将comments字符串中的“也”“太”“的”等虚词(停用词)清理掉后再进行词频统计。
3、用词云进行展示
最后使用词云包对影评信息进行词云展示。
# 分词包
import json
import jieba
import jieba.analyse
# numpy计算包
import numpy
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import requests_html
import requests
from bs4 import BeautifulSoup as bs
matplotlib.rcParams['figure.figsize']=(10.0, 5.0)
# 词云包
from wordcloud import WordCloud
import os
import tkinter as tk
from tkinter import *
from PIL import ImageTk, Image
#分词网页函数
def getNowPlayingMovie_list(url):
    """Fetch a Douban "now playing" page and extract movie ids and names.

    Parameters:
        url: the now-playing page URL, e.g.
             https://movie.douban.com/cinema/nowplaying/qingdao/

    Returns:
        A list of dicts, each with key 'id' (Douban subject id taken from
        the li's data-subject attribute) and, when an <img> is present,
        'name' (movie title from the img alt text).  Returns an empty
        list when the page has no recognizable 'nowplaying' section.
    """
    session = requests_html.HTMLSession()
    r = session.get(url)
    soup = bs(r.html.html, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    # Guard against layout changes / wrong URLs: the original code raised
    # IndexError on nowplaying_movie[0] when the div was missing.
    if not nowplaying_movie:
        return []
    nowplaying_list = []
    for item in nowplaying_movie[0].find_all('li', attrs={'class': 'list-item'}):
        nowplaying_dict = {'id': item['data-subject']}
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
# 爬取评论函数
# 参数为电影id号和要爬取评论的页码
def getCommentsById(movieId, pageNum):
    """Scrape one page of short comments for a Douban movie.

    Parameters:
        movieId: Douban subject id as a string.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        A list of comment strings (possibly empty).  An invalid page
        number (< 1) now yields an empty list instead of the original
        bare ``False``, so callers can always iterate the result.
    """
    eachCommentList = []
    if pageNum < 1:
        # Consistent return type: empty result rather than False.
        return eachCommentList
    start = (pageNum - 1) * 20
    requrl = ('https://movie.douban.com/subject/' + movieId +
              '/comments?start=' + str(start) + '&limit=20')
    session = requests_html.HTMLSession()
    response = session.get(requrl)
    soup = bs(response.html.html, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'comment'}):
        # Comment text lives in <p><span>...</span></p>; either tag may be
        # missing on placeholder entries, so check each step for None.
        p_tag = item.find('p')
        span = p_tag.find('span') if p_tag is not None else None
        if span is not None and span.string is not None:
            eachCommentList.append(span.string)
    return eachCommentList
def main():
    """GUI callback: scrape comments for every now-playing movie and
    render one word cloud per movie.

    Reads the now-playing page URL from ``inp1`` and the number of
    comment pages from ``inp2`` (module-level Tk Entry widgets).  For
    each movie: scrape comments -> keep only Chinese characters ->
    TextRank keyword extraction -> drop stop words -> draw, save and
    briefly display a word cloud named after the movie.

    Side effects: appends keyword JSON to out.txt, writes one
    '<movie>.png' per movie, shows each figure for ~4 seconds.
    """
    url = str(inp1.get())
    page_num = int(inp2.get())
    nowplaying_list = getNowPlayingMovie_list(url)

    # Loop-invariant resources, loaded once instead of once per movie.
    with open('stopword.txt', 'r', encoding='utf-8') as f:
        stop_words = [line.rstrip('\n') for line in f]
    # Mask image that shapes the word cloud.
    pic = np.array(Image.open('dol.jpg'))

    # NOTE: the original code also fetched each movie's detail page here
    # ('/?from=playing_poster'); the response was never used, so that
    # request has been removed.
    for t, movie in enumerate(nowplaying_list):
        movie_name = movie.get("name")

        # Collect the requested number of comment pages (pages are 1-based).
        commentList = []
        for i in range(page_num):
            commentList.append(getCommentsById(movie['id'], i + 1))

        # Flatten page lists into one string, then keep only CJK chars
        # (this also strips punctuation and the list's str() artifacts).
        comments = ''.join(str(page).strip() for page in commentList)
        pattern = re.compile(r'[\u4e00-\u9fa5]')
        cleaned_comments = ''.join(re.findall(pattern, comments))

        # TextRank keyword extraction: top 200 words with weights.
        result = jieba.analyse.textrank(
            cleaned_comments,
            topK=200,
            withWeight=True,
        )
        keywords = {word: weight for word, weight in result}

        # Log keywords before stop-word removal (single open, utf-8).
        with open("out.txt", "a", encoding="utf-8") as fObj:
            fObj.write("删除停用词前:")
            fObj.write(json.dumps(keywords, ensure_ascii=False))
            fObj.write('\n')

        keywords = {x: w for x, w in keywords.items() if x not in stop_words}

        # Log keywords after stop-word removal.
        with open("out.txt", "a", encoding="utf-8") as fObj:
            fObj.write("删除停用词后:")
            fObj.write(json.dumps(keywords, ensure_ascii=False))
            fObj.write('\n')

        # Build and render the word cloud from the weighted keywords.
        wordcloud = WordCloud(
            scale=8,
            font_path='simhei.ttf',
            background_color='white',
            max_font_size=80,
            mask=pic,
            stopwords=stop_words,
        )
        myword = wordcloud.fit_words(keywords)

        # SimHei so the Chinese title renders; keep minus signs intact.
        plt.rcParams["font.sans-serif"] = ["SimHei"]
        plt.rcParams["axes.unicode_minus"] = False
        fig = plt.figure(t)
        plt.imshow(myword)
        plt.axis('off')
        plt.title('电影:《' + movie_name + "》", size=26)
        plt.savefig(movie_name + '.png')
        plt.draw()
        plt.pause(4)  # show each cloud for 4 seconds
        plt.close(fig)
# if __name__ == '__main__':
# print(os.getcwd())
# main()
# Tk GUI: a fixed-size window with a background image, two input fields
# and a search button that triggers main().
root = Tk()
root.geometry('1000x600')
# Canvas used only to display the background picture.
canvas = tk.Canvas(root, width=1000, height=600, bd=0, highlightthickness=0)
# NOTE(review): hard-coded absolute path — assumes D:\123.jpg exists; confirm.
imgpath = 'D:\\123.jpg'
img = Image.open(imgpath)
# Kept in a module-level name so Tk's image is not garbage-collected.
photo = ImageTk.PhotoImage(img)
canvas.create_image(500, 240, image=photo)
canvas.pack()
entry = tk.Entry(root, insertbackground='blue', highlightthickness=2)
entry.pack()
root.title('电影词云搜索')
# Label + entry for the now-playing page URL (read by main() via inp1).
lb1 = Label(root, text='请输入要查找的链接:', bg = 'blue', font=('华文新魏', 15))
lb1.place(relx=0.23, rely=0.2, relwidth=0.25, relheight=0.05)
inp1 = Entry(root)
inp1.place(relx=0.25, rely=0.3, relwidth=0.2, relheight=0.07)
# Label + entry for the number of comment pages (read by main() via inp2).
lb2 = Label(root, text='请输入要查找的页数:', bg = 'blue', font=('华文新魏', 15))
lb2.place(relx=0.53, rely=0.2, relwidth=0.25, relheight=0.05)
inp2 = Entry(root)
inp2.place(relx=0.55, rely=0.3, relwidth=0.2, relheight=0.07)
# The search button runs main() synchronously on the Tk event loop,
# so the UI blocks while scraping and plotting.
btn1 = Button(root, text='搜索', fg='red', bg='blue', font=('华文新魏', 20), command=main)
btn1.place(relx=0.45, rely=0.7, relwidth=0.1, relheight=0.1)
root.mainloop()
效果图:(输入的链接是固定的,如果想输入其他链接,需要再分析对应网页的html)这是爬取到的某一条影评:(词云模板图片有点小,可以再换一个模板)