前期准备
准备一张白底图片(代码中为 love.jpg)作为词云底板,并在同一目录下准备停用词文件 stopwords.txt 和字体文件 simhei.ttf
说明:本代码爬取的是最近上映的电影,在不同日期运行此段代码可能会得到不同的结果。
实现效果
代码实现
import warnings
# Suppress all warnings; this runs before the remaining imports, so warnings
# they emit at import time are silenced as well.
warnings.filterwarnings("ignore")
import jieba
import numpy
# NOTE(review): `codecs` is not referenced in the visible code.
import codecs
import re
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
# NOTE(review): `request` is only referenced in commented-out lines below.
from urllib import request
import requests
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud,ImageColorGenerator
import matplotlib
# Default figure size used by matplotlib for every figure created afterwards.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
def getNowPlayingMovie_list():
    """Scrape Douban's "now playing" page for Shanghai.

    Returns:
        A list with one dict per ``<li class="list-item">`` found inside the
        ``<div id="nowplaying">`` element. Each dict has the key ``'id'``
        (the item's ``data-subject`` attribute) and, when the item contains
        an ``<img>`` carrying an ``alt`` attribute, the key ``'name'`` (the
        ``alt`` text of the last such image). An empty list is returned when
        the page contains no ``nowplaying`` div.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    url = 'https://movie.douban.com/nowplaying/shanghai/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    # Without a timeout requests may block forever on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page.
    resp.raise_for_status()

    soup = bs(resp.text, 'html.parser')
    nowplaying_div = soup.find('div', id='nowplaying')
    if nowplaying_div is None:
        # Page layout changed or the request was served a different page.
        return []

    nowplaying_list = []
    for item in nowplaying_div.find_all('li', class_='list-item'):
        nowplaying_dict = {'id': item['data-subject']}
        # The movie title is carried by the poster image's alt text; when an
        # item holds several images the last one wins.
        for tag_img_item in item.find_all('img'):
            alt_text = tag_img_item.get('alt')
            if alt_text is not None:
                nowplaying_dict['name'] = alt_text
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
def getCommentsById(movieId, pageNum):
    """Fetch one page of short comments for a Douban movie.

    Args:
        movieId: Douban subject id; converted with ``str()`` when the URL is
            built, so both strings and integers are accepted.
        pageNum: 1-based page number; every page covers 20 comments.

    Returns:
        ``False`` when ``pageNum`` is not positive (no request is made).
        Otherwise a list holding the first ``<p>`` tag of every
        ``<div class="comment">`` on the page; comment divs without a
        ``<p>`` are skipped.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    if pageNum <= 0:
        return False
    start = (pageNum - 1) * 20

    requrl = ('https://movie.douban.com/subject/' + str(movieId)
              + '/comments' + '?' + 'start=' + str(start) + '&limit=20')
    print(requrl)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    # Without a timeout requests may block forever on a stalled connection.
    # The status code is deliberately not checked: a page that cannot be
    # fetched simply yields no comment divs and therefore an empty list.
    resp = requests.get(url=requrl, headers=headers, timeout=10)
    soup = bs(resp.text, 'html.parser')

    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        # find() returns None when there is no <p>; indexing find_all()[0]
        # would raise IndexError instead and can never be None.
        first_paragraph = item.find('p')
        if first_paragraph is not None:
            eachCommentList.append(first_paragraph)
    return eachCommentList
def main():
    """Build, display and save a word cloud from one movie's short comments.

    Pipeline: fetch the now-playing list, download the first 10 comment
    pages of the selected movie, keep only the Chinese characters, segment
    them with jieba, drop stop words, count word frequencies and render the
    1000 most frequent words as a word cloud.

    Files read from the working directory: ``stopwords.txt`` (GBK encoded,
    read as a single tab-separated column), ``love.jpg`` (mask image) and
    ``simhei.ttf`` (font). The result is shown with matplotlib and written
    to ``show.jpg``.

    Raises:
        IndexError: when the now-playing list has no entry at the selected
            index.
    """
    # NOTE(review): the original comment said "first movie", but index 1
    # selects the second entry of the list; the index is kept as written.
    movie_index = 1
    page_count = 10

    NowPlayingMovie_list = getNowPlayingMovie_list()
    print(NowPlayingMovie_list)
    if len(NowPlayingMovie_list) <= movie_index:
        raise IndexError(
            'now-playing list has %d entries, need at least %d'
            % (len(NowPlayingMovie_list), movie_index + 1))
    movie_id = NowPlayingMovie_list[movie_index]['id']

    # Collect the comment tags page by page (pages are 1-based).
    commentList = []
    for num in range(1, page_count + 1):
        commentList.append(getCommentsById(movie_id, num))

    # Flatten every page to its string form; the markup is discarded by the
    # Chinese-only filter below.
    comments = ''.join(str(page).strip() for page in commentList)
    print(comments)

    # Keep only runs of Chinese characters (drops punctuation, markup, ASCII).
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(pattern.findall(comments))

    # Segment the text into words with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Remove stop words.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='gbk')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    print(words_df)

    # Count occurrences per word. The dict-renaming form
    # `.agg({"计数": numpy.size})` on a grouped Series raises
    # SpecificationError since pandas 1.0, so size() is used instead.
    words_stat = (
        words_df.groupby('segment')
        .size()
        .reset_index(name='计数')
        .sort_values(by=['计数'], ascending=False)
    )

    # Load the white-background picture used as the cloud mask; the context
    # manager closes the file once the pixels are copied into the array.
    with Image.open("love.jpg") as mask_image:
        bg_pic = numpy.array(mask_image)

    wordcloud = WordCloud(
        font_path="simhei.ttf",
        background_color="white",
        max_font_size=80,
        width=2000,
        height=1800,
        mask=bg_pic,
        mode="RGBA",
    )

    top_words = words_stat.head(1000)
    word_frequence = {
        word: int(count)
        for word, count in zip(top_words['segment'], top_words['计数'])
    }
    print(word_frequence)

    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

    # JPEG cannot store an alpha channel, so saving the RGBA canvas directly
    # fails in current Pillow; convert to RGB first (the background is opaque
    # white, so nothing visible is lost).
    wordcloud.to_image().convert('RGB').save('show.jpg', optimize=True)
# Run the whole pipeline; this call is unconditional, so it also executes
# when the file is imported as a module.
main()