python电影名称词云_Python爬取最近上映的电影评论并生成词云——误杀

最新推荐文章于 2021-06-18 15:57:04 发布

weixin_39866974

最新推荐文章于 2021-06-18 15:57:04 发布

阅读量317

点赞数

文章标签： python电影名称词云

前期准备

准备一张白底图片（代码中文件名为 love.jpg）作为词云底板；此外还需要在当前工作目录下准备停用词表 stopwords.txt（GBK 编码）和中文字体文件 simhei.ttf

说明：本代码爬取的是最近上映的电影，不同的日期运行此段代码可能会得到不同的结果

实现效果

代码实现

import warnings

warnings.filterwarnings("ignore")

import jieba

import numpy

import codecs

import re

import pandas as pd

import matplotlib.pyplot as plt

from PIL import Image

from urllib import request

import requests

from bs4 import BeautifulSoup as bs

from wordcloud import WordCloud,ImageColorGenerator

import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# 分析网页函数

def getNowPlayingMovie_list():
    """Fetch the movies listed on Douban's "now playing" page for Shanghai.

    Returns:
        A list of dicts, one per ``<li class="list-item">`` found inside the
        ``<div id="nowplaying">`` element. Each dict has an ``'id'`` key (the
        item's ``data-subject`` attribute) and, when the item contains at
        least one ``<img>``, a ``'name'`` key holding the ``alt`` text of the
        last image in the item. An empty list is returned when the page has
        no ``nowplaying`` container.
    """
    url = 'https://movie.douban.com/nowplaying/shanghai/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}

    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10)
    soup = bs(resp.text, 'html.parser')

    container = soup.find('div', id='nowplaying')
    if container is None:
        # The response did not contain the expected container (for example
        # an error page); report "no movies" instead of raising IndexError.
        return []

    nowplaying_list = []
    for item in container.find_all('li', class_='list-item'):
        nowplaying_dict = {'id': item['data-subject']}
        # Every <img> overwrites the name, so the last one in the item wins.
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list

# 爬取评论函数

def getCommentsById(movieId, pageNum):
    """Fetch one page of short comments for a Douban movie.

    Args:
        movieId: Douban subject id as a string; it is concatenated into the
            request URL.
        pageNum: 1-based page number. Each page covers 20 comments, so page
            ``n`` is requested with ``start=(n - 1) * 20``.

    Returns:
        A list containing the first ``<p>`` element (a BeautifulSoup tag) of
        every ``<div class="comment">`` on the page, or ``False`` when
        ``pageNum`` is not greater than zero (no request is made then).
    """
    if not pageNum > 0:
        return False
    start = (pageNum - 1) * 20

    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}

    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url=requrl, headers=headers, timeout=10)
    soup = bs(resp.text, 'html.parser')

    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        # find() returns None when the div has no <p>, whereas indexing
        # find_all('p')[0] would raise IndexError on such a div.
        first_paragraph = item.find('p')
        if first_paragraph is not None:
            eachCommentList.append(first_paragraph)
    return eachCommentList

def main():
    """Scrape comments for one now-playing movie and render a word cloud.

    Steps: fetch the now-playing list, download the first 10 comment pages
    of the movie at index 1 of that list, keep only the Chinese characters,
    segment them with jieba, drop stop words, count word frequencies and
    draw the 1000 most frequent words as a word cloud.

    Files read from the current working directory: ``stopwords.txt`` (GBK
    encoded stop-word list), ``love.jpg`` (mask image) and ``simhei.ttf``
    (font). The rendered cloud is shown with matplotlib and written to
    ``show.jpg``.

    Raises:
        IndexError: if the now-playing list has fewer than two entries.
    """
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print(NowPlayingMovie_list)

    # NOTE: index 1 selects the SECOND movie in the list (the original
    # comment said "first"); the index is kept unchanged on purpose.
    movie_index = 1
    if len(NowPlayingMovie_list) <= movie_index:
        raise IndexError(
            'expected at least %d now-playing movies, got %d'
            % (movie_index + 1, len(NowPlayingMovie_list)))
    movie_id = NowPlayingMovie_list[movie_index]['id']

    # Collect comment pages 1..10; each entry is the list returned for a page.
    commentList = []
    for num in range(1, 11):
        commentList.append(getCommentsById(movie_id, num))

    # Stringify each page (HTML markup included) and join in a single pass;
    # the markup is discarded by the Chinese-only filter below.
    comments = ''.join(str(page).strip() for page in commentList)
    print(comments)

    # Keep only runs of Chinese characters, which drops punctuation,
    # Latin text and the HTML tags.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(pattern.findall(comments))

    # Segment the text into words with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Remove stop words.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'],encoding='gbk')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    print(words_df)

    # Count word frequencies. The dict-renaming form
    # .agg({"计数": numpy.size}) raises SpecificationError on pandas >= 1.0,
    # so count with size() and name the column while resetting the index.
    words_stat = (
        words_df.groupby('segment')
        .size()
        .reset_index(name='计数')
        .sort_values(by=['计数'], ascending=False)
    )

    # The white-background picture is used as the mask of the word cloud.
    bg_pic = numpy.array(Image.open("love.jpg"))

    wordcloud = WordCloud(
        font_path="simhei.ttf",
        background_color="white",
        max_font_size=80,
        width=2000,
        height=1800,
        mask=bg_pic,
        # RGB instead of RGBA: the result is saved as JPEG below, and Pillow
        # cannot write RGBA images in that format.
        mode="RGB"
    )

    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    print(word_frequence)

    wordcloud = wordcloud.fit_words(word_frequence)

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

    wordcloud.to_file('show.jpg')

# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger network requests.
if __name__ == "__main__":
    main()