Python 爬取最近上映的电影评论并生成词云——《误杀》

前期准备

准备一张白底图片作为词云底板

说明:本代码爬取的是最近上映的电影,不同的日期运行此段代码可能会得到不同的结果

实现效果

代码实现

# --- Imports: scraping (requests/bs4), NLP (jieba), data (numpy/pandas),
# --- imaging (PIL/wordcloud), plotting (matplotlib). ---
import warnings

# Silence ALL warnings globally — broad; hides deprecation notices too.
warnings.filterwarnings("ignore")

import jieba  # Chinese word segmentation

import numpy

import codecs  # NOTE(review): imported but never used in this script — confirm before removing

import re

import pandas as pd

import matplotlib.pyplot as plt

from PIL import Image

from urllib import request  # NOTE(review): unused; `requests` is used for HTTP instead

import requests

from bs4 import BeautifulSoup as bs

from wordcloud import WordCloud, ImageColorGenerator

import matplotlib

# Default figure size (width, height) in inches for all matplotlib plots.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# 分析网页函数

# 分析网页函数 — scrape the "now playing" listing page
def getNowPlayingMovie_list():
    """Scrape Douban's now-playing page (Shanghai) for the current movie list.

    Returns:
        list[dict]: one dict per movie with keys ``'id'`` (Douban subject id,
        from the ``data-subject`` attribute) and ``'name'`` (from the poster
        ``<img alt=...>``). Returns an empty list if the expected
        ``div#nowplaying`` container is not found (e.g. layout change or
        anti-scraping page).
    """
    url = 'https://movie.douban.com/nowplaying/shanghai/'
    # Browser-like UA: Douban serves a different/blocked page to bare clients.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    resp = requests.get(url=url, headers=headers)
    soup = bs(resp.text, 'html.parser')

    nowplaying_movie = soup.find_all('div', id='nowplaying')
    # Guard: original indexed [0] unconditionally and crashed with IndexError
    # when the container was absent.
    if not nowplaying_movie:
        return []
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {'id': item['data-subject']}
        # If multiple <img> tags exist, the last alt text wins (matches original).
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list

# 爬取评论函数

# 爬取评论函数 — fetch one page of short comments for a movie
def getCommentsById(movieId, pageNum):
    """Fetch one page (20 items) of Douban short comments for a movie.

    Args:
        movieId: Douban subject id as a string (e.g. ``'19899707'``).
        pageNum: 1-based page number; must be positive.

    Returns:
        list: the first ``<p>`` Tag of each comment ``div`` on the page
        (bs4 Tag objects, not plain text — callers ``str()`` them).
        Returns ``False`` when ``pageNum <= 0`` (legacy sentinel kept for
        backward compatibility with existing callers).
    """
    eachCommentList = []
    if pageNum <= 0:
        return False
    start = (pageNum - 1) * 20  # Douban paginates comments 20 per page

    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    resp = requests.get(url=requrl, headers=headers)
    soup = bs(resp.text, 'html.parser')

    comment_div_lits = soup.find_all('div', class_='comment')
    for item in comment_div_lits:
        # Original did `item.find_all('p')[0] is not None`, which never filters
        # anything and raises IndexError when a comment div has no <p> at all.
        p_tags = item.find_all('p')
        if p_tags:
            eachCommentList.append(p_tags[0])
    return eachCommentList

def _collect_comments(movie_id, pages=10):
    """Fetch *pages* comment pages for *movie_id*, concatenated into one string.

    Mirrors the original behavior: each page's list of bs4 Tags is str()-ed
    whole, stripped, and concatenated.
    """
    comment_pages = []
    for page in range(1, pages + 1):
        comment_pages.append(getCommentsById(movie_id, page))
    return ''.join(str(page_comments).strip() for page_comments in comment_pages)


def _extract_chinese(text):
    """Keep only CJK Unified Ideograph runs, joined into one string (drops punctuation, HTML, Latin)."""
    return ''.join(re.findall(r'[\u4e00-\u9fa5]+', text))


def _word_frequencies(cleaned_comments, top_n=1000):
    """Segment *cleaned_comments* with jieba, drop stopwords, and return a
    {word: count} dict of the *top_n* most frequent words."""
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Stopword list is a GBK-encoded, tab-separated, one-word-per-line file.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='gbk')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count word frequencies. The original used
    # .agg({"计数": numpy.size}) on a SeriesGroupBy, a dict-renaming form that
    # was deprecated and removed in modern pandas; groupby().size() is the
    # supported equivalent.
    words_stat = words_df.groupby('segment').size().reset_index(name='计数')
    words_stat = words_stat.sort_values(by='计数', ascending=False)
    return {row[0]: row[1] for row in words_stat.head(top_n).values}


def _render_wordcloud(word_frequence):
    """Render *word_frequence* as a word cloud masked by love.jpg, show it,
    and save it to show.jpg."""
    # White-background mask image; the cloud fills its non-white area.
    bg_pic = numpy.array(Image.open("love.jpg"))
    wordcloud = WordCloud(
        font_path="simhei.ttf",   # CJK-capable font; default font renders boxes
        background_color="white",
        max_font_size=80,
        width=2000,
        height=1800,
        mask=bg_pic,
        mode="RGBA",
    )
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('show.jpg')


def main():
    """Scrape 10 pages of comments for one now-playing movie and render a word cloud."""
    nowplaying_movie_list = getNowPlayingMovie_list()
    print(nowplaying_movie_list)
    # Original hard-codes index [1] (the second listed movie); guard against
    # a short or empty listing instead of crashing with IndexError.
    if len(nowplaying_movie_list) < 2:
        print("Fewer than two now-playing movies found; aborting.")
        return

    comments = _collect_comments(nowplaying_movie_list[1]['id'], pages=10)
    print(comments)

    cleaned_comments = _extract_chinese(comments)
    word_frequence = _word_frequencies(cleaned_comments)
    print(word_frequence)

    _render_wordcloud(word_frequence)

main()
