
Scraping and Analyzing Douban Reviews of The Climbers (《攀登者》) with Python and jieba

0. Project Structure

Here simkai.ttf is a font file. On Windows, the fonts bundled with the system can be found under

C:\Windows\Fonts
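For reference, a plausible project layout, inferred from the file paths the three scripts below use; the script names are illustrative, not taken from the original project:

climber/
├── spider.py              # Part 1: scrape Douban reviews
├── sentiment.py           # Part 2: SnowNLP sentiment analysis
├── wordcloud_analysis.py  # Part 3: jieba keywords, bar chart, word cloud
├── simkai.ttf             # KaiTi font used by the word cloud
├── bg.jpg                 # mask image for the word cloud
├── climb.csv              # scraped reviews (output of Part 1)
├── climb_snownlp.csv      # reviews with sentiment scores (output of Part 2)
├── 条形图_20个keyword.jpg  # keyword bar chart (output of Part 3)
└── 攀登者词云.jpg          # word cloud image (output of Part 3)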

1. Scraping the Douban Review Data

# -*- coding: utf-8 -*-
"""Scrape Douban reviews of The Climbers."""
import requests
from lxml import etree
import time

url = "https://movie.douban.com/subject/30413052/comments?start=%d&limit=20&sort=new_score&status=P"

# Request headers (substitute your own Cookie value)
headers = {'Host': 'movie.douban.com',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
           #'Accept-Encoding': 'gzip, deflate, br',
           'Connection': 'keep-alive',
           'Cookie': 'bid=TXwfIvNFTRE; douban-fav-remind=1; __gads=ID=e042951d078c30b3:T=1570518321:S=ALNI_Mbp-ZmoryuBFEnTQy24mwdf0B89ig; __utma=30149280.1448315194.1570518324.1570518324.1572927825.2; __utmz=30149280.1570518324.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=589509e524ead00f.1572927824.1.1572927824.1572927824.; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1572927825; __utmc=30149280; __utma=223695111.1094105223.1572927825.1572927825.1572927825.1; __utmb=223695111.0.10.1572927825; __utmc=223695111; __utmz=223695111.1572927825.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0',
           'Upgrade-Insecure-Requests': '1',
           'Cache-Control': 'max-age=0'}

if __name__ == '__main__':
    f = open("./climb.csv", mode="w", encoding='utf-8')
    f.write("author\tcomment\tvotes\n")
    # start: 0, 20, 40, ..., 200
    for i in range(11):  # range is half-open, so i runs from 0 to 10
        # 1. Build the page URL; only the first 11 pages of data are accessible
        if i == 10:  # last page
            url_climb = url % 200
        else:
            url_climb = url % (i * 20)
        # 2. Send the request, set the encoding, and get the page text
        response = requests.get(url_climb, headers=headers)
        response.encoding = "utf-8"
        text = response.text
        # Optionally dump the raw HTML for inspection (note the distinct
        # variable name, so the CSV handle f is not shadowed):
        #with open("./climb.html", mode="w", encoding="utf-8") as hf:
        #    hf.write(text)
        # Parse the page with lxml's etree
        html = etree.HTML(text)
        comments = html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
        for comment in comments:
            # Reviewer name
            author = comment.xpath('./div[@class="avatar"]/a/@title')[0].strip()
            # Review text
            p = comment.xpath('.//span[@class="short"]/text()')[0].strip()
            # Upvote count for this review
            vote = comment.xpath('.//span[@class="votes"]/text()')[0].strip()
            #print(author, p, vote)
            f.write("%s\t%s\t%s\n" % (author, p, vote))
        # Print progress, then sleep one second to avoid anti-scraping measures
        print("Page %d saved successfully" % (i + 1))
        time.sleep(1)
    f.close()
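The XPath expressions above target Douban's comment markup. As a quick sanity check, the same expressions can be run against a minimal inline HTML fragment; the fragment below is a hand-written stand-in that mimics the structure the XPath expects, not actual Douban output:

# -*- coding: utf-8 -*-
"""Minimal check of the comment XPath expressions against a stand-in fragment."""
from lxml import etree

# Hand-written HTML mimicking Douban's comment markup (not a real page)
snippet = """
<div id="comments">
  <div class="comment-item">
    <div class="avatar"><a title="some_user" href="#"></a></div>
    <div class="comment">
      <span class="votes">42</span>
      <p><span class="short">A great mountaineering film.</span></p>
    </div>
  </div>
</div>
"""

html = etree.HTML(snippet)
for comment in html.xpath('//div[@id="comments"]/div[@class="comment-item"]'):
    author = comment.xpath('./div[@class="avatar"]/a/@title')[0].strip()
    short = comment.xpath('.//span[@class="short"]/text()')[0].strip()
    votes = comment.xpath('.//span[@class="votes"]/text()')[0].strip()
    print(author, short, votes)  # some_user A great mountaineering film. 42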

2. Sentiment Analysis of the Reviews

# -*- coding: utf-8 -*-
"""
pandas: Python data analysis library. read_csv returns a DataFrame of
(rows, columns), where rows are samples and columns are attributes.
"""
import pandas as pd
from snownlp import SnowNLP

# Show all columns when printing DataFrames
pd.set_option('display.max_columns', None)


def convert(comment):
    """Run sentiment analysis on a single comment."""
    snow = SnowNLP(str(comment))
    sentiments = snow.sentiments  # score in [0, 1]: 0 = negative, 1 = positive
    return sentiments


if __name__ == '__main__':
    data = pd.read_csv('./climb.csv', sep='\t')
    #print(data.head(), "\n", data.shape)
    # Score every comment; this adds a new column named '情感评分'
    # (sentiment score) to the DataFrame
    data['情感评分'] = data.comment.apply(convert)
    data.sort_values(by='情感评分', ascending=False, inplace=True)
    # Save the scored data
    data.to_csv('./climb_snownlp.csv', sep='\t', index=False, encoding='utf-8')
    print(data[:5])   # five most positive comments
    print(data[-5:])  # five most negative comments
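SnowNLP's sentiments property returns a probability-like score in [0, 1], where values near 1 indicate a positive comment. A minimal standalone check; the sentences are made up, and the exact scores depend on SnowNLP's bundled model:

# -*- coding: utf-8 -*-
from snownlp import SnowNLP

# Scores near 1 are positive, near 0 negative; exact values vary with the model
print(SnowNLP(u'这部电影太精彩了,非常感人').sentiments)  # expected high, e.g. > 0.9
print(SnowNLP(u'剧情拖沓,很失望').sentiments)           # expected low, e.g. < 0.1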

3. jieba Segmentation of the Reviews: Keyword Bar Chart and Word Cloud

# -*- coding: utf-8 -*-
import pandas as pd
import jieba
from jieba import analyse
import matplotlib.pyplot as plt
import numpy as np
import wordcloud
from PIL import Image

if __name__ == '__main__':
    data = pd.read_csv('./climb.csv', sep='\t')
    # List comprehension: join all comments into one long string
    comments = ';'.join([str(c) for c in data['comment'].tolist()])
    #print(comments)
    # Segment the text with jieba; cut() returns a generator
    gen_ret = jieba.cut(comments)
    seg_words = '/'.join(gen_ret)
    #print(seg_words)
    # Extract keywords: topK is the number of keywords to return,
    # withWeight=True pairs each keyword with its weight
    tags_ret = analyse.extract_tags(seg_words, topK=500, withWeight=True)
    #print(tags_ret)
    # Convert the (keyword, weight) pairs into a DataFrame
    df_ret = pd.DataFrame(tags_ret, columns=['词语', '重要性'])
    df_ret.sort_values(by='重要性', ascending=False, inplace=True)  # sort by importance, descending
    #print(df_ret)
    # Visualization: of the 500 keywords, plot the top 20
    plt.barh(y=np.arange(0, 20), width=df_ret[:20]['重要性'][::-1])
    plt.xlabel('Importance')  # in a horizontal bar chart, importance runs along the x axis
    plt.yticks(np.arange(0, 20), labels=df_ret[:20]['词语'][::-1], fontproperties='KaiTi')
    # Save the bar chart. savefig must be called before show();
    # dpi sets the output resolution
    plt.savefig('./条形图_20个keyword.jpg', dpi=200)
    plt.show()

    # Word cloud
    bg = np.array(Image.open('./bg.jpg'))  # mask image that shapes the cloud
    words = dict(tags_ret)  # convert the keyword/weight pairs to a dict
    cloud = wordcloud.WordCloud(width=1200, height=968,
                                font_path='./simkai.ttf',  # font file, needed for Chinese text
                                background_color='white', mask=bg,
                                max_words=500, max_font_size=150)
    # Render the word cloud from the keyword weights
    word_cloud = cloud.generate_from_frequencies(words)
    plt.figure(figsize=(12, 12))
    plt.imshow(word_cloud)
    # Save the word cloud (again, before show())
    plt.savefig('./攀登者词云.jpg', dpi=200)
    plt.show()
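For reference, jieba.cut and jieba.analyse.extract_tags can be tried on a single sentence. Note that extract_tags runs its own segmentation internally, so it would also accept the raw comment string instead of the pre-joined seg_words. A minimal sketch; the sample sentence is made up:

# -*- coding: utf-8 -*-
import jieba
from jieba import analyse

sentence = u'攀登者是一部关于中国登山队攀登珠峰的电影'

# Precise mode (the default): cut() returns a generator of tokens
print('/'.join(jieba.cut(sentence)))

# TF-IDF keyword extraction: topK limits how many keywords are returned,
# withWeight=True pairs each keyword with its weight
for word, weight in analyse.extract_tags(sentence, topK=5, withWeight=True):
    print(word, weight)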
