少年的你电影评论爬取和分析实战

导包

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import requests
from lxml import etree

爬取数据

# First page:  https://movie.douban.com/subject/30166972/comments?start=0&limit=20&sort=new_score&status=P
# Second page: https://movie.douban.com/subject/30166972/comments?start=20&limit=20&sort=new_score&status=P
# URL template; the %d placeholder takes the offset of the first comment on the page.
url = 'https://movie.douban.com/subject/30166972/comments?start=%d'

# Request headers sent with every page request (they look like a capture of a
# desktop Chrome session). NOTE(review): the Cookie value is a hard-coded
# session snapshot and has presumably expired - confirm before re-running.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'll="118110"; bid=WOtRzZzB5n0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1582074463%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1948976768.1582074463.1582074463.1582074463.1; __utmc=30149280; __utmz=30149280.1582074463.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1954541914.1582074463.1582074463.1582074463.1; __utmb=223695111.0.10.1582074463; __utmc=223695111; __utmz=223695111.1582074463.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __yadk_uid=fZlVZoEw3nfJ2LTBimMXwbQ3rZSTg4gt; __utmt=1; __utmb=30149280.1.10.1582074463; __gads=ID=c96f4a66287780c3:T=1582074474:S=ALNI_MZg6jl7uY5UK19V5rQcW3J0oTvm6Q; ct=y; _pk_id.100001.4cf6=7c86c6587c0eeadf.1582074463.1.1582074503.1582074463.',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/subject/30413052/reviews?start=20',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
}
# Cell-local imports: the loop below calls time.sleep(), and the csv module
# takes care of quoting fields that contain tabs, quotes or line breaks.
import csv
import time


def _first_text(nodes, default=''):
    """Return the first XPath result with surrounding whitespace stripped.

    Falls back to ``default`` when the XPath query matched nothing, instead
    of raising IndexError on ``nodes[0]``.
    """
    return nodes[0].strip() if nodes else default


# The context manager closes the file even if a request or parse step raises.
# newline='' hands line-ending control to the csv writer.
with open('./climb_data1.csv', mode='w', encoding='utf-8', newline='') as fp:
    writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
    writer.writerow(['author', 'comment', 'vote'])
    # Without logging in to Douban only the first few pages are served. The
    # original note says 10 pages while the loop asks for 11 (start=0..200),
    # so keep the request count and stop as soon as a page has no comments.
    for i in range(11):
        url_climb = url % (i * 20)
        # The timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(url_climb, headers=headers, timeout=10)
        if r.status_code != 200 or not r.text.strip():
            # An error/blocked page holds no comments; do not report it as saved.
            print('第%d页请求失败,状态码%d' % (i + 1, r.status_code))
            break
        r.encoding = 'utf-8'
        # etree.HTML() builds a parse tree that supports XPath queries and
        # repairs malformed HTML along the way.
        html = etree.HTML(r.text)
        comments = html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
        if not comments:
            # Ran past the last page that is available.
            break
        for comment in comments:
            # Comment text: drop embedded line breaks and surrounding whitespace.
            texts = comment.xpath('.//span[@class="short"]/text()')
            if not texts:
                # Nothing to analyse for this item.
                continue
            p = texts[0].replace("\n", "").strip()
            # Author name.
            author = _first_text(comment.xpath('./div[@class="avatar"]/a/@title'))
            # Number of up-votes (left empty when the element is missing).
            vote = _first_text(comment.xpath('.//span[@class="votes"]/text()'))
            writer.writerow([author, p, vote])
        print('第%d页保存成功' % (i + 1))
        # Pause for one second between pages to avoid tripping anti-scraping limits.
        time.sleep(1)

读取数据

# Load the tab-separated crawl output back into a DataFrame.
csv_path = './climb_data1.csv'
data = pd.read_csv(csv_path, delimiter='\t')
# Bare expression: displays the frame when it is the last line of a notebook cell.
data

存入数据库

from sqlalchemy import create_engine
# SQLAlchemy connection URL: local MySQL server through the pymysql driver,
# user root with an empty password, database movie_data, utf8 charset.
conn = "mysql+pymysql://root:@127.0.0.1:3306/movie_data?charset=utf8"
# Build the Engine explicitly (the documented type for to_sql's `con`) rather
# than passing the raw URL string, and release its connection pool afterwards.
engine = create_engine(conn)
try:
    # Append the rows, DataFrame index included, to the young_you table.
    data.to_sql('young_you', engine, index=True, if_exists='append')
finally:
    engine.dispose()

SnowNLP进行情感分析

from snownlp import SnowNLP
def convert(comment):
    """Run SnowNLP sentiment analysis on one comment and return its score.

    The value is coerced with ``str()`` before being handed to SnowNLP. The
    returned score ranges from 0 (negative) to 1 (positive).
    """
    return SnowNLP(str(comment)).sentiments
# Series.apply calls convert once per comment; store the scores in a new column.
scores = data['comment'].apply(convert)
data['情感评分'] = scores
# Highest score first; sorted in place so later cells see the reordered frame.
data.sort_values('情感评分', ascending=False, inplace=True)
# Save the scored and sorted comments.
data.to_csv('./young_you.csv')

jieba分词进行数据可视化

import jieba
from PIL import Image
from jieba import analyse
from wordcloud import wordcloud
# Merge every comment into one text; str() guards against non-string cells.
comments = ';'.join([str(c) for c in data['comment'].tolist()])
# Segment the text with jieba and join the tokens with spaces.
gen = jieba.cut(comments)
words = ' '.join(gen)
# Extract up to 500 keywords together with their weights.
tags = analyse.extract_tags(words, topK=500, withWeight=True)
word_result = pd.DataFrame(tags, columns=['词语', '重要性'])
word_result.sort_values(by='重要性', ascending=False, inplace=True)

# Visualise the highest-weighted keywords. Take however many are available
# (at most 10) so a small corpus does not cause a length mismatch in barh,
# and reverse them so the top keyword is drawn at the top of the chart.
top_words = word_result.iloc[:10][::-1]
positions = np.arange(len(top_words))
plt.barh(y=positions, width=top_words['重要性'])
# Bars are horizontal, so the weight runs along the x axis.
plt.xlabel('Importance')
# KaiTi is a font with Chinese glyphs, needed to render the keyword labels.
plt.yticks(positions, top_words['词语'], fontproperties='KaiTi')
plt.show()

图:重要性最高的 10 个词语的水平条形图

你知道的越多,你不知道的越多。
有道无术,术尚可求,有术无道,止于术。
如有其它问题,欢迎大家留言,我们一起讨论,一起学习,一起进步

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值