爬取电视剧评论信息

爬取电视剧的评论信息,并存储到本地。

# coding:utf-8
import urllib.request
import re
import http.cookiejar
import json
import time

# Cursor of the first page to request; it is appended to `url` for each fetch.
cursor_id = "6374452475944677749"

# Comment endpoint with an empty trailing `cursor=` parameter.
url = (
    "https://video.coral.qq.com/varticle/2451377986/comment/v2"
    "?callback=_varticle2451377986commentv2"
    "&orinum=10&oriorder=o&pageflag=1&cursor="
)

# Request headers sent with every call made through the installed opener.
temp_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "utf-8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

# Build a cookie-aware opener and install it globally so that plain
# `urllib.request.urlopen` calls use the cookies and headers configured here.
cjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
headers = list(temp_headers.items())
opener.addheaders = headers
urllib.request.install_opener(opener)


def _strip_jsonp(text):
    """Return the JSON payload from a JSONP ``callback(...)`` response body."""
    stripped = text.strip()
    if stripped.startswith(('{', '[')):
        # Already bare JSON; parentheses inside it belong to the data.
        return stripped
    start = stripped.find('(')
    end = stripped.rfind(')')
    if start == -1 or end < start:
        # No callback wrapper found; let the JSON parser report any problem.
        return stripped
    # Slice between the outermost parentheses only, so that ")" characters
    # inside comment text are preserved.
    return stripped[start + 1:end]


def get_data(cursor_id):
    """Fetch one page of comments, append them to ``comment.txt``, return the next cursor.

    The page is requested from the module-level ``url`` with ``cursor_id``
    appended. The response is a JSONP document; its JSON payload is expected
    to contain ``data.last``, ``data.oriCommList`` (a list of entries with a
    ``content`` field) and ``data.userList`` (a mapping whose values have a
    ``nick`` field).

    Args:
        cursor_id: Cursor string identifying the page to fetch.

    Returns:
        The value of ``data.last`` from the response, which the caller feeds
        back in as the next ``cursor_id``.

    Raises:
        urllib.error.URLError: If the request fails.
        ValueError: If the response body is not valid JSON.
        KeyError: If an expected field is missing from the payload.
    """
    url_new = url + cursor_id
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(url_new) as response:
        raw = response.read().decode("utf-8")

    # `json.loads` no longer accepts an `encoding` argument (removed in 3.9);
    # the text is already decoded, so none is needed.
    temp = json.loads(_strip_jsonp(raw))
    page = temp['data']
    id_last = page['last']  # id of the last comment on this page

    # Text of every comment on the page.
    new_comment_data_list = [entry['content'] for entry in page['oriCommList']]
    print(new_comment_data_list)

    # Nicknames of the users listed on the page.
    new_nick_name_list = [user['nick'] for user in page['userList'].values()]
    print(new_nick_name_list)
    print("++++++++++++++++++++++++++++++++++++++++++++")
    print(id_last)

    # Open the output file once per page instead of once per comment; skip it
    # entirely when there is nothing to write.
    if new_comment_data_list:
        with open('comment.txt', 'a', encoding="utf-8") as fr:
            fr.writelines(comment + '\n' for comment in new_comment_data_list)
    return id_last
# Fetch page after page, feeding each page's last id back in as the next cursor.
# Give up after this many failures in a row instead of retrying forever.
MAX_CONSECUTIVE_FAILURES = 3
failure_count = 0
while True:
    try:
        next_cursor_id = get_data(cursor_id)
    except Exception as exc:
        # `except Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop the script.
        print("信息有误", exc)
        failure_count += 1
        if failure_count >= MAX_CONSECUTIVE_FAILURES:
            break
    else:
        failure_count = 0
        # An empty or unchanged cursor would request the same URL again and
        # append duplicate comments; presumably it marks the end of the list.
        if not next_cursor_id or next_cursor_id == cursor_id:
            break
        cursor_id = next_cursor_id
    # Pause between requests, including after a failure, so that errors do
    # not turn into a tight loop of back-to-back requests.
    time.sleep(1)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值