#@kaiyiching
import requests
import re
import json
import io
import sys
import datetime
import time,random
import csv
import openpyxl
# Force stdout to UTF-8 so printed text is encoded the same way regardless of
# the console's default encoding.
# reconfigure() switches the encoding in place and keeps the stream's existing
# buffering mode; wrapping sys.stdout.buffer in a fresh TextIOWrapper (the
# fallback) would otherwise lose line buffering, so it is requested explicitly.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
else:
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding='utf-8', line_buffering=True
    )
# Video ID of every episode, in episode order (index 0 is episode 1).
# NOTE(review): entries 54 and 55 are the same ID ('1789058581'); this looks
# like a copy/paste slip -- confirm the real ID for that episode.
vids = [
    '1743283224', '1743283291', '1744194068', '1744194109',
    '1745133738', '1745133746', '1746125178', '1746125211',
    '1747460419', '1747460409', '1748886128', '1750117919',
    '1750117958', '1751794201', '1751794196', '1753505550',
    '1753505443', '1755270031', '1755269984', '1757037177',
    '1757037063', '1758825127', '1758825036', '1760156472',
    '1761387060', '1761387007', '1763172513', '1763172469',
    '1764949073', '1764949060', '1766727104', '1766727112',
    '1768551535', '1768551548', '1770382580', '1770382609',
    '1771753979', '1773023416', '1773023418', '1774863174',
    '1774863184', '1776738550', '1776738647', '1778383113',
    '1778383305', '1780599951', '1780599966', '1782554392',
    '1782554400', '1783960585', '1785270823', '1785270863',
    '1787131235', '1787131345', '1789058581', '1789058581',
    '1791013007', '1791013125',
]
# Starting comment ID (paging cursor) for every episode, in the same order as
# the episode video IDs.  Each entry is its own one-element list because the
# crawl loop appends the follow-up cursors it receives to that list.
# NOTE(review): entries 54 and 55 hold the same cursor; confirm whether that
# is intentional.
start_commids = [
    [first_cursor]
    for first_cursor in (
        '6616644405906130203', '6616664859394932771',
        '6616672316945937236', '6615962119770492170',
        '6616638773667540360', '6616672659104676103',
        '6616611398839191565', '6616638804599519528',
        '6616641285764871460', '6615805231621550508',
        '6616632515422000843', '6614549204063199940',
        '6616653318569078954', '6616508559151056021',
        '6616438746806460645', '6614892737450189021',
        '6616541726632325456', '6616491007411730598',
        '6616482905998308867', '6616650046032966419',
        '6616380103377493114', '6616514002842793872',
        '6616666724605826041', '6615947305712546108',
        '6616673826644673892', '6616668964007379730',
        '6616562399103268939', '6614501476259102517',
        '6615580397840873278', '6616560843789979554',
        '6615501928859976367', '6616595094387017719',
        '6615857216740328480', '6616563955453735997',
        '6614396586108066634', '6615562549724787697',
        '6615797262117749567', '6616611457506818250',
        '6616678069939487169', '6616600135378079490',
        '6616655145532062117', '6616559226108496403',
        '6616670371548997639', '6615186649276466275',
        '6616649222480168935', '6616663931755886339',
        '6616536528585866717', '6616645511367176703',
        '6616681357352563095', '6616675089028151111',
        '6616631206095599091', '6616282701895279731',
        '6616680859552956805', '6616345117194691322',
        '6616324008295513548', '6616324008295513548',
        '6616670658119611261', '6616600879377658275',
    )
]
# Number of episodes to crawl.  Kept small on purpose so the comment server is
# not put under much load; only part of each episode's comments is fetched.
EPISODES_TO_CRAWL = 5
# Number of requests (pages) made per episode.  Every request carries
# `orinum=10`, presumably the page size, so this counts pages, not comments.
PAGES_PER_EPISODE = 10
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 10

# The episode's video ID appears both in the path (it selects whose comments
# are returned) and in the JSONP callback name.
COMMENT_URL = (
    'https://video.coral.qq.com/varticle/{vid}/comment/v2'
    '?callback=_varticle{vid}commentv2&orinum=10&oriorder=t&pageflag=1'
    '&cursor={cursor}&scorecursor=&orirepnum=2&reporder=o&reppageflag=1'
    '&source=132&_=1577498740192'
)


def _unwrap_jsonp(text):
    """Return the JSON document wrapped by a JSONP response.

    Only the outermost ``callback( ... )`` wrapper is removed, so parentheses
    that occur inside the payload (for example in comment text) are kept.
    A response that is already a bare JSON object is returned unchanged.

    Raises:
        ValueError: if *text* is neither a JSON object nor ``name(...)``.
    """
    stripped = text.strip()
    if stripped.startswith('{'):
        return stripped
    start = stripped.find('(')
    end = stripped.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('not a JSONP response: %r' % stripped[:80])
    return stripped[start + 1:end]


# Rows collected for the spreadsheet: [episode number, date, comment text].
infos = []
for k in range(min(EPISODES_TO_CRAWL, len(vids))):
    # k is the zero-based episode index.
    vid = vids[k]
    commids = start_commids[k]
    ep = k + 1
    for i in range(PAGES_PER_EPISODE):
        # commids[i] is the cursor for page i: the seed for i == 0, afterwards
        # the `last` value appended while handling the previous page.
        commid = commids[i]
        vurl = COMMENT_URL.format(vid=vid, cursor=commid)
        res_vurl = requests.get(vurl, timeout=REQUEST_TIMEOUT)
        res_vurl.raise_for_status()
        # Decode once, strip the JSONP wrapper, then parse the JSON payload.
        payload = _unwrap_jsonp(res_vurl.content.decode('utf-8'))
        data = json.loads(payload)['data']
        last_commid = data['last']
        print(last_commid)
        commids.append(last_commid)
        comments = data['oriCommList']
        if not comments:
            # Nothing on this page; asking again with the returned cursor
            # would only repeat requests, so stop paging this episode.
            break
        for comment in comments:
            # `time` is a Unix timestamp; format it as a UTC calendar date.
            timestamp = int(comment['time'])
            date = datetime.datetime.fromtimestamp(
                timestamp, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")
            content = comment['content']
            infos.append([ep, date, content])
# Write the crawled rows to an Excel workbook.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'tencent-comment'
# Cell A1 carries the show title; the data rows are appended below it.
sheet.cell(row=1, column=1, value='三生三世,十里桃花')
for row in infos:
    sheet.append(row)
# Save the workbook to disk.
wb.save('Tencent-comment.xlsx')
# TODO: find a way to collect the video IDs and the starting comment IDs in bulk instead of hard-coding them.