爬取腾讯视频评论——以《三生三世，十里桃花》为例

最新推荐文章于 2021-04-21 17:57:04 发布

谨守

最新推荐文章于 2021-04-21 17:57:04 发布

阅读量487

点赞数

分类专栏： Python 文章标签： python json excel

本文链接：https://blog.csdn.net/weixin_44091801/article/details/103749328

版权

Python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

#@kaiyiching
import requests
import re
import json
import io
import sys
import datetime
import time,random
import csv
import openpyxl 

# Make stdout encode its output as UTF-8.
# reconfigure() switches the encoding of the existing stream in place and
# keeps its other settings (such as line buffering). Wrapping
# sys.stdout.buffer in a brand-new TextIOWrapper instead creates a second
# writer on the same buffer with line buffering turned off, so it is only
# used as a fallback when reconfigure() is not available.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Video ID of every episode, in episode order. The crawl loop pairs
# vids[i] with start_commids[i], so both lists must stay aligned.
# NOTE(review): positions 55 and 56 hold the same ID ('1789058581'); this
# looks like a copy-paste slip -- confirm the real ID for episode 56.
vids = [
    '1743283224', '1743283291', '1744194068', '1744194109',
    '1745133738', '1745133746', '1746125178', '1746125211',
    '1747460419', '1747460409', '1748886128', '1750117919',
    '1750117958', '1751794201', '1751794196', '1753505550',
    '1753505443', '1755270031', '1755269984', '1757037177',
    '1757037063', '1758825127', '1758825036', '1760156472',
    '1761387060', '1761387007', '1763172513', '1763172469',
    '1764949073', '1764949060', '1766727104', '1766727112',
    '1768551535', '1768551548', '1770382580', '1770382609',
    '1771753979', '1773023416', '1773023418', '1774863174',
    '1774863184', '1776738550', '1776738647', '1778383113',
    '1778383305', '1780599951', '1780599966', '1782554392',
    '1782554400', '1783960585', '1785270823', '1785270863',
    '1787131235', '1787131345', '1789058581', '1789058581',
    '1791013007', '1791013125',
]

# Starting comment cursor for every episode, in the same order as vids.
# Each entry is its own one-element list because the crawl loop appends
# the follow-up cursors it receives to that episode's list.
# NOTE(review): positions 55 and 56 hold the same cursor
# ('6616324008295513548'); this mirrors the duplicated video ID in vids
# and looks like a copy-paste slip -- confirm the value for episode 56.
start_commids = [
    [first_cursor]
    for first_cursor in (
        '6616644405906130203', '6616664859394932771',
        '6616672316945937236', '6615962119770492170',
        '6616638773667540360', '6616672659104676103',
        '6616611398839191565', '6616638804599519528',
        '6616641285764871460', '6615805231621550508',
        '6616632515422000843', '6614549204063199940',
        '6616653318569078954', '6616508559151056021',
        '6616438746806460645', '6614892737450189021',
        '6616541726632325456', '6616491007411730598',
        '6616482905998308867', '6616650046032966419',
        '6616380103377493114', '6616514002842793872',
        '6616666724605826041', '6615947305712546108',
        '6616673826644673892', '6616668964007379730',
        '6616562399103268939', '6614501476259102517',
        '6615580397840873278', '6616560843789979554',
        '6615501928859976367', '6616595094387017719',
        '6615857216740328480', '6616563955453735997',
        '6614396586108066634', '6615562549724787697',
        '6615797262117749567', '6616611457506818250',
        '6616678069939487169', '6616600135378079490',
        '6616655145532062117', '6616559226108496403',
        '6616670371548997639', '6615186649276466275',
        '6616649222480168935', '6616663931755886339',
        '6616536528585866717', '6616645511367176703',
        '6616681357352563095', '6616675089028151111',
        '6616631206095599091', '6616282701895279731',
        '6616680859552956805', '6616345117194691322',
        '6616324008295513548', '6616324008295513548',
        '6616670658119611261', '6616600879377658275',
    )
]

# How many episodes to crawl. Kept small on purpose so the comment server
# is not put under heavy load.
EPISODE_LIMIT = 5
# How many pages to request per episode. Every request carries orinum=10
# (presumably the page size -- confirm against the API), so this counts
# requests, not individual comments.
PAGES_PER_EPISODE = 10
# Seconds to wait on a single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Comment endpoint. {vid} is used both in the path and in the JSONP callback
# name so that each episode queries its own comment thread; {cursor} is the
# comment ID the page starts from.
COMMENT_URL = (
    'https://video.coral.qq.com/varticle/{vid}/comment/v2'
    '?callback=_varticle{vid}commentv2&orinum=10&oriorder=t&pageflag=1'
    '&cursor={cursor}&scorecursor=&orirepnum=2&reporder=o&reppageflag=1'
    '&source=132&_=1577498740192'
)

# A JSONP body looks like `callbackName( ...json... )`; group 1 is the JSON.
_JSONP_ENVELOPE = re.compile(r'^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$', re.DOTALL)


def _unwrap_jsonp(text):
    """Return the JSON payload carried inside a JSONP ``callback(...)`` body.

    Only the callback name and its outermost pair of parentheses are
    removed, so parentheses inside the payload (for example in comment
    text) are preserved. Text that is not a JSONP envelope is returned
    unchanged.
    """
    envelope = _JSONP_ENVELOPE.match(text)
    return envelope.group(1) if envelope else text


# One [episode number, 'YYYY-MM-DD' date, comment text] row per comment.
infos = []

for episode, (vid, seeds) in enumerate(
        zip(vids[:EPISODE_LIMIT], start_commids[:EPISODE_LIMIT]), start=1):
    cursor = seeds[0]

    for _ in range(PAGES_PER_EPISODE):
        response = requests.get(
            COMMENT_URL.format(vid=vid, cursor=cursor),
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()

        body = response.content.decode('utf-8')
        data = json.loads(_unwrap_jsonp(body))['data']

        # 'last' is the cursor the next request has to start from.
        next_cursor = data['last']
        print(next_cursor)

        comments = data['oriCommList']
        for comment in comments:
            # 'time' is a Unix timestamp; the date is rendered in UTC.
            posted_at = datetime.datetime.fromtimestamp(
                int(comment['time']), tz=datetime.timezone.utc)
            infos.append(
                [episode, posted_at.strftime("%Y-%m-%d"), comment['content']])

        # Stop paging once the server has nothing new to offer; asking again
        # with the same cursor would only repeat the same page.
        if not comments or not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor


# Write the crawled rows to an Excel workbook: the show title goes into
# cell A1 and every collected row is appended underneath it.
workbook = openpyxl.Workbook()
worksheet = workbook.active
worksheet.title = 'tencent-comment'
worksheet.cell(row=1, column=1, value='三生三世，十里桃花')
for row in infos:
    worksheet.append(row)
# Save the workbook to disk.
workbook.save('Tencent-comment.xlsx')

至于怎么批量获取视频的 ID 和评论的 ID，我之后再想想办法。

谨守

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
爬取腾讯视频评论——以《三生三世，十里桃花》为例

#@kaiyiching import requests; import re; import json; import io; import sys; import datetime; import time, random; import csv; import openpyxl; sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'...
复制链接

扫一扫

专栏目录