腾讯视频评论爬虫实战

腾讯视频评论爬虫实战(深度长评)

  • 程序文件:xajh_cp.py 结果保存:./tencent_comment/xajh_cp.txt

  • 深度长评的URL地址格式为:https://video.coral.qq.com/filmreviewr/c/upcomment/[视频id]?&reqnum=3&commentid=[评论id]

  • Fiddler(抓包工具)

  • requests-re

  • 用户代理

  • 《新笑傲江湖》DVD版评论

  • 抓包经过简化得到的url: https://video.coral.qq.com/filmreviewr/c/upcomment/4baf2nzoljqyobl?&reqnum=3&commentid=0

  • xajh_cp.py

import requests
import re
import random

def get_html(url,params):
    """Fetch ``url`` with a randomly chosen User-Agent and return the body text.

    Args:
        url: Address to request.
        params: Query-string parameters, forwarded to ``requests.get``.

    Returns:
        The response body as text, decoded with the encoding that
        ``requests`` detects from the content (``apparent_encoding``).

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.Timeout: If the server does not answer within the timeout.
    """
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
    ]

    # Rotate the User-Agent so consecutive requests do not look identical.
    headers = {"User-Agent": random.choice(uapools)}
    # requests never times out by default; without this a stalled connection
    # would block the scraper forever.
    r = requests.get(url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text
    

def parse_page(infolist,data):
    """Extract review titles, abstracts and the next-page cursor from ``data``.

    Appends one ``[titles, abstracts]`` pair to ``infolist``; the title list
    is cut to the number of abstracts found. The captured strings are left
    exactly as they appear in the response (escape sequences are not decoded).

    Args:
        infolist: List that receives the ``[titles, abstracts]`` pair.
        data: Raw response text containing ``"title"``, ``"abstract"`` and
            ``"last"`` string fields.

    Returns:
        The value of the first ``"last"`` field, used as the next cursor.

    Raises:
        IndexError: If ``data`` contains no ``"last"`` field. ``infolist`` is
            left untouched in that case.
    """
    # Capture a quoted string up to the first *unescaped* quote. The previous
    # lazy '(.*?)"' stopped at an escaped quote (backslash + quote) and
    # silently truncated any title/abstract that contained one.
    title_re = re.compile(r'"title":"((?:[^"\\]|\\.)*)"', re.S)
    abstract_re = re.compile(r'"abstract":"((?:[^"\\]|\\.)*)"', re.S)

    cursors = re.findall('"last":"(.*?)"', data)
    if not cursors:
        # Same exception type as the former bare [0] lookup, but with a
        # message that says what is actually wrong.
        raise IndexError('no "last" cursor found in the response')

    titles = title_re.findall(data)
    abstracts = abstract_re.findall(data)
    infolist.append([titles[:len(abstracts)], abstracts])

    return cursors[0]

def print_comment_list(infolist):
    """Print every collected review (title and body), grouped by page.

    Titles and bodies are expected to be raw JSON string contents, so their
    escape sequences (unicode escapes, escaped quotes and slashes) are decoded
    before printing.

    Args:
        infolist: List of pages; each page is ``[titles, comments]`` where
            both items are lists of raw strings. If a page has fewer titles
            than comments, the missing titles are printed as empty strings.
    """
    import json  # local import so this function does not depend on file-level imports

    def decode(raw):
        # SECURITY: this text comes from the network. It used to be passed to
        # eval(), which executes arbitrary code and also raised SyntaxError
        # for any text containing a single quote. Decode it as a JSON string
        # instead; fall back to the raw text when it is not valid JSON.
        try:
            return json.loads('"' + raw + '"', strict=False)
        except ValueError:
            return raw

    for page_no, page in enumerate(infolist, start=1):
        print('第' + str(page_no) + '页\n')
        titles = page[0]
        comments = page[1]
        for idx, comment in enumerate(comments):
            title = titles[idx] if idx < len(titles) else ''
            print('=' * 30)
            print('评论标题:' + decode(title) + '\n')
            print('评论内容:' + decode(comment) + '\n')
    
    
def save_to_txt(infolist,path):
    """Write every collected review (title and body) to a UTF-8 text file.

    Titles and bodies are expected to be raw JSON string contents, so their
    escape sequences (unicode escapes, escaped quotes and slashes) are decoded
    before writing. Any existing file at ``path`` is overwritten, and the
    parent directory is created if it does not exist.

    Args:
        infolist: List of pages; each page is ``[titles, comments]`` where
            both items are lists of raw strings. If a page has fewer titles
            than comments, the missing titles are written as empty strings.
        path: Destination file path.
    """
    import json  # local imports so this function does not depend on file-level imports
    import os

    def decode(raw):
        # SECURITY: this text comes from the network. It used to be passed to
        # eval(), which executes arbitrary code and also raised SyntaxError
        # for any text containing a single quote. Decode it as a JSON string
        # instead; fall back to the raw text when it is not valid JSON.
        try:
            return json.loads('"' + raw + '"', strict=False)
        except ValueError:
            return raw

    # open() does not create missing directories, so make sure the target
    # folder exists before writing.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The context manager closes the file even if a write fails midway.
    with open(path, 'w+', encoding='utf-8') as fw:
        for page_no, page in enumerate(infolist, start=1):
            fw.write('第' + str(page_no) + '页\n')
            titles = page[0]
            comments = page[1]
            for idx, comment in enumerate(comments):
                title = titles[idx] if idx < len(titles) else ''
                fw.write('=' * 30 + '\n')
                fw.write('评论标题:' + decode(title) + '\n')
                fw.write('评论内容是:' + decode(comment) + '\n')
def main():
    """Fetch two pages of long reviews, then print them and save them to disk."""
    video_id = '4baf2nzoljqyobl'
    cursor = '0'            # comment id that the next request starts from
    pages_to_fetch = 2
    collected = []          # one entry per fetched page, filled in by parse_page

    endpoint = 'https://video.coral.qq.com/filmreviewr/c/upcomment/' + video_id + '?'

    fetched = 0
    while fetched < pages_to_fetch:
        query = {'commentid': cursor, 'reqnum': '3'}
        page_text = get_html(endpoint, query)
        # parse_page stores the page in `collected` and hands back the cursor
        # for the following request.
        cursor = parse_page(collected, page_text)
        fetched += 1

    print_comment_list(collected)
    save_to_txt(collected, './tencent_comment/xajh_cp.txt')
        
# Script entry point: runs the long-review scraper as soon as the file is executed.
main()

腾讯视频评论爬虫实战(短评)

  • 程序文件:xajh_dp.py 结果保存:./tencent_comment/xajh_dp.txt

  • 全部短评评论的URL地址格式为:https://video.coral.qq.com/varticle/[视频编号]/comment/v2?&orinum=[返回评论个数]&cursor=[评论标号]

  • Fiddler(抓包工具)

  • requests-re

  • 用户代理

  • 《新笑傲江湖》DVD版评论

  • 抓包经过简化得到的url: https://video.coral.qq.com/varticle/1001103527/comment/v2?&orinum=12&cursor=0

  • xajh_dp.py


import requests
import re
import random

def get_html(url,params):
    """Fetch ``url`` with a randomly chosen User-Agent and return the body as UTF-8 text.

    Args:
        url: Address to request.
        params: Query-string parameters, forwarded to ``requests.get``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.Timeout: If the server does not answer within the timeout.
    """
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
    ]

    # Rotate the User-Agent so consecutive requests do not look identical.
    headers = {"User-Agent": random.choice(uapools)}
    # requests never times out by default; without this a stalled connection
    # would block the scraper forever.
    r = requests.get(url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    # Force UTF-8: without it the text comes out garbled. The former
    # `r.encoding = r.apparent_encoding` line was dropped because its result
    # was overwritten immediately, so the content sniffing was wasted work.
    r.encoding = 'utf-8'
    return r.text
    

def parse_page(infolist,data):
    """Extract the short comments and the next-page cursor from ``data``.

    Appends the list of comment texts found in ``data`` to ``infolist``. The
    captured strings are left exactly as they appear in the response (escape
    sequences are not decoded).

    Args:
        infolist: List that receives one list of comment strings per call.
        data: Raw response text containing ``"content"`` and ``"last"``
            string fields.

    Returns:
        The value of the first ``"last"`` field, used as the next cursor.

    Raises:
        IndexError: If ``data`` contains no ``"last"`` field. ``infolist`` is
            left untouched in that case.
    """
    # Capture a quoted string up to the first *unescaped* quote. The previous
    # lazy '(.*?)"' stopped at an escaped quote (backslash + quote) and
    # silently truncated any comment that contained one.
    content_re = re.compile(r'"content":"((?:[^"\\]|\\.)*)"', re.S)

    cursors = re.findall('"last":"(.*?)"', data)
    if not cursors:
        # Same exception type as the former bare [0] lookup, but with a
        # message that says what is actually wrong.
        raise IndexError('no "last" cursor found in the response')

    infolist.append(content_re.findall(data))

    return cursors[0]

def print_comment_list(infolist):
    """Print all collected short comments, each page under its own heading.

    Args:
        infolist: List of pages; each page is a list of comment strings.
    """
    for page_no, comments in enumerate(infolist, start=1):
        heading = '第' + str(page_no) + '页\n'
        print(heading)
        for text in comments:
            print('评论内容:' + text + '\n')
    
    
def save_to_txt(infolist,path):
    """Write all collected short comments to a UTF-8 text file.

    Any existing file at ``path`` is overwritten, and the parent directory is
    created if it does not exist.

    Args:
        infolist: List of pages; each page is a list of comment strings.
        path: Destination file path.
    """
    import os  # local import so this function does not depend on file-level imports

    # open() does not create missing directories, so make sure the target
    # folder exists before writing.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The context manager closes the file even if a write fails midway.
    with open(path, 'w+', encoding='utf-8') as fw:
        for page_no, comments in enumerate(infolist, start=1):
            fw.write('第' + str(page_no) + '页\n')
            for text in comments:
                fw.write('评论内容:' + text + '\n')
    
def main():
    """Fetch two pages of short comments, then print them and save them to disk."""
    video_id = '1001103527'
    cursor = "0"            # cursor value that the next request starts from
    pages_to_fetch = 2
    collected = []          # one entry per fetched page, filled in by parse_page

    endpoint = 'https://video.coral.qq.com/varticle/' + video_id + '/comment/v2'

    fetched = 0
    while fetched < pages_to_fetch:
        query = {'orinum': '10', 'cursor': cursor}
        page_text = get_html(endpoint, query)
        # parse_page stores the page in `collected` and hands back the cursor
        # for the following request.
        cursor = parse_page(collected, page_text)
        fetched += 1

    print_comment_list(collected)
    save_to_txt(collected, './tencent_comment/xajh_dp.txt')
    
# Script entry point: runs the short-comment scraper as soon as the file is executed.
main()
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值