腾讯视频评论爬虫实战(深度长评)
-
程序文件:xajh_cp.py 结果保存:./tencent_comment/xajh_cp.txt
-
深度长评的URL地址格式为:https://video.coral.qq.com/filmreviewr/c/upcomment/[视频id]?&reqnum=3&commentid=[评论id]
-
Fiddler
-
requests-re
-
用户代理
-
《新笑傲江湖》DVD版评论
-
抓包经过简化得到的url: https://video.coral.qq.com/filmreviewr/c/upcomment/4baf2nzoljqyobl?&reqnum=3&commentid=0
-
xajh_cp.py
import requests
import re
import random
def get_html(url, params):
    """Fetch *url* with a randomly chosen browser User-Agent and return the body text.

    params: dict of query-string parameters appended by requests.
    Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
    """
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14'
    ]
    # rotate the UA per request to look less like an automated client
    headers = {"User-Agent": random.choice(uapools)}
    # timeout keeps the crawler from hanging forever on a stalled connection
    r = requests.get(url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text
def parse_page(infolist, data):
    """Extract review titles and abstracts from the raw API response text.

    Appends ``[titles, abstracts]`` for this page to *infolist* and returns
    the "last" comment id, which the caller uses as the cursor for the next
    page request.

    Raises ValueError when the response carries no "last" cursor (e.g. an
    error page), instead of the bare IndexError the old ``findall(...)[0]``
    produced.
    """
    titleall = re.findall(r'"title":"(.*?)"', data, re.S)
    commentall = re.findall(r'"abstract":"(.*?)"', data, re.S)
    last = re.search(r'"last":"(.*?)"', data)
    if last is None:
        raise ValueError('no "last" cursor in response; cannot page further')
    # the response can contain more "title" keys than reviews; keep the two
    # lists aligned by trimming titles to the number of abstracts found
    infolist.append([titleall[:len(commentall)], commentall])
    return last.group(1)
def _unescape(s):
    """Decode literal \\uXXXX escape sequences left over from regex extraction.

    Replaces the original ``eval('u' + "'" + s + "'")`` trick, which executed
    scraped text as Python source — a code-injection hazard that also crashed
    on any string containing a quote character.
    """
    return re.sub(r'\\u([0-9a-fA-F]{4})',
                  lambda m: chr(int(m.group(1), 16)), s)


def print_comment_list(infolist):
    """Print every collected page of review titles and abstracts to stdout."""
    for page_no, (titleall, commentall) in enumerate(infolist, start=1):
        print('第' + str(page_no) + '页\n')
        for title, comment in zip(titleall, commentall):
            print('=' * 30)
            print('评论标题:' + _unescape(title) + '\n')
            print('评论内容:' + _unescape(comment) + '\n')
def save_to_txt(infolist, path):
    """Write every page of review titles and abstracts to *path* as UTF-8 text.

    Creates the parent directory if needed (the original crashed when
    ``./tencent_comment/`` did not exist) and uses a ``with`` block so the
    file is closed even if writing fails.
    """
    import os

    def unescape(s):
        # decode \uXXXX escapes safely; replaces the original eval() trick,
        # which executed scraped text as Python source (injection hazard)
        return re.sub(r'\\u([0-9a-fA-F]{4})',
                      lambda m: chr(int(m.group(1), 16)), s)

    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as fw:
        for page_no, page in enumerate(infolist, start=1):
            fw.write('第' + str(page_no) + '页\n')
            titleall, commentall = page
            for i in range(len(commentall)):
                fw.write('=' * 30 + '\n')
                fw.write('评论标题:' + unescape(titleall[i]) + '\n')
                fw.write('评论内容是:' + unescape(commentall[i]) + '\n')
def main():
    """Crawl two pages of long-form reviews for one video, print them and
    save them to ./tencent_comment/xajh_cp.txt."""
    infolist = []  # page_num x 2 (titleall, commentall) x reqnum entries
    vid = '4baf2nzoljqyobl'  # video id of the target show
    next_cid = '0'           # cursor '0' asks the API for the first page
    page_num = 2
    for _ in range(page_num):
        url = 'https://video.coral.qq.com/filmreviewr/c/upcomment/' + vid + '?'
        params = {'commentid': next_cid, 'reqnum': '3'}
        html = get_html(url, params)
        # each response carries the "last" cursor for the following page
        next_cid = parse_page(infolist, html)
    print_comment_list(infolist)
    save_to_txt(infolist, './tencent_comment/xajh_cp.txt')


# guard the entry point so importing this module does not fire network requests
if __name__ == '__main__':
    main()
腾讯视频评论爬虫实战(短评)
-
程序文件:xajh_dp.py 结果保存:./tencent_comment/xajh_dp.txt
-
全部短评评论的URL地址格式为:https://video.coral.qq.com/varticle/[视频编号]/comment/v2?&orinum=[返回评论个数]&cursor=[评论标号]
-
Fiddler
-
requests-re
-
用户代理
-
《新笑傲江湖》DVD版评论
-
抓包经过简化得到的url: https://video.coral.qq.com/varticle/1001103527/comment/v2?&orinum=12&cursor=0
-
xajh_dp.py
import requests
import re
import random
def get_html(url, params):
    """Fetch *url* with a randomly chosen browser User-Agent and return the body text.

    params: dict of query-string parameters appended by requests.
    Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
    """
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14'
    ]
    # rotate the UA per request to look less like an automated client
    headers = {"User-Agent": random.choice(uapools)}
    # timeout keeps the crawler from hanging forever on a stalled connection
    r = requests.get(url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    # force utf-8: apparent_encoding mis-detects this endpoint and produced
    # mojibake, so the dead `r.encoding = r.apparent_encoding` line was dropped
    r.encoding = 'utf-8'
    return r.text
def parse_page(infolist, data):
    """Extract short-comment bodies from the raw API response text.

    Appends the list of comment strings for this page to *infolist* and
    returns the "last" comment id, used as the cursor for the next page.

    Raises ValueError when the response carries no "last" cursor, instead of
    the bare IndexError the old ``findall(...)[0]`` produced.
    """
    commentall = re.findall(r'"content":"(.*?)"', data, re.S)
    last = re.search(r'"last":"(.*?)"', data)
    if last is None:
        raise ValueError('no "last" cursor in response; cannot page further')
    infolist.append(commentall)
    return last.group(1)
def print_comment_list(infolist):
    """Print each collected page of short comments to stdout, one page at a time."""
    for page_no, comments in enumerate(infolist, start=1):
        print('第' + str(page_no) + '页\n')
        for comment in comments:
            print('评论内容:' + comment + '\n')
def save_to_txt(infolist, path):
    """Write every page of short comments to *path* as UTF-8 text.

    Creates the parent directory if needed (the original crashed when
    ``./tencent_comment/`` did not exist) and uses a ``with`` block so the
    file is closed even if writing fails.
    """
    import os
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as fw:
        for page_no, commentall in enumerate(infolist, start=1):
            fw.write('第' + str(page_no) + '页\n')
            for comment in commentall:
                fw.write('评论内容:' + comment + '\n')
def main():
    """Crawl two pages of short comments for one video, print them and save
    them to ./tencent_comment/xajh_dp.txt."""
    infolist = []
    vid = '1001103527'  # article id of the target show's comment thread
    cid = '0'           # cursor '0' asks the API for the first page
    page_num = 2
    url = 'https://video.coral.qq.com/varticle/' + vid + '/comment/v2'
    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        # each response carries the "last" cursor for the following page
        cid = parse_page(infolist, html)
    print_comment_list(infolist)
    save_to_txt(infolist, './tencent_comment/xajh_dp.txt')


# guard the entry point so importing this module does not fire network requests
if __name__ == '__main__':
    main()