# 应用技术:浏览器伪装,抓包分析,正则提取
import urllib.request
import re
import random
# Comment API endpoint template (discovered via Fiddler packet capture):
# https://video.coral.qq.com/varticle/[video id]/comment/v2?callback=_varticle[video id]commentv2&orinum=[count]&oriorder=o&pageflag=1&cursor=[start cursor]
vid = "4563082514"           # video article id
cid = "6614739038739398410"  # pagination cursor (id of the last comment seen)
num = "10"                   # comments requested per page
url = ("https://video.coral.qq.com/varticle/" + vid
       + "/comment/v2?callback=_varticle" + vid
       + "commentv2&orinum=" + num
       + "&oriorder=o&pageflag=1&cursor=" + cid)
# Pool of User-Agent strings for browser disguise; one is picked at random
# per request.  (Name kept as-is: referenced elsewhere in this file.)
urlpoors = [
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
]
def ua(urlpools):
    """Pick a random User-Agent from *urlpools* and install it globally.

    Builds a urllib opener whose requests carry the chosen ``User-Agent``
    header (browser disguise) and installs it, so every subsequent
    ``urllib.request.urlopen`` call in this process uses it.

    Returns the chosen User-Agent string (the original returned ``None``;
    returning it is additive and backward-compatible — callers that
    ignore the result are unaffected).
    """
    # BUG FIX: the original body ignored the `urlpools` parameter and read
    # the module-level `urlpoors` list instead; use the argument as intended.
    thisua = random.choice(urlpools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    return thisua
commentpat='"content":(.*?)"up"'
for j in range(0,5):
print("第"+str(j)+"页")
ua(urlpoors)
url="https://video.coral.qq.com/varticle/"+vid+"/comment/v2?callback=_varticle"+vid+"commentv2&orinum="+num+"&oriorder=o&pageflag=1&cursor="+cid
data=urllib.request.urlopen(url).read().decode("utf-8")
commentall=re.compile(commentpat).findall(data)
lastpat='"last":"(.*?)"'
cid=re.compile(lastpat,re.S).findall(data)[0]
print(cid)
for i in range(0,len(commentall)):
try:
thiscomment=commentall[i]
print(thiscomment)
except Exception as err:
print(err)
# 这里的代码和之前相比并无明显差别,但实际操作过程中需要先用 Fiddler 做代理服务器进行抓包分析,
# 分析出隐藏评论数据的 js 文件,从而提取出 url 地址。
# 提取过程中为避免其他网络活动干扰,可以灵活运用 Fiddler 的 Clear 功能进行清除。
# 关于 Fiddler 的配置问题可参见下方博客。