此文章仅用于笔者交流学习爬虫技术,如有侵权请联系作者删除
通过fiddler找到评论的url
fiddler是一个抓包软件,这里用的fiddler4.
fiddler4官网
配置好fiddler后,在腾讯视频页面点击查看更多
返回fiddler发现捕获了数据包,
找到评论数据包,右键复制url就得到了评论的url
这里选择的是《权力的游戏第八季》
eg:
https://video.coral.qq.com/filmreviewr/c/upcomment/nilk5fd4bkqdk3a?callback=_filmreviewrcupcommentnilk5fd4bkqdk3a&reqnum=3&source=132&commentid=6523204565596297127&_=1571376131656
https://video.coral.qq.com/filmreviewr/c/upcomment/nilk5fd4bkqdk3a?callback=_filmreviewrcupcommentnilk5fd4bkqdk3a&reqnum=3&source=132&commentid=6534034501152865577&_=1571376131657
第一页的深度评论:
https://video.coral.qq.com/filmreviewr/c/upcomment/nilk5fd4bkqdk3a?callback=filmreviewrcupcommentnilk5fd4bkqdk3a&reqnum=3&source=132&_=1571378209126
第二页的深度评论:
https://video.coral.qq.com/filmreviewr/c/upcomment/nilk5fd4bkqdk3a?callback=filmreviewrcupcommentnilk5fd4bkqdk3a&reqnum=3&source=132&commentid=6529339639861532449&_=1571378209129
第三页:
https://video.coral.qq.com/filmreviewr/c/upcomment/nilk5fd4bkqdk3a?callback=filmreviewrcupcommentnilk5fd4bkqdk3a&reqnum=3&source=132&commentid=6529032690464222964&_=1571378209130
分析url找到加载评论信息的规律
通过老师提示和尝试发现:
·callback、source等参数并不重要
·决定加载哪个视频的评论的参数是视频ID(vid)
·对加载评论实现“翻页”的必要参数是commentid=
·决定单次加载个数的参数是reqnum=
最后构建的url格式:
https://video.coral.qq.com/filmreviewr/c/upcomment/ (视频ID) ?&reqnum=3&source=132&commentid= (评论ID) &_=1571376131656
打开构建的url发现在评论的json文件中有first和last字段,其中前一页评论的last字段刚好是后一页评论url中的commentid参数的值
可以根据这个实现翻页爬取评论
爬取深度评论代码(不包含子评论)
# encoding = "utf-8"
# Author = Joker-Face
# Date = 2019年10月18日
import json
import re
import urllib
import urllib.request
# Build browser-like request headers and install them on a global opener so
# that every urllib.request.urlopen() call in this script sends them.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0',
           'Content-Type':'application/javascript',}
opener = urllib.request.build_opener()  # opener instance that will carry the headers
headall = list(headers.items())         # addheaders expects a list of (key, value) tuples
opener.addheaders = headall
urllib.request.install_opener(opener)   # install as the process-wide default opener
# ---- Deep-comment crawler (script version, no sub-comments) ----
vedioId = 'nilk5fd4bkqdk3a'  # film id (Game of Thrones season 8)
commentId = '6529339639861532449'  # commentid of the first page to fetch
#reqnum = 3 # comments fetched per request
# The URL below keeps reqnum fixed at the default value of 3.

def _unescape(raw):
    """Decode JSON-style \\uXXXX escapes in *raw* without using eval().

    SECURITY FIX: the original ran eval('u"' + raw + '"') on text downloaded
    from the network, which lets a crafted comment execute arbitrary code.
    """
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        # Malformed escape sequence -- fall back to the raw text.
        return raw

loadPath = "C://spider//DeepCommentOfquanyou.txt"
with open(loadPath, "w", encoding="utf-8") as f:
    for thisPage in range(10):
        try:
            print("当前第" + str(thisPage) + "页")
            thisurl = 'https://video.coral.qq.com/filmreviewr/c/upcomment/' + vedioId + '?&reqnum=3&source=132&commentid=' + commentId + '&_=1571376131656'
            data = urllib.request.urlopen(thisurl, timeout=3).read().decode("utf-8", "ignore")
            allTitle = re.compile('"title":"(.*?)"', re.S).findall(data)
            allContent = re.compile('"content":"(.*?)"', re.S).findall(data)
            # The "last" field of this page is the commentid of the next page.
            commentId = re.compile('"last":"(.*?)"', re.S).findall(data)[0]
            for index in range(len(allTitle)):
                print("第" + str(index) + "条深度评论标题:\n" + _unescape(allTitle[index]) + "\n")
                print("第" + str(index) + "条深度评论内容:\n" + _unescape(allContent[index]) + "\n")
                print("----------------------------------------------------------------\n")
                f.write("第" + str(index) + "条深度评论标题:\n" + _unescape(allTitle[index]) + "\n")
                f.write("第" + str(index) + "条深度评论内容:\n" + _unescape(allContent[index]) + "\n")
                f.write("--------------------------------------------------------------\n")
        except Exception as err:
            print(err)
爬取短评(不包含子评论)
url的分析过程和之前类似
规律如下:
·所选影片的id=
·评论cursor=
·单次加载个数orinum=
构建url
url = "https://video.coral.qq.com/varticle/" + varticleId + "/comment/v2?callback=varticle3840581279commentv2&orinum=10&oriorder=o&pageflag=1&cursor=" + cursor + "&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1571382463064"
爬取短评代码(不包含子评论)
此代码接上边爬取精评,请放入同一个.py文件运行
# ---- Short-comment crawler (script version, no sub-comments) ----
varticleId = '3840581279'  # article id of the film
cursor = '6523373185439400099'  # cursor of the first page to fetch

def _unescape(raw):
    """Decode JSON-style \\uXXXX escapes in *raw* without using eval().

    SECURITY FIX: the original ran eval('u"' + raw + '"') on text downloaded
    from the network, which lets a crafted comment execute arbitrary code.
    """
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw

loadPath = "C://spider//ShortCmtOfquanyou.txt"
with open(loadPath, 'w', encoding="utf-8") as f:
    count = 1
    for thisPage in range(10):
        try:
            thisurl = 'https://video.coral.qq.com/varticle/' + varticleId + '/comment/v2?&orinum=10&oriorder=o&pageflag=1&cursor=' + cursor
            data = urllib.request.urlopen(thisurl, timeout=3).read().decode('utf-8', "ignore")
            print(thisurl)
            allContent = re.compile('"content":"(.*?)"', re.S).findall(data)
            # The "last" field of this page is the cursor of the next page.
            cursor = re.compile('"last":"(.*?)"', re.S).findall(data)[0]
            for index in range(len(allContent)):
                print("第" + str(count) + "评论:\n")
                print(_unescape(allContent[index]))
                print("=====================================================================================")
                f.write("第" + str(count) + "评论:\n")
                f.write(_unescape(allContent[index]))
                f.write("\n=====================================================================================\n")
                count = count + 1
        except Exception as err:
            print(err)
# BUG FIX: count starts at 1, so after N comments it holds N + 1;
# report count - 1 to avoid over-counting by one.
print("共计录入" + str(count - 1) + "短评")
面向对象式编程实现爬虫
class spider:
    """Object-oriented crawler for Tencent Video film reviews and short comments.

    NOTE(review): the example caller passes the first deep-comment page id as
    ``varticalId`` and the short-comment article id as ``commentId`` -- the
    attribute names do not match their actual meaning.  They are kept as-is
    for backward compatibility; confirm against all callers before renaming.
    """

    def __init__(self, vedioId, varticalId, commentId, cursor):
        self.vedioId = vedioId        # video id used in the deep-comment URL
        self.varticalId = varticalId  # commentid of the first deep-comment page
        self.commentId = commentId    # article id used in the short-comment URL
        self.cursor = cursor          # cursor of the first short-comment page
        # Install a global opener that sends browser-like headers with every request.
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0',
                   'Content-Type':'application/javascript',}
        opener = urllib.request.build_opener()
        opener.addheaders = list(headers.items())
        urllib.request.install_opener(opener)

    @staticmethod
    def _unescape(raw):
        """Decode JSON-style \\uXXXX escapes without eval().

        SECURITY FIX: the original ran eval('u"' + raw + '"') on text
        downloaded from the network, which can execute arbitrary code.
        """
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw

    def getDeepComment(self, localPath):
        """Crawl up to 10 pages of deep comments and write them to localPath."""
        vid = self.vedioId
        cid = self.varticalId
        with open(localPath, "w", encoding="utf-8") as f:
            for thisPage in range(10):
                try:
                    print("当前第" + str(thisPage) + "页")
                    thisurl = 'https://video.coral.qq.com/filmreviewr/c/upcomment/' + vid + '?&reqnum=3&source=132&commentid=' + cid + '&_=1571376131656'
                    data = urllib.request.urlopen(thisurl, timeout=3).read().decode("utf-8", "ignore")
                    allTitle = re.compile('"title":"(.*?)"', re.S).findall(data)
                    allContent = re.compile('"content":"(.*?)"', re.S).findall(data)
                    # BUG FIX: advance the page cursor actually used in the URL.
                    # The original stored "last" into an unused variable, so every
                    # iteration refetched the same first page.
                    cid = re.compile('"last":"(.*?)"', re.S).findall(data)[0]
                    for index in range(len(allTitle)):
                        print("第" + str(index) + "条深度评论标题:\n" + self._unescape(allTitle[index]) + "\n")
                        print("第" + str(index) + "条深度评论内容:\n" + self._unescape(allContent[index]) + "\n")
                        print("----------------------------------------------------------------\n")
                        f.write("第" + str(index) + "条深度评论标题:\n" + self._unescape(allTitle[index]) + "\n")
                        f.write("第" + str(index) + "条深度评论内容:\n" + self._unescape(allContent[index]) + "\n")
                        f.write("--------------------------------------------------------------\n")
                except Exception as err:
                    print(err)

    def getShortCmt(self, localPath):
        """Crawl up to 10 pages of short comments and write them to localPath."""
        vid = self.commentId  # article id (see class NOTE about swapped names)
        cid = self.cursor
        # BUG FIX: honor the localPath argument; the original ignored it and
        # always wrote to a hard-coded path.
        with open(localPath, 'w', encoding="utf-8") as f:
            count = 1
            for thisPage in range(10):
                try:
                    thisurl = 'https://video.coral.qq.com/varticle/' + vid + '/comment/v2?&orinum=10&oriorder=o&pageflag=1&cursor=' + cid
                    data = urllib.request.urlopen(thisurl, timeout=3).read().decode('utf-8', "ignore")
                    print(thisurl)
                    allContent = re.compile('"content":"(.*?)"', re.S).findall(data)
                    # BUG FIX: advance the cursor used in the URL (the original
                    # stored it in an unused local, refetching the same page).
                    cid = re.compile('"last":"(.*?)"', re.S).findall(data)[0]
                    for index in range(len(allContent)):
                        print("第" + str(count) + "评论:\n")
                        print(self._unescape(allContent[index]))
                        print("=====================================================================================")
                        f.write("第" + str(count) + "评论:\n")
                        f.write(self._unescape(allContent[index]))
                        f.write("\n=====================================================================================\n")
                        count = count + 1
                except Exception as err:
                    print(err)
            # BUG FIX: count ends one past the number of comments written.
            print("共计录入" + str(count - 1) + "短评")
if __name__ == '__main__':
    # Demo run: crawl the deep comments of Game of Thrones season 8.
    tencent_spider = spider('nilk5fd4bkqdk3a',      # video id
                            '6529339639861532449',  # first deep-comment page id
                            '3840581279',           # short-comment article id
                            '6523373185439400099')  # first short-comment cursor
    tencent_spider.getDeepComment("C://spider//1.txt")
运行结果: