动态爬取腾讯视频中斗罗大陆1动漫的评论
1 明确爬取的网站
目标数据:斗罗大陆1短评信息
数据位置:非静态网页源码目标数据标签位置(js文件)
选取相应URL(网址)
https://coral.qq.com/article/7633748161/comment/v2?callback=_article7633748161commentv2&orinum=10&oriorder=o&pageflag=1&cursor=0&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=1&_=1642400175443
"page":"1642400175443"
"last":"6887938427630994833"
2 请求网页
通过python写的爬虫伪装成浏览器,向服务器发送请求,并获得响应。响应就是该网页的源代码:静态网页可以在网页中右击选择"查看网页源代码"看到,但是动态网页的数据是封装在JS文件中的,需要找到相应的JS文件。
import random,re
from requests_html import HTMLSession, HTML, AsyncHTMLSession
class tengxunTest:
    """Scraper for Tencent Video comments served by the coral.qq.com JSONP API."""

    def __init__(self, url):
        """Remember the target URL, create sessions, and pick a random User-Agent.

        :param url: full comment-API URL that get_response() will request
        """
        self.start_url = url
        self.session = HTMLSession()        # synchronous requests_html session
        self.aSession = AsyncHTMLSession()  # async session (currently unused)
        users = {  # User-Agent pool so successive requests are not identical
            1: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
            2: 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            3: 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            4: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            5: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
        }
        # BUG FIX: the original did `"users" + str(random.sample(users.keys(), 1))`,
        # which produced the literal string "users[n]" instead of a User-Agent.
        # Pick a random key and look up the corresponding UA string instead.
        self.headers = users[random.choice(list(users))]

    def get_response(self):
        """GET self.start_url and return the requests_html HTML object."""
        res = self.session.get(url=self.start_url, headers={'user-agent': self.headers})
        print("网页状态", res)
        # NOTE(review): render() launches Chromium to execute page JS, but this
        # endpoint returns JSONP text with no scripts, so it is likely pure
        # overhead — kept to preserve the original behavior; confirm and drop.
        res.html.render()
        return res.html
3 数据解析
循环获取评论数据
循环更新网页
利用re进行模式匹配,找到相应的数据放入列表
利用循环将列表中的数据进行保存到txt文件中
#从源码中获取评论的数据
def get_comment_user(self, html):
    """Extract every commenter id ("userid" field) from the raw JSONP text.

    :param html: response body as a string
    :return: list of userid strings (possibly empty)
    """
    return re.findall(r'"userid":"(.*?)"', html, re.S)
def get_comment(self, html):
    """Extract every comment body ("content" field) from the raw JSONP text.

    :param html: response body as a string
    :return: list of comment strings (possibly empty)
    """
    return re.findall(r'"content":"(.*?)"', html, re.S)
#从源码中获取下一轮刷新页的ID
def get_lastId(self, html):
    """Return the pagination cursor (first "last" field) from the JSONP text.

    :param html: response body as a string
    :return: the cursor string; raises IndexError if no "last" field exists
    """
    cursors = re.findall(r'"last":"(.*?)"', html, re.S)
    return cursors[0]
if __name__ == '__main__':
    # Cache-busting timestamp appended as the "_" query parameter.
    page = 1642400175443
    # Cursor of the first comment page; updated from each response's "last" field.
    lastId = "6887938427630994833"
    for i in range(1, 11):  # fetch 10 pages of comments
        url = "https://coral.qq.com/article/7633748161/comment/v2?callback=_article7633748161commentv2&orinum=10&oriorder=o&pageflag=1&cursor=" + lastId + "&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=1&_=" + str(page)
        test = tengxunTest(url=url)
        html = test.get_response().html  # raw JSONP text of the response
        commentid_list = test.get_comment_user(html)  # commenter ids
        commentlist = test.get_comment(html)          # comment bodies
        print("------第" + str(i) + "轮页面评论------")
        # BUG FIX: the original loop used range(1, len(commentlist)) and thus
        # silently skipped the first comment of every page. Pair ids with
        # comments and number them from 1 instead.
        for j, (uid, comment) in enumerate(zip(commentid_list, commentlist), start=1):
            print("第" + str(j) + "条评论者id:" + str(uid))
            print("第" + str(j) + "条评论:" + str(comment))
        # Save each comment to its own file: comment_<page>_<index>.txt
        for index, comment in enumerate(commentlist):
            with open("comment_" + str(i) + "_" + str(index) + ".txt", "w", encoding='utf-8') as f:
                f.write(str(comment))
        lastId = test.get_lastId(html)  # cursor for the next page
        page += 1  # bump the cache-buster
4 数据保存
利用“with open”方式将爬取到的数据存入txt文件中
for index, comment in enumerate(commentlist):
with open(r"comment_" + str(i) + "_" + str(index) + ".txt", "w", encoding='utf-8') as f:
f.write(str(comment))
5 总结
好了,今天的内容到这里了。我们动态获取腾讯视频斗罗大陆1的评论,这相对于前两篇较为偏难,需要大家花一些时间去理解。
这也是Requests-html框架的最后一个练手项目了,后面将把它放一放了,不懂的欢迎来访!
代码封装如下:
# -*- coding: utf-8 -*-
# @Author : KongDeXing
# @Time : 2022/1/17 14:11
import random,re
from requests_html import HTMLSession, HTML, AsyncHTMLSession
class tengxunTest:
    """Scraper for Tencent Video comments served by the coral.qq.com JSONP API.

    The endpoint returns JSONP text rather than an HTML document, so the
    comment fields are pulled out with regular expressions, not DOM parsing.
    """

    def __init__(self, url):
        """Remember the target URL, create sessions, and pick a random User-Agent.

        :param url: full comment-API URL that get_response() will request
        """
        self.start_url = url
        self.session = HTMLSession()        # synchronous requests_html session
        self.aSession = AsyncHTMLSession()  # async session (currently unused)
        users = {  # User-Agent pool so successive requests are not identical
            1: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
            2: 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            3: 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            4: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            5: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
        }
        # BUG FIX: the original did `"users" + str(random.sample(users.keys(), 1))`,
        # which produced the literal string "users[n]" instead of a User-Agent.
        # Pick a random key and look up the corresponding UA string instead.
        self.headers = users[random.choice(list(users))]

    def get_response(self):
        """GET self.start_url and return the requests_html HTML object."""
        res = self.session.get(url=self.start_url, headers={'user-agent': self.headers})
        print("网页状态", res)
        # NOTE(review): render() launches Chromium to execute page JS, but this
        # endpoint returns JSONP text with no scripts, so it is likely pure
        # overhead — kept to preserve the original behavior; confirm and drop.
        res.html.render()
        return res.html

    def get_comment_user(self, html):
        """Return every commenter id ("userid" field) found in the JSONP text.

        :param html: response body as a string
        :return: list of userid strings (possibly empty)
        """
        return re.compile('"userid":"(.*?)"', re.S).findall(html)

    def get_comment(self, html):
        """Return every comment body ("content" field) found in the JSONP text.

        :param html: response body as a string
        :return: list of comment strings (possibly empty)
        """
        return re.compile('"content":"(.*?)"', re.S).findall(html)

    def get_lastId(self, html):
        """Return the pagination cursor (first "last" field) for the next page.

        :param html: response body as a string
        :return: cursor string; raises IndexError if no "last" field exists
        """
        return re.compile('"last":"(.*?)"', re.S).findall(html)[0]
if __name__ == '__main__':
    # Cache-busting timestamp appended as the "_" query parameter.
    page = 1642400175443
    # Cursor of the first comment page; updated from each response's "last" field.
    lastId = "6887938427630994833"
    for i in range(1, 11):  # fetch 10 pages of comments
        url = "https://coral.qq.com/article/7633748161/comment/v2?callback=_article7633748161commentv2&orinum=10&oriorder=o&pageflag=1&cursor=" + lastId + "&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=1&_=" + str(page)
        test = tengxunTest(url=url)
        html = test.get_response().html  # raw JSONP text of the response
        commentid_list = test.get_comment_user(html)  # commenter ids
        commentlist = test.get_comment(html)          # comment bodies
        print("------第" + str(i) + "轮页面评论------")
        # BUG FIX: the original loop used range(1, len(commentlist)) and thus
        # silently skipped the first comment of every page. Pair ids with
        # comments and number them from 1 instead.
        for j, (uid, comment) in enumerate(zip(commentid_list, commentlist), start=1):
            print("第" + str(j) + "条评论者id:" + str(uid))
            print("第" + str(j) + "条评论:" + str(comment))
        # Save each comment to its own file: comment_<page>_<index>.txt
        for index, comment in enumerate(commentlist):
            with open("comment_" + str(i) + "_" + str(index) + ".txt", "w", encoding='utf-8') as f:
                f.write(str(comment))
        lastId = test.get_lastId(html)  # cursor for the next page
        page += 1  # bump the cache-buster