import urllib.request
import urllib.parse
import json
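# Work around certificate trust errors by disabling HTTPS certificate verification for every urllib request in this process (insecure).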
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# Request headers; fill these in from a captured browser session (packet capture).
headers = {}

# Page sizes implied by the loop-termination checks of the crawl.
VIDEOS_PER_PAGE = 50
COMMENTS_PER_PAGE = 20


def fetch_json(url, request_headers):
    """Fetch *url* with *request_headers* and return the parsed JSON body.

    The steps are: build a ``urllib.request.Request``, open it with
    ``urllib.request.urlopen``, read the raw bytes, decode them as UTF-8 and
    parse the text with ``json.loads``.

    Raises whatever ``urllib`` raises for an invalid URL or a failed request,
    and ``json.JSONDecodeError`` if the body is not valid JSON.
    """
    request = urllib.request.Request(url=url, headers=request_headers)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read().decode("utf-8"))


def comment_page_count(acount, page_size=COMMENTS_PER_PAGE):
    """Return how many pages are needed to hold *acount* comments.

    Uses integer ceiling division, so 0 comments need 0 pages, 1..20 need one
    page, 21..40 need two pages, and so on (for the default page size).
    """
    return (acount + page_size - 1) // page_size


def iter_video_comments(eve_video, fetch=fetch_json):
    """Yield every comment of one video, page by page.

    Each yielded comment dict gets a ``"video_data"`` key pointing at
    *eve_video* so the comment can be traced back to its video.

    *fetch* is called as ``fetch(url, headers)`` and must return the decoded
    JSON document; it defaults to :func:`fetch_json` and exists so the
    pagination logic can be exercised without network access.
    """
    # NOTE(review): oid_data and page_num are presumably meant to be
    # interpolated into comment_url; the URL is blank in this file, so they
    # are currently not used to build the request -- confirm and fill in.
    oid_data = eve_video["param"]
    page_num = 1
    while True:
        comment_url = ""
        comment_data = fetch(comment_url, headers)
        acount = int(comment_data["data"]["page"]["acount"])

        # Emit this page *before* deciding whether to stop; checking first
        # would silently drop the last (possibly only) page of comments.
        # A missing or null "replies" entry is treated as an empty page.
        for eve_comment in comment_data["data"].get("replies") or []:
            eve_comment["video_data"] = eve_video
            yield eve_comment

        if page_num >= comment_page_count(acount):
            break
        page_num += 1


def main(fetch=fetch_json):
    """Walk every page of the video listing and print each video's comments.

    The listing loop stops after the first page that holds fewer than
    ``VIDEOS_PER_PAGE`` entries.
    """
    # NOTE(review): video_page_num is presumably meant to be interpolated
    # into video_page_url; the URL is blank in this file -- confirm and
    # fill in from the captured request.
    video_page_num = 1
    while True:
        video_page_url = ""
        temp_data = fetch(video_page_url, headers)["data"]
        for eve_video in temp_data:
            for eve_comment in iter_video_comments(eve_video, fetch):
                print(eve_comment)
        # A short page means there is nothing after it.
        if len(temp_data) < VIDEOS_PER_PAGE:
            break
        video_page_num += 1


if __name__ == "__main__":
    main()