一.准备url
要爬取的B站的鬼畜部分,选择了教程演示(视频个数比较少),在搜索里进行页面上信息的搜索来确定url
- 得到url:
https://api.bilibili.com/x/web-interface/newlist?callback=jqueryCallback_bili_8038458089269198&rid=127&type=0&pn=1&ps=20&jsonp=jsonp&_=1589855603296
- url 进行处理
对url 进行处理,去掉callback
然后搜索此url
用 Chrome 的 JSONView 插件对返回的 JSON 进行格式化显示
我们要的信息就在这个 url 的响应中。通过翻页观察到:B站一个页面的视频个数是 20 个(对应 size 字段),num 是当前页码,count 是总的视频个数,据此即可判断是否还有下一页,从而实现翻页。
def __init__(self):
    """Set up the page-templated API URL and the request headers."""
    # {} is the page-number placeholder filled in by get_fanye; ps=20 items per page.
    self.url_temp = "https://api.bilibili.com/x/web-interface/newlist?&rid=127&type=0&pn={}&ps=20&jsonp=jsonp&_=1589855603296"
    # Fix: the original chained assignment (self.headers = headers = {...})
    # bound a useless local name `headers`; assign the attribute directly.
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"
    }
def get_fanye(self, i):
    """Fetch page *i* of the listing API and return its paging info.

    Returns a 4-tuple ``(num, size, count, parsed)``: the current page
    number, the page size, the total video count (all as ints), and the
    full decoded JSON response.
    """
    page_url = self.url_temp.format(i)
    response_text = self.get_html(page_url)
    parsed = json.loads(response_text)
    page_info = parsed["data"]["page"]
    num, size, count = (int(page_info[key]) for key in ("num", "size", "count"))
    return num, size, count, parsed
二.发送请求,获取响应
def get_html(self, url):
    """GET *url* with the crawler's headers and return the body as text.

    Fix: requests has no default timeout, so a stalled connection would
    hang the crawler forever; bound the wait explicitly.
    """
    response = requests.get(url, headers=self.headers, timeout=10)
    return response.content.decode()
三.提取数据
def get_data(self, ret):
    """Extract one record per video from a decoded listing page *ret*.

    The videos live under ``data.archives``; each record keeps the
    title, the uploader's name and the view count.  Returns a list of
    dicts (empty when the page has no archives).

    Fix: the original iterated ``range(0, len(...))`` and repeated the
    deep ``ret["data"]["archives"][i]`` lookup for every field; iterate
    the archive objects directly instead.
    """
    return [
        {
            "title": archive["title"],
            "owner": archive["owner"]["name"],
            "view_time": archive["stat"]["view"],
        }
        for archive in ret["data"]["archives"]
    ]
四.保存
def save_data(self, content_list):
    """Append every item of *content_list* to BI1.txt, one JSON object per line."""
    serialized = (json.dumps(item, ensure_ascii=False) + "\n" for item in content_list)
    with open("BI1.txt", "a", encoding="utf-8") as out:
        out.writelines(serialized)
    print("保存成功")
完整代码:
import requests
import json
import re
class BiliBili():
    """Crawl the Bilibili "newlist" API page by page and append each
    video's title, uploader and view count to BI1.txt as JSON lines."""

    def __init__(self):
        """Set up the page-templated API URL and the request headers."""
        # {} is the page-number placeholder filled in by get_fanye; ps=20 items per page.
        self.url_temp = "https://api.bilibili.com/x/web-interface/newlist?&rid=127&type=0&pn={}&ps=20&jsonp=jsonp&_=1589855603296"
        # Fix: the original chained assignment (self.headers = headers = {...})
        # bound a useless local name; assign the attribute directly.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"
        }

    def get_fanye(self, i):
        """Fetch page *i* and return (num, size, count, parsed_json).

        num/size/count are the current page number, page size and total
        video count (ints); parsed_json is the full decoded response.
        """
        url = self.url_temp.format(i)
        ret = json.loads(self.get_html(url))
        page = ret["data"]["page"]
        return int(page["num"]), int(page["size"]), int(page["count"]), ret

    def get_html(self, url):
        """GET *url* with the crawler's headers and return the body as text.

        Fix: requests has no default timeout, so a stalled connection
        would hang the crawler forever; bound the wait explicitly.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def get_data(self, ret):
        """Extract one {title, owner, view_time} dict per video under
        data.archives of a decoded listing page *ret*.

        Fix: the original iterated range(0, len(...)) and repeated the
        deep ret["data"]["archives"][i] lookup for every field.
        """
        return [
            {
                "title": archive["title"],
                "owner": archive["owner"]["name"],
                "view_time": archive["stat"]["view"],
            }
            for archive in ret["data"]["archives"]
        ]

    def save_data(self, content_list):
        """Append every item of *content_list* to BI1.txt, one JSON object per line."""
        with open("BI1.txt", "a", encoding="utf-8") as f:
            for item in content_list:
                f.write(json.dumps(item, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl every page: fetch, extract, save, stop after the last page.

        Fix: the original fetched page 1 twice (once before the loop and
        again on the first iteration, because the increment ran after the
        re-fetch).  Here each page is requested exactly once, and the
        loop stops when num * size >= count (no pages remain).
        """
        i = 1
        while True:
            num, size, count, ret = self.get_fanye(i)
            self.save_data(self.get_data(ret))
            if num * size >= count:
                break
            i += 1
if __name__ == '__main__':
    # Fix: the original rebound the class name (BiliBili = BiliBili()),
    # shadowing the class with its own instance; use a distinct name.
    spider = BiliBili()
    spider.run()