import scrapy
import json
from scrapy_01.items import Bilibili_danceItem
'''
爬取bilibili舞蹈区宅舞部分,
按照时间排序,抓取一页20条,100页用于测试
爬取标题、简介、作者名字、bv号 其他的没取。
'''
class BilibiliDanceSpider(scrapy.Spider):
    """Crawl bilibili's dance-section listing API (rid=20) page by page.

    Each page is requested with 20 records. For every record the spider
    yields a ``Bilibili_danceItem`` holding the title, description, owner
    name and bvid. After a page is processed, the next page is requested
    until ``max_page`` has been reached.
    """

    name = 'bilibili_dance'
    allowed_domains = ['bilibili.com']
    # Base URL of the listing API; the page number is appended to it.
    baseurl = "https://api.bilibili.com/x/web-interface/newlist?rid=20&type=0&pn="
    # URL offset, i.e. the current page number. Advanced by parse().
    offset = 1
    # Page-size suffix: 20 records per page (per the original author's note,
    # the API falls back to 50 records when it is omitted).
    endend = "&ps=20"
    # Last page number to crawl (100 pages, used for testing).
    max_page = 100
    start_urls = [baseurl + str(offset) + endend]

    def parse(self, response):
        """Yield one item per record of the page, then request the next page.

        Args:
            response: the response of the listing API; its body is a JSON
                document whose records live under ``data.archives``.

        Yields:
            ``Bilibili_danceItem`` objects, followed by at most one
            ``scrapy.Request`` for the next page.
        """
        payload = json.loads(response.body)
        data = payload.get('data') if isinstance(payload, dict) else None
        archives = data.get('archives') if isinstance(data, dict) else None
        if archives is None:
            # Error payloads carry no record list. Stop here with a clear
            # message instead of failing with an opaque TypeError/KeyError.
            self.logger.warning(
                "No 'data.archives' in response from %s; stopping pagination",
                response.url,
            )
            return

        for archive in archives:
            # Build a fresh item for every record: re-using one mutable item
            # makes all yielded items the same object, so later records can
            # overwrite earlier ones before they are exported.
            item = Bilibili_danceItem()
            item['title'] = archive['title']
            # Description
            item['desc'] = archive['desc']
            # Name of the uploader
            item['owner_name'] = archive['owner']['name']
            # bv id
            item['bvid'] = archive['bvid']
            yield item

        # Keep crawling until max_page has been requested.
        if self.offset < self.max_page:
            self.offset += 1
            next_url = self.baseurl + str(self.offset) + self.endend
            # Print the URL that is processed next. print() is kept on
            # purpose so progress stays visible when run with --nolog.
            print(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
# 执行命令:因为没有写管道文件,所以使用 -o 直接保存为 json 文件,用 -s FEED_EXPORT_ENCODING=UTF-8 指定导出编码为 utf-8;--nolog 表示不输出日志
scrapy crawl bilibili_dance -o bilibili_dance.json -s FEED_EXPORT_ENCODING=UTF-8 --nolog
下面是接口返回的 json 的一部分(只截取了其中一条记录),用于测试时对照字段结构。
可以打开下面这个链接查看完整的 json:
https://api.bilibili.com/x/web-interface/newlist?rid=20&type=0&pn=1
{
"code":0,
"message":"0",
"ttl":1,
"data":{
"archives":[
{
"aid":330212085,
"videos":3,
"tid":20,
"tname":"宅舞",
"copyright":2,
"pic":"http://i0.hdslb.com/bfs/archive/6d07252cd3d7088c0ef4276032cc2862b6246555.jpg",
"title":"penta!!!的生日会直播录屏",
"pubdate":1604392412,
"ctime":1604392412,
"desc":"大小姐生日快乐!!!!!",
"state":0,
"duration":1033,
"rights":{
"bp":0,
"elec":0,
"download":0,
"movie":0,
"pay":0,
"hd5":0,
"no_reprint":0,
"autoplay":1,
"ugc_pay":0,
"is_cooperation":0,
"ugc_pay_preview":0,
"no_background":0
},
"owner":{
"mid":621989512,
"name":"茶miumiu",
"face":"http://i1.hdslb.com/bfs/face/7ace388fd70a21dc03452d0423003605d35df0df.jpg"
},
"stat":{
"aid":330212085,
"view":6,
"danmaku":0,
"reply":0,
"favorite":0,
"coin":0,
"share":0,
"now_rank":0,
"his_rank":0,
"like":1,
"dislike":0
},
"dynamic":"",
"cid":252194538,
"dimension":{
"width":1920,
"height":1080,
"rotate":0
},
"bvid":"BV1CA411j7LS"
},
.....