Requirements:
Crawl every playlist on the page, covering the following albums:
今日股市(王斌)
老曹说股
老丁说股

The spider below requests each album's track list from Ximalaya's getTracksList endpoint, pages through the results, and writes the collected authors and titles to an Excel file when the spider closes.
import scrapy
from scrapy import Request, signals
import pandas as pd
from album.items import AlbumItem  # project Item class (not used below; rows are collected as plain dicts)
class ExampleSpider(scrapy.Spider):
    name = 'example'

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.data = list()       # collected rows, written to Excel when the spider closes
        self.totalpage = 2       # pages per album, updated from the first response
        self.user = [
            {
                'albumId': 2881558,
                'albumN': '今日股市(王斌)'
            },
            {
                'albumId': 4282711,
                'albumN': '老曹说股'
            },
            {
                # Note: same albumId as 老曹说股 above -- replace with the real id for 老丁说股.
                'albumId': 4282711,
                'albumN': '老丁说股'
            }]
    def start_requests(self):
        # Request page 1 of every album; the remaining pages are scheduled from parse().
        for uu in self.user:
            dic = dict(uu, page=1)   # copy, so the entries in self.user are not mutated
            self.logger.info(f'dic {dic}')
            url = (f"https://www.ximalaya.com/revision/album/v1/getTracksList"
                   f"?albumId={dic['albumId']}&pageNum=1")
            yield Request(url, callback=self.parse, cb_kwargs=dic)
    def parse(self, response, **kwargs):
        data = response.json()["data"]
        tracks = data["tracks"]

        # The first page tells us how many pages the album has.
        if kwargs["page"] == 1:
            self.totalpage = data["pageSize"]

        # Collect every track on the current page.
        for track in tracks:
            item = {
                "author": kwargs["albumN"],
                "title": track["title"],
            }
            self.data.append(item)

        # Schedule the remaining pages. Keeping the page count on the spider instead of
        # passing it between callbacks keeps them loosely coupled; the loop also runs when
        # parsing later pages, and Scrapy's duplicate filter drops the repeated requests.
        for k in range(2, self.totalpage + 1):
            url = (f'https://www.ximalaya.com/revision/album/v1/getTracksList'
                   f'?albumId={kwargs["albumId"]}&pageNum={k}')
            # Pass a copy so every scheduled request keeps its own page number.
            yield Request(url=url, callback=self.parse, cb_kwargs=dict(kwargs, page=k))
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Hook spider_closed so the report is written once crawling finishes.
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider
    def spider_closed(self, spider):
        self.logger.debug(f'self.data: {self.data}')
        self.logger.info(f'{len(self.data)} rows collected in total')
        output = '喜马拉雅.xlsx'
        with pd.ExcelWriter(output) as writer:
            df_new = pd.DataFrame(self.data)
            df_new.to_excel(writer, sheet_name='data', index=False)
        spider.logger.info('Report written successfully: %s', output)
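
To try it out, run the spider from the project root with `scrapy crawl example`, or drive it from a small script. The sketch below is a minimal driver, assuming the spider is saved inside a Scrapy project package named album at album/spiders/example.py (that path is an assumption about the project layout, not something stated above); CrawlerProcess and get_project_settings are part of Scrapy's public API.

# run.py -- minimal driver script (assumed project layout: album/spiders/example.py)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from album.spiders.example import ExampleSpider  # hypothetical import path

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(ExampleSpider)   # schedule the spider
    process.start()                # blocks until crawling finishes; 喜马拉雅.xlsx is written on close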