首先安装依赖的包:
pip install simplified-scrapy
Python代码如下:
import io,json
from simplified_scrapy.spider import Spider, SimplifiedDoc
from simplified_scrapy.core.utils import getTimeNow,printInfo,appendFile
class TianshuSpider(Spider):
# NOTE(review): presumably a request-rate setting read by the simplified_scrapy
# framework (the name suggests "per 1 second") — confirm against the library docs.
concurrencyPer1s=1
# Spider name; __init__ passes it to Spider.__init__.
name = 'tianshu-spider'
start_urls = []# start_urls is set up in __init__
def __init__(self):
    """Set up the start URLs for pages 1 through 33, then initialise the base Spider.

    The URLs are stored in a fresh list on the instance. Appending to the
    class-level ``start_urls`` list would mutate state shared by every
    instance, so creating the spider more than once would keep adding
    duplicate URLs.
    """
    # Rebind on the instance instead of appending to the shared class list.
    self.start_urls = [
        'https://m.biqudao.cc/0/779_{}/'.format(page)
        for page in range(1, 34)
    ]
    # Necessary: call the base-class initialiser to complete framework setup.
    Spider.__init__(self, self.name)
#重写抽取方法
def extract(self, url, html, models, modelNames):
try:
html = self.removeScripts(html)
lst=[]
data=[]
ele = None
#判断页面是列表还是小说内容
if(url["url"].find('https://m.biqudao.cc/0/779_')==0):
doc = SimplifiedDoc(html)
lstA = doc.listA(url[