![在这里插入图片描述](https://img-blog.csdnimg.cn/20191219154455388.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl8zOTIzODUyMA==,size_16,color_FFFFFF,t_70)
流程和爬取职友集的一样,这里只做 top250 的爬虫
这里的翻页(第 2 页及之后)不在 parse 里处理,而是直接在 start_urls 的位置用循环生成出来
import scrapy
import bs4
from ..items import SdoubanItem
class Sdouban(scrapy.Spider):
    """Spider for the Douban Books Top-250 list.

    Walks the four paginated list pages, follows each book to its
    quotes ("blockquotes") page, and yields one ``SdoubanItem`` per
    quote with the book name, quote text and quote detail link.
    """

    name = "sdouban"
    # BUG FIX: was misspelled `allowed_domins`, which Scrapy silently
    # ignores. It must also be a bare domain, not a full URL, or the
    # OffsiteMiddleware would filter out every request.
    allowed_domains = ['book.douban.com']
    # The Top-250 list shows 25 books per page; build all 4 page URLs
    # up front instead of following "next page" links.
    start_urls = []
    for x in range(0, 100, 25):
        start_urls.append('https://book.douban.com/top250?start=' + str(x))

    def parse(self, response):
        """Parse one list page and request each book's quotes page.

        :param response: a Top-250 list page.
        :yields: ``scrapy.Request`` for each book, handled by :meth:`parse_job`.
        """
        bs = bs4.BeautifulSoup(response.text, 'html.parser')
        data_list = bs.find_all('tr', class_='item')
        for data in data_list:
            # NOTE(review): the detail href appears to end with '/', so this
            # produces '...//blockquotes' — Douban seems to tolerate it, but
            # confirm the exact quotes-page path against the live site.
            real_url = data.find("div", class_='pl2').find('a')['href'] + '/blockquotes'
            yield scrapy.Request(real_url, callback=self.parse_job)

    def parse_job(self, response):
        """Parse a book's quotes page and yield one item per quote.

        :param response: a book's quotes ("blockquotes") page.
        :yields: ``SdoubanItem`` with ``bookname``, ``comment``,
            ``commentdetail`` fields.
        """
        bs = bs4.BeautifulSoup(response.text, 'html.parser')
        bookname = bs.find(id='content').find('h1').text
        c_list = bs.find(class_="blockquote-list score bottom-line").find_all('figure')
        for data in c_list:
            # BUG FIX: the original reused one SdoubanItem instance across
            # iterations, so every yielded item was the same mutated object;
            # create a fresh item per quote.
            item = SdoubanItem()
            item['bookname'] = bookname
            item['comment'] = data.text
            item['commentdetail'] = data.find('a')['href']
            yield item