# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy import Item,Field
from ..items import BookItem
class BooksSpider(scrapy.Spider):
    """Crawl book listing pages on aiyadu.com and yield one BookItem
    (name + download URL) per book found.

    Flow: start at listing page 1 -> for each book link on a listing page,
    request the download-plugin detail page -> parse the detail page into
    a BookItem. Pagination follows listing pages 1..MAX_PAGE-1.
    """

    name = 'books'
    allowed_domains = ['aiyadu.com']
    # start_urls = ['http://aiyadu.com/']

    # Retained for backward compatibility; pagination no longer mutates
    # this shared attribute (see parse(), which uses request.meta instead).
    current_page = 1

    # Highest listing page is MAX_PAGE - 1 (pages 1..199 are crawled).
    MAX_PAGE = 200

    # Browser-like User-Agent so the site serves normal pages to the crawler.
    aiyadu_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36"
    }

    def start_requests(self):
        """Start the crawl at listing page 1.

        The page number travels in request.meta so each response knows its
        own page without relying on shared spider state.
        """
        url = "http://www.aiyadu.com/page/1"
        yield scrapy.Request(
            url,
            headers=self.aiyadu_headers,
            callback=self.parse,
            meta={'page': 1},
        )

    # Listing-page parser
    def parse(self, response):
        """Parse one listing page.

        Yields a detail-page request for every book link found, then a
        request for the next listing page (if below MAX_PAGE).
        """
        # Each book link is an absolute URL such as
        # http://www.aiyadu.com/14777.html; the numeric tail is the book id.
        for href in response.xpath("//div[@class='excerpts']//h2/a/@href").getall():
            book_id = href.split('/')[-1].split('.')[0]
            # Build the download-plugin detail URL, e.g.
            # http://www.aiyadu.com/wp-content/plugins/ordown/down.php?id=14777
            detail_url = (
                'http://www.aiyadu.com/wp-content/plugins/ordown/down.php?id='
                + book_id
            )
            yield scrapy.Request(detail_url, callback=self.parse_book)

        # Pagination: derive the next page from THIS response's meta rather
        # than incrementing a shared class attribute, which is unsafe under
        # Scrapy's concurrent request scheduling.
        next_page = response.meta.get('page', 1) + 1
        if next_page < self.MAX_PAGE:
            next_url = "http://www.aiyadu.com/page/%d" % next_page
            yield scrapy.Request(
                next_url,
                headers=self.aiyadu_headers,
                callback=self.parse,
                meta={'page': next_page},
            )

    # Detail-page parser
    def parse_book(self, response):
        """Parse a book detail page into a BookItem.

        Uses .getall()/.get() so the item fields hold scalar strings —
        .extract() would store one-element lists in the item.
        """
        book = BookItem()
        # The heading may be split across several text nodes; join them.
        book['name'] = ''.join(
            response.xpath('//div[@class="ordown-header"]//h2//text()').getall()
        ).strip()
        # Single @href attribute; .get() returns the string or None.
        book['url'] = response.xpath(
            '//div[@class="ordown-header"]//a[@class="ordown-button"]/@href'
        ).get()
        self.logger.debug('Parsed book: name=%r url=%r', book['name'], book['url'])
        yield book