使用scrapy爬取全本书屋网全站所有小说-CSDN博客

因为过年，所以十几天没有学习技术，今天开始正式学习scrapy。在崔庆才大神教程的帮助下，爬取了全书屋网全站所有小说的名字、作者和网址，先上代码：

import scrapy
import re
from scrapy.http import Request
from bs4 import BeautifulSoup
from scrapy.selector import Selector
from myuu.items import MyuuItem

class myspider(scrapy.Spider):
    """Crawl the category listings of quanshuwu.com and emit one item per novel.

    Flow of callbacks:

    1. ``start_requests`` requests the first listing page of categories 2-9.
    2. ``parse`` handles that first page, reads the last page number from the
       pagination bar and requests the remaining listing pages.
    3. ``get_name`` turns every novel entry on a listing page into a request
       for the novel's own page.
    4. ``get_index`` builds the ``MyuuItem`` (name, url, author) for a novel.
    """

    name = 'dmoz'
    ourl = 'http://www.quanshuwu.com/category/'
    turl = '.aspx'
    # Captures the trailing page number of a listing href; the dot is escaped
    # and the group is restricted to digits so int() cannot fail on it.
    _last_page_re = re.compile(r'/category.*?_1_(\d+)\.aspx')

    def start_requests(self):
        """Yield a request for page 1 of each category id in ``range(2, 10)``."""
        for category_id in range(2, 10):
            url = self.ourl + str(category_id) + '_1_1' + self.turl
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Handle the first listing page of a category and fan out to the rest.

        The response *is* page 1, so its entries are extracted here directly.
        Requesting the same URL again with a different callback would be
        dropped by Scrapy's duplicate filter, silently losing page 1.
        """
        for request in self.get_name(response):
            yield request

        soup = BeautifulSoup(response.text, 'lxml')
        pager = soup.find(id="storelistbottom")
        links = pager.find_all('a') if pager is not None else []
        # The last anchor of the pagination bar is used as the "last page" link.
        last_href = links[-1].get('href') if links else None
        match = self._last_page_re.search(last_href) if last_href else None
        if match is None:
            self.logger.warning(
                'No pagination found on %s; only the first page was crawled',
                response.url)
            return

        last_page = int(match.group(1))
        # Listing URLs end in "_<page>.aspx" (see start_requests); drop the
        # page field instead of slicing a fixed number of characters.
        prefix = response.url.rsplit('_', 1)[0] + '_'
        for page in range(2, last_page + 1):
            yield Request(prefix + str(page) + self.turl, callback=self.get_name)

    def get_name(self, response):
        """Yield one request per novel entry found on a listing page.

        The novel's name and absolute URL are passed on through ``meta`` so
        ``get_index`` can store them on the item.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        for entry in soup.find_all('li', class_='storelistbt5a'):
            strong = entry.find('strong')
            link = strong.find('a') if strong is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                # Entry without a usable title link: skip it rather than
                # aborting the whole page.
                continue
            name = link.get_text()
            url = 'http://www.quanshuwu.com' + href
            yield Request(url, callback=self.get_index,
                          meta={'url': url, 'name': name})

    def get_index(self, response):
        """Build and return the item for a single novel page.

        Returns:
            MyuuItem with ``name`` and ``url`` taken from the request meta and
            ``author`` set to the text of every ``#bookinfo > div.ti > p``
            element (presumably the author line -- confirm against the page).
        """
        item = MyuuItem()
        item['name'] = response.meta['name']
        item['url'] = response.meta['url']
        paragraphs = BeautifulSoup(response.text, 'lxml').select('#bookinfo > div.ti > p')
        # Store plain text, not bs4 Tag objects, so the item can be exported.
        item['author'] = [p.get_text(strip=True) for p in paragraphs]
        # NOTE: this is where more detailed extraction would go; to crawl the
        # full text of every novel, one more extraction step could be added
        # here. The original author skipped it.
        return item

今天算是初步了解了scrapy，感觉还是有点晦涩，相信进一步学习之后就能体会到scrapy的魅力！