Learning web crawling: trying to scrape a novel site

After some initial study of Scrapy, I tried crawling every novel on a novel site (I stopped the crawl once I saw it was working).
Below is the basic Scrapy spider.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.shell import inspect_response   # for debugging in the Scrapy shell
import re
from xiaoshuo.items import XiaoshuoItem


class Ba9dxSpider(scrapy.Spider):
    name = 'ba9dx'
    allowed_domains = ['9dxs.com']
    start_urls = ['https://www.9dxs.com/xuanhuan_1.html']
    url = 'https://www.9dxs.com/'

    def parse(self, response):
        # links to each novel's detail page on the listing page
        txtlist = response.xpath('//p[@class="articlename"]/a/@href').extract()
        for each in txtlist:
            # drop the leading "/./" prefix and rebuild an absolute URL
            nextpage = each.split('/', maxsplit=2)[-1]
            nextpage = self.url + nextpage
            yield scrapy.Request(nextpage, callback=self.parse_1)
        # follow the "next page" link of the listing
        next_page = response.xpath('//div[@class="pagelink"]/a[@class="next"]/@href').extract_first()
        if next_page is not None:
            nextpage = self.url + next_page.split('/')[-1]
            # request the rebuilt absolute URL, not the raw href
            yield scrapy.Request(nextpage, callback=self.parse)

    def parse_1(self, response):
        #inspect_response(response,self)
        # link from the novel's intro page to its chapter index,
        # e.g. https://www.9dxs.com/2/2773/index.html
        pageurl = response.xpath('//div[@class="top"]/a/@href').extract_first()
        # print(pageurl)
        pageurl = 'https://www.9dxs.com' + pageurl
        yield scrapy.Request(pageurl, callback=self.parse_2)
    def parse_2(self,response):
        # inspect_response(response,self)
        pagelist = response.xpath('//div[@class="chapterlist"]/ul/li/a/@href').extract()
        for each in pagelist:
            # rebuild the absolute chapter URL from the book's base path
            chapter_url = re.findall(r'https://www\.9dxs\.com/\d+?/\d+?/', response.url)[0] + each
            yield scrapy.Request(chapter_url, callback=self.parse_3)

    def parse_3(self, response):
        #inspect_response(response,self)
        item = XiaoshuoItem()
        # the <h1> seems to prefix the chapter title with the book name; keep the part after the first space
        item['title'] = response.xpath('//h1/text()').extract_first().split(' ', maxsplit=1)[-1]
        # string() concatenates all text nodes inside the content div
        item['text'] = response.xpath('string(//div[@id="content"])').extract_first()
        yield item
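
Both spiders import XiaoshuoItem from xiaoshuo.items, which isn't shown in the post. A minimal sketch of what that items.py might look like, assuming only the two fields the spiders actually populate:

# -*- coding: utf-8 -*-
import scrapy


class XiaoshuoItem(scrapy.Item):
    # the two fields filled in by parse_3 / parse_text
    title = scrapy.Field()   # chapter title
    text = scrapy.Field()    # chapter body text

With the item defined, the basic spider can be run from the project directory with, for example, scrapy crawl ba9dx -o chapters.jl to dump the yielded items to a JSON Lines file.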

I also tried writing this with CrawlSpider, since its rule-based traversal makes the code much more concise. In practice, though, I found that a regex could match the links in the downloaded page source, yet the same regex failed to find those links during the crawl. I'm posting my code anyway in the hope that someone can point out the mistake; a likely explanation is sketched after the code.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from xiaoshuo.items import XiaoshuoItem

class A9dxsSpider(CrawlSpider):
    name = 'a9dxs'
    allowed_domains = ['9dxs.com']
    start_urls = ['https://www.9dxs.com/xuanhuan_1.html']
    # novel detail pages on the listing page
    # note: LinkExtractor matches allow patterns against the joined absolute URL,
    # where the "/./" path segment has already been resolved away, so the
    # pattern must not require a literal "/./" (see the sketch after the code)
    pagelist1 = LinkExtractor(allow=(r'/\d+/\d+/'))
    # pagination links on the listing page
    pagelist2 = LinkExtractor(allow=(r'/xuanhuan_\d+\.html'))

    rules = [
        Rule(pagelist1, callback='parse_item', follow=True),
        Rule(pagelist2,process_links='process_link1',follow=True)
    ]

    def process_link1(self, links):
        # process_links receives a list of Link objects, not a single string;
        # rewrite each link's url in place and return the list
        for link in links:
            link.url = 'https://www.9dxs.com/' + link.url.split('/')[-1]
            print(link.url)
        return links

    def parse_item(self, response):
        # jump from the novel's intro page to its chapter index
        url = response.xpath('//div[@class="top"]/a/@href').extract_first()
        url = response.urljoin(url)
        # print(response.body)  # debug leftover
        yield scrapy.Request(url, callback=self.parse_next)

    def parse_next(self, response):
        chapterlist = response.xpath('//div[@class="chapterlist"]/ul/li/a/@href').extract()
        for each in chapterlist:
            chapter = response.urljoin(each)
            # callback must be a callable, not a string
            yield scrapy.Request(chapter, callback=self.parse_text)

    def parse_text(self, response):
        item = XiaoshuoItem()
        item['title'] = response.xpath('//h1/text()').extract_first().split(' ', maxsplit=1)[-1]
        # same content selector as the basic spider; string(//p) only grabbed the first <p>
        item['text'] = response.xpath('string(//div[@id="content"])').extract_first()
        yield item
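
One likely reason the original CrawlSpider rules found nothing: LinkExtractor applies its allow patterns to absolute URLs, i.e. after each raw href has been joined against the response URL, and that join resolves dot segments such as "/./" out of the path. A regex that matches the raw href in the downloaded HTML can therefore fail against the joined URL that LinkExtractor actually sees. A small standalone sketch of the effect; the href value is an assumption based on the split('/') handling in the basic spider:

# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin

base = 'https://www.9dxs.com/xuanhuan_1.html'
raw_href = '/./2/2773/'   # assumed form of the href as it appears in the page source

# the original allow pattern matches the raw href in the HTML...
print(re.search(r'/\./\d+/\d+/', raw_href))      # <re.Match ...>

# ...but joining resolves the dot segment, so the same pattern finds nothing
absolute = urljoin(base, raw_href)
print(absolute)                                  # https://www.9dxs.com/2/2773/
print(re.search(r'/\./\d+/\d+/', absolute))      # None
print(re.search(r'/\d+/\d+/', absolute))         # <re.Match ...>

This is why the allow patterns in the rules above drop the "/\./" prefix.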

