After some initial study of Scrapy, I tried crawling every novel on a novel site (once I confirmed it worked, I stopped the crawl).
Below is the basic Spider version written with Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.shell import inspect_response  # for debugging in scrapy shell
import re
from xiaoshuo.items import XiaoshuoItem


class Ba9dxSpider(scrapy.Spider):
    name = 'ba9dx'
    allowed_domains = ['9dxs.com']
    start_urls = ['https://www.9dxs.com/xuanhuan_1.html']
    url = 'https://www.9dxs.com/'

    def parse(self, response):
        # links to each novel's detail page on the category listing
        txtlist = response.xpath('//p[@class="articlename"]/a/@href').extract()
        for each in txtlist:
            # the hrefs start with /./ , so split off the first two segments
            nextpage = self.url + each.split('/', maxsplit=2)[-1]
            yield scrapy.Request(nextpage, callback=self.parse_1)
        # follow the "next page" link of the listing
        next_page = response.xpath('//div[@class="pagelink"]/a[@class="next"]/@href').extract_first()
        if next_page is not None:
            nextpage = self.url + next_page.split('/')[-1]
            # bug fix: request the rebuilt absolute URL, not the raw href
            yield scrapy.Request(nextpage, callback=self.parse)

    def parse_1(self, response):
        # inspect_response(response, self)
        # novel detail page -> link to its chapter index
        pageurl = response.xpath('//div[@class="top"]/a/@href').extract_first()
        pageurl = 'https://www.9dxs.com' + pageurl
        yield scrapy.Request(pageurl, callback=self.parse_2)

    # e.g. https://www.9dxs.com/2/2773/index.html
    def parse_2(self, response):
        # inspect_response(response, self)
        # chapter index -> one request per chapter
        pagelist = response.xpath('//div[@class="chapterlist"]/ul/li/a/@href').extract()
        for each in pagelist:
            niubi = re.findall(r'https://www\.9dxs\.com/\d+?/\d+?/', response.url)[0] + each
            yield scrapy.Request(niubi, callback=self.parse_3)

    def parse_3(self, response):
        # inspect_response(response, self)
        # chapter page -> title and body text
        item = XiaoshuoItem()
        item['title'] = response.xpath('//h1/text()').extract_first().split(' ', maxsplit=1)[-1]
        item['text'] = response.xpath('string(//div[@id="content"])').extract_first()
        yield item
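The XiaoshuoItem imported above isn't shown in this post; a minimal items.py matching the two fields the spider fills in might look like this (my assumption, not the original file):

# -*- coding: utf-8 -*-
import scrapy


class XiaoshuoItem(scrapy.Item):
    title = scrapy.Field()  # chapter title
    text = scrapy.Field()   # chapter body text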
I also tried writing this with CrawlSpider, since its built-in link traversal makes the code much more concise. In practice, though, I hit something odd: for the very same links, a regex that matches fine when run over the downloaded page finds nothing during the crawl (see the note after the code for what I suspect is going on). I'm posting the code anyway in the hope that someone can point out my mistake.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from xiaoshuo.items import XiaoshuoItem


class A9dxsSpider(CrawlSpider):
    name = 'a9dxs'
    allowed_domains = ['9dxs.com']
    start_urls = ['https://www.9dxs.com/xuanhuan_1.html']
    # novel links on the listing page
    pagelist1 = LinkExtractor(allow=(r'/\./\d+/\d+/',))
    # pagination links on the listing page
    pagelist2 = LinkExtractor(allow=(r'/\./xuanhuan_\d+\.html',))
    rules = [
        Rule(pagelist1, callback='parse_item', follow=True),
        Rule(pagelist2, process_links='process_link1', follow=True),
    ]

    def process_link1(self, links):
        # process_links receives a list of Link objects, not a single string
        for link in links:
            link.url = 'https://www.9dxs.com/' + link.url.split('/')[-1]
            print(link.url)
        return links

    def parse_item(self, response):
        url = response.xpath('//div[@class="top"]/a/@href').extract_first()
        url = response.urljoin(url)
        print(response.body)
        # the callback of a Request must be a callable, not a string
        # (string callbacks only work inside Rule)
        yield scrapy.Request(url, callback=self.parse_next)

    def parse_next(self, response):
        chapterlist = response.xpath('//div[@class="chapterlist"]/ul/li/a/@href').extract()
        for each in chapterlist:
            chapter = response.urljoin(each)
            yield scrapy.Request(chapter, callback=self.parse_text)

    def parse_text(self, response):
        item = XiaoshuoItem()
        item['title'] = response.xpath('//h1/text()').extract_first().split(' ', maxsplit=1)[-1]
        item['text'] = response.xpath('string(//p)').extract_first()
        yield item
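My best guess at the regex mismatch (untested against the live site, so treat it as a hypothesis): LinkExtractor applies its allow= patterns to the absolute URL it builds with urljoin(), and urljoin() resolves the /./ segment away per RFC 3986. The raw HTML still contains hrefs like /./2/2773/index.html, which is why re.findall over the downloaded page matches, while the extractor only ever sees the resolved URL and the same pattern finds nothing:

# urljoin() removes the dot segment before the allow= regex ever runs
from urllib.parse import urljoin

print(urljoin('https://www.9dxs.com/xuanhuan_1.html', '/./2/2773/index.html'))
# -> https://www.9dxs.com/2/2773/index.html

Under that assumption, the rules just need to target the resolved form, and process_links becomes unnecessary since the extracted URLs are already absolute:

# novel pages, e.g. https://www.9dxs.com/2/2773/
pagelist1 = LinkExtractor(allow=(r'/\d+/\d+/',))
# listing pages, e.g. https://www.9dxs.com/xuanhuan_2.html
pagelist2 = LinkExtractor(allow=(r'/xuanhuan_\d+\.html',))

rules = [
    Rule(pagelist1, callback='parse_item', follow=True),
    Rule(pagelist2, follow=True),
]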