这篇文章主要介绍了 Python Scrapy 爬虫代码及填坑经验,文中通过示例代码介绍得非常详细,对大家的学习或者工作具有一定的参考价值,需要的朋友可以参考一下。
涉及到详情页爬取
目录结构:
kaoshi_bqg.py
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from ..items import BookBQGItem
class KaoshiBqgSpider(scrapy.Spider):
    """Spider that starts from the biquge5200.cc ``xuanhuanxiaoshuo`` listing page."""

    name = 'kaoshi_bqg'
    allowed_domains = ['biquge5200.cc']
    start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']

    # NOTE(review): per the Scrapy docs, `rules` is only consumed by CrawlSpider.
    # This class derives from scrapy.Spider, so these rules look inert and the
    # crawl is driven by the spider's own callbacks instead -- confirm intent.
    # The 'parse_item' callback named below is not visible in this part of the
    # file -- verify that it exists before switching to CrawlSpider.
    rules = (
        # Rule matching the article-list pages.
        Rule(LinkExtractor(allow=r'https://www.biquge5200.cc/xuanhuanxiaoshuo/'), follow=True),
        # Rule matching the article detail pages: ".../<1-3 digits>_<2-6 digits>/".
        # Regex repetition ranges use a comma ({1,3}); with a hyphen the braces
        # are treated as literal text and the pattern never matches a book URL.
        Rule(LinkExtractor(allow=r'.+/[0-9]{1,3}_[0-9]{2,6}/'), callback='parse_item', follow=False),
    )
# 小说书名
def parse(self, response):
    """Yield one detail-page request for every book link on the listing page.

    Args:
        response: The downloaded listing page.

    Yields:
        A request for each book's detail page, handled by ``parse_book``.
        The link text (the book title) travels along in ``meta['info']``.
    """
    anchors = response.xpath('//*[@id="newscontent"]/div[1]/ul//li//span[1]/a')
    for anchor in anchors:
        name = anchor.xpath(".//text()").get()
        detail_url = anchor.xpath(".//@href").get()
        if not detail_url:
            # An anchor without an href cannot be requested; building a
            # request from a missing URL would raise and abort this callback.
            continue
        # response.follow resolves relative hrefs against the page URL, so
        # both absolute and relative links produce a valid request.
        yield response.follow(detail_url, callback=self.parse_book, meta={'info': name})
# 单本书所有的章节名
def parse_book(self, response):
name = response.meta.get('info')
list_a