import scrapy
from ..items import PfdsjItem
class PfsjSpider(scrapy.Spider):
    """Spider that crawls the novel "平凡的世界" (Ordinary World).

    Crawl flow: table of contents (parse) -> each part's chapter list
    (two_parse) -> individual chapter pages (three_parse), which yield
    one PfdsjItem per chapter.
    """
    name = 'pfsj'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://www.pingfandeshijie.net']

    def parse(self, response):
        """Extract the part-level links from the front page."""
        a_href = response.xpath("//center/table/tr/td/center/h2/a/@href").extract()
        for v in a_href:
            # Hand each part link to the scheduler; responses go to two_parse.
            yield scrapy.Request(url=v, callback=self.two_parse)

    def two_parse(self, respond):
        """Extract the chapter links from a part's index page."""
        a_href = respond.xpath('//div[@class="main"]/div[2]/ul/li/a/@href').extract()
        for i in a_href:
            # Hand each chapter link to the scheduler; responses go to three_parse.
            yield scrapy.Request(url=i, callback=self.three_parse)

    def three_parse(self, respond):
        """Parse a single chapter page and yield a populated PfdsjItem."""
        title = respond.xpath('/html/body/div[3]/h1/text()').get()
        if title is None:
            # No heading found (layout change or bad page) — nothing to scrape.
            return
        page = title.split()
        part = page[0]
        # Headings are normally "<part> <chapter>"; fall back to the part
        # name when there is no separate chapter token.
        page_num = page[1] if len(page) > 1 else page[0]
        content = '\n'.join(respond.xpath('//body/div[3]/div[2]/p/text()').extract())
        item = PfdsjItem()
        item['page_num'] = page_num
        item['part'] = part
        # Strip ideographic spaces (U+3000) used for paragraph indentation.
        # BUG FIX: the original called replace('\\u300', ''), which removes the
        # literal text "\u300" rather than the U+3000 character.
        item['content'] = content.replace('\u3000', '')
        yield item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PfdsjPipeline:
    """Pipeline that saves each chapter to ./平凡的世界/<part>/<page_num>.txt."""

    def open_spider(self, spider):
        """Create the output root directory once when the spider starts."""
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # os.path.exists + os.mkdir and tolerates a pre-existing directory.
        os.makedirs('./平凡的世界', exist_ok=True)

    def process_item(self, item, spider):
        """Append the chapter text to its part's directory and pass the item on."""
        dirname = os.path.join('./平凡的世界', item['part'])
        os.makedirs(dirname, exist_ok=True)
        # File layout: <root>/<part>/<page_num>.txt
        filename = os.path.join(dirname, item['page_num']) + '.txt'
        # Append mode so chapters delivered in several items share one file.
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(item['content'])
        # BUG FIX: a pipeline must return the item so that any later
        # pipelines in ITEM_PIPELINES still receive it (the original
        # implicitly returned None).
        return item
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PfdsjItem(scrapy.Item):
    """Container for one scraped chapter of the novel."""

    book_name = scrapy.Field()  # book title (not populated by the visible spider)
    part = scrapy.Field()       # part/volume heading the chapter belongs to
    page_num = scrapy.Field()   # chapter identifier within the part
    content = scrapy.Field()    # chapter body text
Required third-party library: scrapy
Sample run output: