Following the previous post, "Crawling a Novel with Scrapy (Part 2)":
the chapters come back from the crawl out of order, so they need to be sorted.
Use an Item and a Pipeline to sort the final result.
Modify the items.py file:
import scrapy


class NovelItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    num = scrapy.Field()      # chapter number, parsed from the title
    name = scrapy.Field()     # chapter title
    content = scrapy.Field()  # list of text fragments making up the chapter body
Modify the yuanzun.py spider file:
import scrapy
from novel.items import NovelItem


class YuanZun(scrapy.Spider):
    name = "yuanzun"
    start_urls = ['https://www.booktxt.net/6_6453/']

    def parse(self, response):
        # collect the links to every chapter sub-page
        for quote in response.css("div#list dd a"):
            next_page = quote.css("a::attr(href)").get()
            if next_page is not None:
                yield response.follow(next_page, self.parse_content)

    def parse_content(self, response):
        # extract the title and body text of one chapter page
        item = NovelItem()
        item['name'] = response.css("div.bookname h1::text").get()
        item['content'] = response.css("div#content::text").getall()
        yield item
The spider now yields a NovelItem for every chapter page.
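If you want to confirm the CSS selectors against the live site before running the full crawl, scrapy shell is handy. A quick check, assuming the page keeps the div#list / div.bookname structure used above:

scrapy shell 'https://www.booktxt.net/6_6453/'
# at the >>> prompt:
response.css("div#list dd a::attr(href)").getall()[:5]    # first few chapter links
fetch(response.urljoin(response.css("div#list dd a::attr(href)").get()))
response.css("div.bookname h1::text").get()                # title of that chapter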
Modify the pipelines.py file:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re


class NovelPipeline(object):
    def __init__(self):
        # digit values of the Chinese numerals ('两' is a variant of 2)
        self.num_enum = {
            '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
            '六': 6, '七': 7, '八': 8, '九': 9, '两': 2
        }
        # multiplier characters: 百 = 100, 十 = 10
        self.multi_cov = {'百': 100, '十': 10}
        self.content_list = []

    def open_spider(self, spider):
        # utf-8 keeps the output portable across platforms
        self.file = open('yuanzun.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        name = item['name']
        # pull the numeral out of titles like "第一百二十五章 ..."
        # (non-greedy, in case 章 also appears in the title text)
        chapter = re.findall(r"第(.*?)章", name)[0]
        item['num'] = self.change2num(chapter)
        self.content_list.append(item)
        return item

    def close_spider(self, spider):
        # every chapter has been collected: sort by number, then write
        list_sorted = sorted(self.content_list, key=lambda x: x['num'])
        for item in list_sorted:
            self.file.write("----------------%d------------------ %s--------------\n"
                            % (item['num'], item['name']))
            # \xa0 are the non-breaking spaces the site uses for indentation
            self.file.write(''.join(item['content']).replace('\xa0', '') + "\n")
        self.file.close()

    # convert a Chinese chapter numeral to an integer
    def change2num(self, name):
        m = 0
        mc = 1  # multiplier in effect for the current digit
        # walk the numeral right to left, raising the multiplier at 十/百
        rev_name = name[::-1]
        for t_str in rev_name:
            if t_str in self.num_enum:
                m += self.num_enum[t_str] * mc
            if t_str in self.multi_cov:
                mc = self.multi_cov[t_str]
        # special case for 第十章..第十九章: a leading 十 has an implicit 一
        if name[0] == '十':
            m += 10
        return m
Notes: change2num is a custom helper that converts a Chinese numeral such as "一百二十五" into the integer 125.
process_item extracts the Chinese numeral from each NovelItem's name and stores the converted value in num.
When close_spider writes the file, the items are saved sorted by num in ascending order.
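The conversion walks the numeral from right to left, multiplying each digit by the most recent multiplier character. A minimal standalone check of the same logic (the expected values follow directly from these rules):

# same digit/multiplier tables and logic as NovelPipeline.change2num
num_enum = {'零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
            '六': 6, '七': 7, '八': 8, '九': 9, '两': 2}
multi_cov = {'百': 100, '十': 10}

def change2num(name):
    m, mc = 0, 1
    for t_str in name[::-1]:   # 一百二十五: 五(+5) 十(mc=10) 二(+20) 百(mc=100) 一(+100)
        if t_str in num_enum:
            m += num_enum[t_str] * mc
        if t_str in multi_cov:
            mc = multi_cov[t_str]
    if name[0] == '十':        # 十 / 十二: the leading 十 carries an implicit 一
        m += 10
    return m

for text, expected in [('九', 9), ('十', 10), ('十二', 12), ('二十', 20),
                       ('一百零五', 105), ('一百二十五', 125)]:
    assert change2num(text) == expected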
Uncomment ITEM_PIPELINES in settings.py so Scrapy actually runs the pipeline (300 is the pipeline's order; lower numbers run first):
ITEM_PIPELINES = {
    'novel.pipelines.NovelPipeline': 300,
}
Run the crawl:
scrapy crawl yuanzun
Open the yuanzun.txt file: the chapters are now in order.
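Given the format string in close_spider, the file should begin with separator lines shaped like the following (actual chapter titles elided, since they come from the site):

----------------1------------------ 第一章 …--------------
…chapter text…
----------------2------------------ 第二章 …--------------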