Scrapy 爬取《全职高手》小说
应用 Scrapy 框架,爬取《全职高手》小说数据,存于本地 JSON 文件。
# items 配置抓取数据字段
import scrapy


class NovelItem(scrapy.Item):
    """Item holding one scraped chapter of the novel.

    The spider imports ``NovelItem`` from ``novel.items``, so these field
    declarations must live inside a ``scrapy.Item`` subclass rather than at
    module level.
    """

    bookName = scrapy.Field()        # book name (from the breadcrumb)
    bookTitle = scrapy.Field()       # declared but not populated by the spider
    chapterNum = scrapy.Field()      # e.g. "第一章"; '' when the title has no number
    chapterName = scrapy.Field()     # chapter title text
    chapterUrl = scrapy.Field()      # absolute URL of the chapter page
    chapterContent = scrapy.Field()  # joined, stripped chapter body text
# spider 抓取数据
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from novel.items import NovelItem
class QuanzhigaoshouSpider(CrawlSpider):
    """Crawl every chapter page of book 32 on qu.la and yield NovelItem."""

    name = 'quanzhigaoshou'
    # allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/32/']

    # ``rules`` must be a class attribute for CrawlSpider to pick it up.
    # The dot before "html" is escaped so it matches a literal '.', not any
    # character (the unescaped form also matched e.g. "/book/32/123xhtml").
    rules = (
        Rule(LinkExtractor(allow=r'/book/32/\d+\.html'),
             callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # The index page itself yields no items; just log that we reached it.
        print(response.url)

    def parse_item(self, response):
        """Extract one chapter page into a NovelItem."""
        print(response.url)
        item = NovelItem()
        item['bookName'] = response.xpath(
            '//div[@class="con_top"]/a[2]/text()').get()
        # Chapter headings look like "第N章 标题"; split once on the first
        # space.  .get() may return None on a malformed page — guard it so
        # .split does not raise AttributeError.
        raw_title = response.xpath('//div[@class="bookname"]/h1/text()').get()
        title = (raw_title or '').split(' ', 1)
        if len(title) == 1:
            item['chapterNum'] = ''
            item['chapterName'] = title[0]
        else:
            # maxsplit=1 guarantees at most two parts.
            item['chapterNum'] = title[0]
            item['chapterName'] = title[1]
        item['chapterUrl'] = response.url
        item['chapterContent'] = ''.join(
            response.xpath('//div[contains(@id, "content")]/text()').extract()
        ).strip()
        yield item
# pipeline 处理数据
import json
import codecs
import os
class NovelPipeline(object):
    """Serialize scraped items into ``quanzhigaoshou.json`` as one JSON array.

    The previous implementation appended ",\\n" after every item and then
    removed the final comma with ``seek(-2, os.SEEK_END)`` + ``truncate()``.
    That crashes when the spider yields zero items (it seeks before the
    start of the file) and depends on byte-level seeks through the codecs
    text wrapper.  Writing the separator *before* every item except the
    first always produces valid JSON, including for an empty crawl.
    """

    def __init__(self):
        # Open the output file and start the JSON array.
        self.file = codecs.open('quanzhigaoshou.json', 'w', 'utf-8')
        self.file.write('[')
        self.first_item = True  # no separator before the first element

    def open_spider(self, spider):
        print('This spider is starting!')

    def process_item(self, item, spider):
        # Only persist items produced by the quanzhigaoshou spider.
        if spider.name == 'quanzhigaoshou':
            if self.first_item:
                self.first_item = False
            else:
                self.file.write(',\n')  # separator between array elements
            self.file.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        print('This spider is end!')
        self.file.write(']')  # close the array; valid even with zero items
        self.file.close()