速度相当快
爬取整站的小说
最终结果保存至 MongoDB 数据库
用 PyCharm 开发还是很好用的
创建项目:scrapy startproject daomubiji
运行项目:scrapy crawl daomubi
settings
# Headers attached to every outgoing request.
# The User-Agent spoofs a desktop Chrome/QQBrowser so the site serves normal pages.
DEFAULT_REQUEST_HEADERS = {
    # Standard browser Accept header.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # Desktop browser identity string.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3738.400',
}
items
# Fields of the item (body of a scrapy.Item subclass; class header not shown here).
# Novel title, taken from the book index page.
title = scrapy.Field()
# Chapter heading, taken from the chapter page.
section = scrapy.Field()
# Full chapter text — all paragraphs joined into one string.
content = scrapy.Field()
spider
import scrapy
from daomubiji.items import DaomubijiItem
class DaomubiSpider(scrapy.Spider):
    """Crawl daomubiji.com book index pages and yield one item per chapter.

    Each item carries the novel title, the chapter heading, and the joined
    chapter text.
    """

    name = 'daomubi'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/dao-mu-bi-ji-2']

    def start_requests(self):
        # Enumerate the eight book index pages (dao-mu-bi-ji-1 .. -8);
        # this overrides start_urls.
        for i in range(1, 9):
            yield scrapy.Request(
                'http://www.daomubiji.com/dao-mu-bi-ji-{}'.format(i),
                callback=self.parse,
            )

    def parse(self, response):
        """Parse one book index page; schedule a request per chapter link."""
        # Novel title shown on the index page.
        title = response.xpath('//h1[@class="focusbox-title"]/text()').extract_first()
        for article in response.xpath('//article[@class="excerpt excerpt-c3"]'):
            detail_href = article.xpath('./a/@href').extract_first()
            if not detail_href:
                # Skip entries without a chapter link; Request(url=None) would raise.
                continue
            # BUG FIX: the original built ONE item here and passed that same
            # instance to every chapter request; concurrent get_content
            # callbacks then overwrote each other's section/content fields.
            # Pass only the title and build a fresh item per chapter instead.
            yield scrapy.Request(
                url=detail_href,
                meta={'title': title},
                callback=self.get_content,
            )

    def get_content(self, response):
        """Parse one chapter page and yield a complete item."""
        item = DaomubijiItem()
        item['title'] = response.meta.get('title')
        # Chapter heading.
        item['section'] = response.xpath('//h1[@class="article-title"]/text()').extract_first()
        # Chapter body: join all paragraph text nodes.
        pages = response.xpath('//article[@class="article-content"]//p/text()').extract()
        item['content'] = ''.join(pages)
        yield item
pipelines
from scrapy.item import Item
import pymongo
class DaomubijiPipeline(object):
    """Persist every scraped item into the local MongoDB instance."""

    def open_spider(self, spider):
        # One client for the whole crawl; database "daomubiji".
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client.daomubiji

    def close_spider(self, spider):
        # Release the connection when the crawl finishes.
        self.client.close()

    def process_item(self, item, spider):
        # NOTE(review): `db.spider.xiaoshuo` addresses the namespaced
        # collection "spider.xiaoshuo" — presumably intentional, but confirm
        # the collection name against what readers of the data expect.
        document = dict(item) if isinstance(item, Item) else item
        self.db.spider.xiaoshuo.insert_one(document)
        return item