Scraping the Daomu Biji Novel with Scrapy

Use the Scrapy framework to crawl chapter data of the novel Daomu Biji (盗墓笔记) and store it in a MongoDB database.

# settings: configure the MongoDB connection
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'MySpider'
MONGODB_DOCNAME = 'daomubiji'
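For the pipeline shown later to actually receive items, it also has to be enabled in settings.py. A minimal sketch, assuming the project module is named novel (matching the novel.items import used by the spider):

# settings: enable the pipeline (the path assumes the project module is "novel")
ITEM_PIPELINES = {
    'novel.pipelines.NovelPipeline': 300,
}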
# items: define the fields to scrape
import scrapy

class NovelItem(scrapy.Item):
    bookName = scrapy.Field()     # series name, taken from the page heading
    bookTitle = scrapy.Field()    # volume title, first token of the chapter link text
    chapterNum = scrapy.Field()   # chapter number, second token
    chapterName = scrapy.Field()  # chapter name, third token
    chapterUrl = scrapy.Field()   # link to the chapter page
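Item objects support dict-style access, which is what lets the pipeline below convert one with dict(item) before inserting it. A quick illustration:

item = NovelItem()
item['bookName'] = '盗墓笔记'
item['chapterName'] = '血尸'
print(dict(item))  # {'bookName': '盗墓笔记', 'chapterName': '血尸'}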
# spider: crawl the site and extract chapter data
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from novel.items import NovelItem

class DaomubijiSpider(CrawlSpider):
    name = 'daomubiji'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    # Follow every book link on the index page and hand each book page
    # to parse_item.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//article[@class="article-content"]//a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for body in response.xpath('//body'):
            # The page heading looks like "盗墓笔记1:七星鲁王宫";
            # keep only the part before the full-width colon.
            book_name = body.xpath('.//h1[@class="focusbox-title"]/text()').get()
            if not book_name:
                continue
            book_name = book_name.split(':')[0]
            for article in body.xpath('.//div[@class="excerpts"]//article'):
                # Link text looks like "七星鲁王 第一章 血尸";
                # split it once instead of re-extracting it three times.
                text = article.xpath('.//a/text()').get()
                if not text:
                    continue
                parts = text.split(' ')
                if len(parts) < 3:
                    continue
                # Create a fresh item per chapter; reusing one instance
                # across yields would mutate items already handed off.
                item = NovelItem()
                item['bookName'] = book_name
                item['bookTitle'] = parts[0]
                item['chapterNum'] = parts[1]
                item['chapterName'] = parts[2]
                item['chapterUrl'] = article.xpath('.//a/@href').get()
                yield item
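The spider is started from the project root with Scrapy's command-line tool, using the name defined above:

scrapy crawl daomubiji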

# pipeline: store items in MongoDB
import pymongo

class NovelPipeline:

    # scrapy.conf has been removed from Scrapy; read the settings
    # through from_crawler instead of importing them at module level.
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            host=settings.get('MONGODB_HOST'),
            port=settings.get('MONGODB_PORT'),
            db_name=settings.get('MONGODB_DBNAME'),
            doc_name=settings.get('MONGODB_DOCNAME'),
        )

    def __init__(self, host, port, db_name, doc_name):
        self.client = pymongo.MongoClient(host=host, port=port)
        db = self.client[db_name]
        self.post = db[doc_name]

    def open_spider(self, spider):
        print('This spider is starting!')

    def process_item(self, item, spider):
        book_info = dict(item)
        # pymongo's insert() was removed in favor of insert_one().
        self.post.insert_one(book_info)
        return item

    def close_spider(self, spider):
        self.client.close()
        print('This spider has finished!')
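After a run finishes, a quick pymongo query confirms the documents landed in the collection. A minimal sketch, assuming the connection settings above:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['MySpider']['daomubiji']

print(collection.count_documents({}))   # total chapters stored
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc['bookName'], doc['chapterNum'], doc['chapterName'])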