python爬虫scrapy连接mongodb_Python爬虫框架Scrapy实例(三)数据存储到MongoDB

Python爬虫框架Scrapy实例(三)数据存储到MongoDB

任务目标:爬取豆瓣电影top250,将数据存储到MongoDB中。

items.py文件

复制代码

# -*- coding: utf-8 -*-

import scrapy

class DoubanItem(scrapy.Item):
    """Container for the data scraped for a single movie entry."""

    title = scrapy.Field()  # movie title
    bd = scrapy.Field()     # basic information
    star = scrapy.Field()   # rating
    quote = scrapy.Field()  # short introduction

复制代码

spiders文件

复制代码

# -*- coding: utf-8 -*-

import scrapy

from douban.items import DoubanItem

class DoubanmovieSpider(scrapy.Spider):
    """Spider for the Douban movie Top 250 list.

    The list is paginated through the ``start`` query parameter; ``parse``
    advances ``offset`` in steps of 25 until it reaches 225.
    """

    name = "doubanmovie"

    allowed_domains = ["movie.douban.com"]

    # Offset of the most recently scheduled page; advanced by ``parse``.
    offset = 0

    url = "https://movie.douban.com/top250?start="

    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        """Yield one ``DoubanItem`` per movie, then a request for the next page.

        Args:
            response: the downloaded list page.

        Yields:
            A ``DoubanItem`` for every ``div.info`` node on the page, followed
            by a ``scrapy.Request`` for the next page while ``offset`` is
            below 225.

        Raises:
            IndexError: if a movie node lacks a title, info or rating text.
        """
        for each in response.xpath('//div[@class="info"]'):
            # Build a fresh item for every movie. Reusing one instance would
            # yield the same mutable object repeatedly and let a previous
            # movie's quote leak into a movie that has none.
            item = DoubanItem()

            # Movie title
            item['title'] = each.xpath('.//span[@class="title"][1]/text()').extract()[0]

            # Basic information
            item['bd'] = each.xpath('.//div[@class="bd"]/p/text()').extract()[0]

            # Rating
            item['star'] = each.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').extract()[0]

            # Short introduction; the field stays unset when the page has no quote.
            quote = each.xpath('.//p[@class="quote"]/span/text()').extract()
            if quote:
                item['quote'] = quote[0]

            yield item

        # Schedule the next page after the current one has been processed.
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

复制代码

pipelines.py文件

复制代码

# -*- coding: utf-8 -*-

import pymongo

from scrapy.conf import settings

class DoubanPipeline(object):
    """Item pipeline that stores every scraped item as a MongoDB document.

    Connection parameters come from the settings keys ``MONGODB_HOST``,
    ``MONGODB_PORT``, ``MONGODB_DBNAME`` and ``MONGODB_SHEETNAME``.
    """

    def __init__(self):
        """Open the MongoDB connection and select the target collection."""
        # NOTE(review): the module-level ``scrapy.conf.settings`` object used
        # here is a legacy API; newer Scrapy releases expect pipelines to get
        # settings through ``from_crawler`` -- confirm the Scrapy version.
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]

        # Keep the client so close_spider can release the connection.
        self.client = pymongo.MongoClient(host=host, port=port)

        # Database, then the collection that receives the documents.
        mydb = self.client[dbname]
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        """Insert a copy of ``item`` into the collection and pass the item on.

        Args:
            item: the scraped item (anything ``dict()`` accepts).
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipelines still receive it.
        """
        data = dict(item)
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.post.insert_one(data)
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.client.close()

复制代码

settings.py文件

复制代码

# -*- coding: utf-8 -*-

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']

NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# The parenthesised comment must be closed for the header value to be well-formed.
USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB host name
MONGODB_HOST = "127.0.0.1"

# MongoDB port
MONGODB_PORT = 27017

# Database name
MONGODB_DBNAME = "Douban"

# Name of the collection that stores the scraped data
MONGODB_SHEETNAME = "doubanmovies"

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值