Scrapy爬虫数据存到mongodb中
其实主要是前两步
1、在settings.py中进行配置
# Register the MongoDB pipeline; the number (0-1000) is the pipeline order,
# lower values run earlier.
ITEM_PIPELINES = {
'dmoz.pipelines.DmozPipeline': 300,
}
# MongoDB connection settings read by the pipeline.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'spider1'      # database name
MONGODB_DOCNAME = 'book_item'   # collection name
2、修改pipeline文件
首先初始化获得settings中的mongodb数据库信息,在process_item提交插入数据。
注意这里的 settings 来自 scrapy.conf 模块(见下面的 from scrapy.conf import settings)。该模块在新版 Scrapy 中已被弃用,推荐改为在 pipeline 中实现 from_crawler 类方法,通过 crawler.settings 读取配置。
from scrapy.conf import settings
import pymongo
class DmozPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Connection parameters are read from the Scrapy settings:
    MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME, MONGODB_DOCNAME.
    """

    def __init__(self):
        port = settings['MONGODB_PORT']
        host = settings['MONGODB_HOST']
        db_name = settings['MONGODB_DBNAME']
        # Keep the client on the instance so close_spider can release it.
        self.client = pymongo.MongoClient(host=host, port=port)
        db = self.client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        """Insert a copy of *item* into the collection and pass it on.

        Returning the item keeps it flowing to any later pipelines.
        """
        book_info = dict(item)
        # Collection.insert() was deprecated in PyMongo 3.x and removed
        # in 4.x; insert_one() is the supported replacement.
        self.post.insert_one(book_info)
        return item

    def close_spider(self, spider):
        # Release the MongoDB connection pool when the spider finishes.
        self.client.close()
3.dmoz_item.py
import scrapy
from dmoz.items import DmozItem
class DmozItemSpider(scrapy.Spider):
    """Spider that scrapes book entries (title, description, link)
    from the DMOZ Python books listing page.
    """

    name = "dmoz_item"
    # allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages/Python/Books/']

    def parse(self, response):
        """Yield one DmozItem per book entry found on the page."""
        # NOTE(review): this absolute XPath is brittle — any layout change
        # on the page breaks it; prefer a class/id-based selector if one
        # exists. Confirm against the live page.
        entries = response.xpath('/html/body/div[5]/div/section[3]/div/div/div/div[3]')
        for entry in entries:
            item = DmozItem()
            # .extract() returns a list of matched strings (possibly empty).
            item['link'] = entry.xpath('a/@href').extract()
            item['title'] = entry.xpath('a/div/text()').extract()
            item['desc'] = entry.xpath('div/text()').extract()
            yield item
4.items.py
import scrapy
class DmozItem(scrapy.Item):
    """Container for one scraped book entry.

    Fields:
        title: book title text(s) extracted from the page.
        desc:  description text(s).
        link:  href value(s) of the book link.
    """

    title = scrapy.Field()
    desc = scrapy.Field()
    link = scrapy.Field()
运行这个爬虫就OK了