Notes: Storing data crawled with Scrapy in MySQL and MongoDB

This post walks through a small example of using a Scrapy spider to crawl a page and store the scraped data in MongoDB and in MySQL, covering the spider definition, the item structure, the settings, and the item pipeline for each case.
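For orientation, both examples assume a standard project created with scrapy startproject (named dmoz for the MongoDB case and xiciip for the MySQL case). The layout sketched below is the usual generated structure; only the file names that appear later in this post are taken from it, the rest is the default scaffold.

dmoz/
    scrapy.cfg
    dmoz/
        __init__.py
        items.py          # DmozItem
        pipelines.py      # DmozPipeline
        settings.py       # MongoDB connection settings
        spiders/
            __init__.py
            dmoz_item.py  # DmozItemSpider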


Python version: 2.7.12

Part 1: MongoDB

A small example:

1. Spider: dmoz_item.py

import scrapy
from dmoz.items import DmozItem

class DmozItemSpider(scrapy.Spider):

    name = "dmoz_item"
    #allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages/Python/Books/']

    def parse(self, response):
        # Each matched node holds one book entry: a link, a title and a description
        entries = response.xpath('/html/body/div[5]/div/section[3]/div/div/div/div[3]')
        for i in entries:
            item = DmozItem()
            item['link'] = i.xpath('a/@href').extract()
            item['title'] = i.xpath('a/div/text()').extract()
            item['desc'] = i.xpath('div/text()').extract()
            yield item

2. Items: items.py

import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    desc=scrapy.Field()
    link=scrapy.Field()

Now for the main course:
3. Settings: settings.py

ITEM_PIPELINES = {
   'dmoz.pipelines.DmozPipeline': 300,
}

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'spider1'
MONGODB_DOCNAME = 'book_item'

4. Finally, the pipeline: pipelines.py
Note: with this approach the settings are imported from scrapy.conf (an alternative for newer Scrapy versions is sketched after the code).

from scrapy.conf import settings
import pymongo

class DmozPipeline(object):

    def __init__(self):
        # Read the MongoDB connection info from settings.py and open the collection
        port = settings['MONGODB_PORT']
        host = settings['MONGODB_HOST']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Convert the item to a plain dict and store it as one document
        book_info = dict(item)
        self.post.insert(book_info)
        return item
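Run it with scrapy crawl dmoz_item. Note that scrapy.conf was removed in later Scrapy releases, and insert is deprecated in recent pymongo. As a hedged sketch (not part of the original example), the same pipeline can read its settings through the from_crawler hook and use insert_one instead:

import pymongo

class DmozMongoPipeline(object):
    # Sketch for newer Scrapy: settings come from the crawler instead of scrapy.conf
    def __init__(self, host, port, db_name, doc_name):
        self.host = host
        self.port = port
        self.db_name = db_name
        self.doc_name = doc_name

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def open_spider(self, spider):
        # Connect once when the spider starts
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.db_name][self.doc_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one replaces the deprecated insert in recent pymongo versions
        self.post.insert_one(dict(item))
        return item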

Part 2: MySQL - a small example

1. Spider: xicidaili.py

# -*- coding: utf-8 -*-
import scrapy
from xiciip.items import XiciipItem

class XicidailiSpider(scrapy.Spider):
    name = "xicidaili"
    allowed_domains = ["xicidaili.com"]
    #start_urls = ['http://zhangjiakou.ganji.com']

    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }

    def start_requests(self):
        reqs=[]

        for i in range(1,3):
            req=scrapy.Request("http://www.xicidaili.com/nn/%s"%i,headers=self.headers)
            reqs.append(req)

        return reqs


    def parse(self, response):
        print ("parsing: " + response.url)

        ip_list = response.xpath('//table[@id="ip_list"]')
        trs = ip_list[0].xpath('tr')

        items = []
        # string(td[4]) extracts the full text content of the cell
        for i in trs[1:]:
            pre_item = XiciipItem()
            pre_item["ip"] = i.xpath('td[2]/text()')[0].extract()
            pre_item["port"] = i.xpath('td[3]/text()')[0].extract()
            pre_item["position"] = i.xpath('string(td[4])')[0].extract().strip()
            pre_item["type"] = i.xpath('td[6]/text()')[0].extract()

            # Regex extraction: \. matches a literal dot, \d matches a digit
            pre_item["speed"] = i.xpath('td[7]/div[@class="bar"]/@title').re('\d{0,}\.\d{0,}')[0]
            pre_item["last_check_time"] = i.xpath('td[9]/text()')[0].extract()
            items.append(pre_item)
        return items

2. Items: items.py

import scrapy


class XiciipItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    ip=scrapy.Field()
    port=scrapy.Field()
    position=scrapy.Field()
    type=scrapy.Field()
    speed=scrapy.Field()
    last_check_time=scrapy.Field()

3. The main course: settings.py

MYSQL_HOSTS = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '******'
#MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB='xiciip'
CHARSET='utf8'


ITEM_PIPELINES = {
   'xiciip.pipelines.XiciipPipeline': 300,
}
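The pipeline below inserts into a proxy table in the xiciip database, so that table has to exist before the spider runs. A minimal sketch of creating it with MySQLdb follows; the column types and lengths here are assumptions, not taken from the original post:

import MySQLdb

con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='******', db='xiciip', charset='utf8')
cur = con.cursor()
# One VARCHAR column per field of XiciipItem; adjust types/lengths as needed
cur.execute(
    "CREATE TABLE IF NOT EXISTS proxy ("
    "  ip VARCHAR(32),"
    "  port VARCHAR(16),"
    "  position VARCHAR(64),"
    "  type VARCHAR(16),"
    "  speed VARCHAR(16),"
    "  last_check_time VARCHAR(32)"
    ")"
)
cur.close()
con.close()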

4. Pipelines: pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb

#### Note: settings are imported from scrapy.conf here as well
from scrapy.conf import settings


class XiciipPipeline(object):
    def process_item(self, item, spider):

        # Alternative: read a whole dict of connection kwargs from the settings
        # DBKWARGS=spider.settings.get('DBKWARGS')
        # con=MySQLdb.connect(**DBKWARGS)

        host = settings['MYSQL_HOSTS']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        c = settings['CHARSET']

        # Method 2: build the connection from the values in settings.py
        con = MySQLdb.connect(host=host, user=user, passwd=psd, db=db, charset=c)
        # Method 1 (also works): hard-code the connection parameters
        # con = MySQLdb.connect(host='127.0.0.1',user='root',passwd='******',db='xiciip',charset='utf8')
        cur = con.cursor()
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        values = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]

        try:
            # Parameterized query: MySQLdb fills in the %s placeholders safely
            cur.execute(sql, values)
        except Exception, e:
            print('Insert error', e)
            con.rollback()
        else:
            con.commit()

        cur.close()
        con.close()

        return item
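Opening and closing a MySQL connection for every single item works for a small crawl like this, but it is wasteful. As a hedged sketch of the same pipeline (not the original code), the connection can be kept open for the whole run using Scrapy's standard open_spider/close_spider hooks:

import MySQLdb
from scrapy.conf import settings

class XiciipPipeline(object):
    def open_spider(self, spider):
        # Connect once when the spider starts
        self.con = MySQLdb.connect(host=settings['MYSQL_HOSTS'],
                                   user=settings['MYSQL_USER'],
                                   passwd=settings['MYSQL_PASSWORD'],
                                   db=settings['MYSQL_DB'],
                                   charset=settings['CHARSET'])
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        # Clean up once when the spider finishes
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        values = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]
        try:
            self.cur.execute(sql, values)
        except Exception, e:
            print('Insert error', e)
            self.con.rollback()
        else:
            self.con.commit()
        return item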