Notes: storing data scraped with Scrapy in MySQL and MongoDB

Using Python 2.7.12

I. MongoDB

A small example

1. Spider: dmoz_item.py

import scrapy
from dmoz.items import DmozItem


class DmozItemSpider(scrapy.Spider):

    name = "dmoz_item"
    #allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages/Python/Books/']

    def parse(self, response):
        # Each matched node holds one book entry: link, title and description
        entries = response.xpath('/html/body/div[5]/div/section[3]/div/div/div/div[3]')
        for i in entries:
            item = DmozItem()
            item['link'] = i.xpath('a/@href').extract()
            item['title'] = i.xpath('a/div/text()').extract()
            item['desc'] = i.xpath('div/text()').extract()
            yield item

2. Items: items.py

import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    desc=scrapy.Field()
    link=scrapy.Field()

Now for the main course:
3. Settings: settings.py

ITEM_PIPELINES = {
   'dmoz.pipelines.DmozPipeline': 300,
}

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'spider1'
MONGODB_DOCNAME = 'book_item'
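
These are custom settings: spider1 is the database and book_item is the collection that the pipeline in step 4 writes to. After a crawl, a quick sanity check with pymongo (a standalone sketch, assuming the same local MongoDB) could look like this:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['spider1']['book_item']
# Print how many documents the crawl inserted, plus one sample document.
# count() matches the old pymongo API used elsewhere in these notes.
print(collection.count())
print(collection.find_one())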

4. Finally, the pipeline: pipelines.py
Note: in this approach the settings are imported from scrapy.conf, the old-style import that these notes (old Scrapy on Python 2.7) rely on.

from scrapy.conf import settings
import pymongo


class DmozPipeline(object):

    def __init__(self):
        # Read the MongoDB connection details defined in settings.py
        port = settings['MONGODB_PORT']
        host = settings['MONGODB_HOST']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Convert the item to a plain dict and store it as one document
        book_info = dict(item)
        self.post.insert(book_info)
        return item
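
The pipeline above relies on scrapy.conf, which only exists in old Scrapy releases, and on pymongo's old insert() call. On newer Scrapy and pymongo versions the same pipeline is usually written with from_crawler and insert_one; a minimal sketch under that assumption:

import pymongo


class DmozPipeline(object):

    def __init__(self, host, port, db_name, doc_name):
        self.host = host
        self.port = port
        self.db_name = db_name
        self.doc_name = doc_name

    @classmethod
    def from_crawler(cls, crawler):
        # Read the same custom settings through the crawler instead of scrapy.conf
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.db_name][self.doc_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item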

II. MySQL: a small example

1. Spider: xicidaili.py

# -*- coding: utf-8 -*-
import scrapy
from xiciip.items import XiciipItem

class XicidailiSpider(scrapy.Spider):
    name = "xicidaili"
    allowed_domains = ["xicidaili.com"]
    #start_urls = ['http://zhangjiakou.ganji.com']

    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }

    def start_requests(self):
        reqs=[]

        for i in range(1,3):
            req=scrapy.Request("http://www.xicidaili.com/nn/%s"%i,headers=self.headers)
            reqs.append(req)

        return reqs


    def parse(self, response):
        print ("hahahahahhahah"+response.url)

        ip_list=response.xpath('//table[@id="ip_list"]')

        trs=ip_list[0].xpath('tr')

        items=[]
        # string(td[4]) extracts the concatenated text of the whole cell (see the example after this spider)
        for i in trs[1:]:
            pre_item=XiciipItem()
            pre_item["ip"]=i.xpath('td[2]/text()')[0].extract()
            pre_item["port"]=i.xpath('td[3]/text()')[0].extract()
            pre_item["position"]=i.xpath('string(td[4])')[0].extract().strip()
            pre_item["type"]=i.xpath('td[6]/text()')[0].extract()

            # regex extraction: \. matches a literal dot, \d matches a digit
            pre_item["speed"]=i.xpath('td[7]/div[@class="bar"]/@title').re('\d{0,}\.\d{0,}')[0]
            pre_item["last_check_time"]=i.xpath('td[9]/text()')[0].extract()
            items.append(pre_item)
        return items
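
The string(td[4]) expression above returns the concatenated text of the whole cell, child tags included, which is why it is used for the position column instead of td[4]/text(). A tiny standalone illustration (with made-up HTML):

from scrapy.selector import Selector

sel = Selector(text='<div>Hebei <b>Zhangjiakou</b></div>')
# text() only returns the direct text nodes of the <div>
print(sel.xpath('//div/text()').extract())    # [u'Hebei ']
# string(...) concatenates all descendant text into a single value
print(sel.xpath('string(//div)').extract())   # [u'Hebei Zhangjiakou']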

2. Items: items.py

import scrapy


class XiciipItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    ip=scrapy.Field()
    port=scrapy.Field()
    position=scrapy.Field()
    type=scrapy.Field()
    speed=scrapy.Field()
    last_check_time=scrapy.Field()

3. The main course: settings.py

MYSQL_HOSTS = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '******'
#MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB='xiciip'
CHARSET='utf8'


ITEM_PIPELINES = {
   'xiciip.pipelines.XiciipPipeline': 300,
}
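
The pipeline in step 4 inserts into a proxy table, so that table has to exist in the xiciip database first. These notes do not give the schema, so the column types below are assumptions based on the fields being inserted; a one-off sketch for creating it:

import MySQLdb

# The password is masked here just like in settings.py above.
con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='******',
                      db='xiciip', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS proxy (
        ip              VARCHAR(64),
        port            VARCHAR(16),
        position        VARCHAR(128),
        type            VARCHAR(32),
        speed           VARCHAR(32),
        last_check_time VARCHAR(64)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()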

4. Pipelines: pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb

#### Note: settings imported from scrapy.conf, as in the MongoDB pipeline above
from scrapy.conf import settings


class XiciipPipeline(object):
    def process_item(self, item, spider):

        # DBKWARGS=spider.settings.get('DBKWARGS')
        # con=MySQLdb.connect(**DBKWARGS)


        host = settings['MYSQL_HOSTS']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        c=settings['CHARSET']
        # method 2 (used here): read the connection parameters from settings
        con = MySQLdb.connect(host=host, user=user, passwd=psd, db=db, charset=c)
        # method 1 (also possible): hard-code the connection parameters
        #con = MySQLdb.connect(host='127.0.0.1',user='root',passwd='******',db='xiciip',charset='utf8')
        cur = con.cursor()
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        values = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]

        try:
            cur.execute(sql, values)
        except Exception as e:
            print('Insert error', e)
            con.rollback()

        else:
            con.commit()

        cur.close()
        con.close()

        return item
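
Opening and closing a MySQL connection for every single item works, but it is wasteful on larger crawls. A common refinement is to connect once per spider via open_spider/close_spider; a sketch of the same pipeline reworked that way (same settings, same proxy table):

import MySQLdb
from scrapy.conf import settings


class XiciipPipeline(object):

    def open_spider(self, spider):
        # One connection for the whole crawl instead of one per item
        self.con = MySQLdb.connect(host=settings['MYSQL_HOSTS'],
                                   user=settings['MYSQL_USER'],
                                   passwd=settings['MYSQL_PASSWORD'],
                                   db=settings['MYSQL_DB'],
                                   charset=settings['CHARSET'])
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        sql = ("insert into proxy(ip,port,position,type,speed,last_check_time) "
               "values(%s,%s,%s,%s,%s,%s)")
        values = [item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time']]
        try:
            self.cur.execute(sql, values)
            self.con.commit()
        except Exception as e:
            print('Insert error', e)
            self.con.rollback()
        return item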