Scrapy architecture diagram
First, you need to understand how Scrapy works. I won't explain that here; there are plenty of resources online that cover it.
Declaring item objects
Once you have decided which fields need to be scraped, define them in items.py:
import scrapy


class IwatchImgItem(scrapy.Item):
    """
    Item definition for downloading the product images.
    In order: image URLs, image objects, saved image paths, and the id returned after the insert.
    """
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_paths = scrapy.Field()
    id = scrapy.Field()


class IwatchBrandItem(IwatchImgItem):
    """
    Brand field definitions.
    In order: brand Chinese name, brand English name, brand URL, brand image URL,
    brand trademark image URL, brand introduction.
    """
    brand_chname = scrapy.Field()
    brand_enname = scrapy.Field()
    brand_url = scrapy.Field()
    brand_img_url = scrapy.Field()
    brand_trademark_url = scrapy.Field()
    brand_introduction = scrapy.Field()


class IwatchSerieItem(IwatchImgItem):
    """
    Field definitions for a brand's series.
    In order: brand name, series name, series URL, series introduction, series image URL.
    """
    brand_chname = scrapy.Field()
    serie_name = scrapy.Field()
    serie_url = scrapy.Field()
    serie_introduction = scrapy.Field()
    serie_img_url = scrapy.Field()


class IwatchwatchItem(IwatchImgItem):
    """
    Field definitions for the watches (products) within a series.
    In order: watch name, watch URL, watch image URL, price in USD, price in HKD, price in EUR,
    basic info, movement, exterior, functions.
    """
    iwatch_name = scrapy.Field()
    iwatch_url = scrapy.Field()
    iwatch_img_url = scrapy.Field()
    iwatch_price_yuan = scrapy.Field()
    iwatch_price_hkd = scrapy.Field()
    iwatch_price_euro = scrapy.Field()
    iwatch_base = scrapy.Field()
    iwatch_mechanism = scrapy.Field()
    iwatch_exterior = scrapy.Field()
    iwatch_fuction = scrapy.Field()
There are four item classes here. IwatchBrandItem, IwatchSerieItem and IwatchwatchItem all inherit from IwatchImgItem, because all three need the project's images downloaded and pushed to OSS.
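The image_urls / images / image_paths fields in IwatchImgItem line up with what Scrapy's built-in ImagesPipeline expects. A minimal settings.py sketch for enabling image downloads would look roughly like this (the store path below is a placeholder, not from the original project; the OSS upload itself is handled later by a custom pipeline):

# settings.py (sketch; values are placeholders)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # built-in image downloading pipeline
}
IMAGES_STORE = '/tmp/iwatch_images'     # where downloaded images are written locally
IMAGES_URLS_FIELD = 'image_urls'        # item field holding the image URLs (Scrapy default)
IMAGES_RESULT_FIELD = 'images'          # item field receiving the download results (Scrapy default)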
Writing the spider
import scrapy
from spider_iwatch.items import IwatchBrandItem, IwatchSerieItem, IwatchwatchItem


class IwatchSpider(scrapy.Spider):
    name = 'iwatch'  # spider name, must be unique
    allowed_domains = ['']  # domains the spider is allowed to crawl
    start_urls = ['']  # the start URL; removed here for confidentiality

    def parse(self, response):
        div_list = response.xpath(".//div[@class='item_one']")
        for div in div_list:
            li_list = div.xpath(".//div[@class='list_main']//li")
            for li in li_list:
                # parsing details omitted
                brandItem = IwatchBrandItem()
                follow_serie_url = li.xpath(".//div[@class='item_btn']/a/@href").extract_first()
                yield scrapy.Request(follow_serie_url, callback=self.parse_serie,
                                     meta={'brand_chname': brand_chname}, priority=30)
                yield brandItem

    def parse_serie(self, response):
        # parsing details omitted
        a_list = response.xpath(".//div[@class='c_wc_list']//a")
        for a in a_list:
            serie_url = a.xpath("./@href").extract_first().replace(".html", "_1.html")
            yield scrapy.Request(serie_url, callback=self.parse_iwatch_page_first)
            yield serieitem

    def parse_iwatch_page_first(self, response):
        li_list = response.xpath(".//div[@class='w_class_list']//li")
        for li in li_list:
            iwatch_url = li.xpath("./a/@href").extract_first()
            yield scrapy.Request(iwatch_url, callback=self.parse_iwatch)
        try:
            page = response.xpath(".//div[@class='manu']/a/text()")[-2].extract()
        except:
            page = 1
        for i in range(1, int(page)):
            next_link = response.url.replace("_1.html", "_{}.html".format(str(i + 1)))
            yield scrapy.Request(next_link, callback=self.parse_iwatch_page_next)

    def parse_iwatch_page_next(self, response):
        li_list = response.xpath(".//div[@class='w_class_list']//li")
        for li in li_list:
            iwatch_url = li.xpath("./a/@href").extract_first()
            yield scrapy.Request(iwatch_url, callback=self.parse_iwatch)

    def parse_iwatch(self, response):
        watchitem = IwatchwatchItem()
        # parsing details omitted
        yield watchitem
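One detail worth calling out from the spider above: parse() attaches the brand name to the follow-up request through meta, and the callback reads it back from response.meta. A minimal, self-contained sketch of that pattern (all names and URLs here are illustrative, not taken from the project):

import scrapy

class MetaDemoSpider(scrapy.Spider):
    # Illustrative only: shows the meta hand-off used between parse() and parse_serie() above.
    name = 'meta_demo'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # Attach a value to the next request; priority bumps it up in the scheduler queue.
        yield scrapy.Request('http://example.com/next',
                             callback=self.parse_next,
                             meta={'brand_chname': 'some-brand'},
                             priority=30)

    def parse_next(self, response):
        # The meta dict travels with the request and is exposed again on the response.
        brand_chname = response.meta['brand_chname']
        self.logger.info('got brand: %s', brand_chname)

The spider itself is started with the usual command, using the name attribute: scrapy crawl iwatch.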
Processing the data in the pipeline
pipeline.py code:
import hashlib, logging
import os, oss2, requests
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from spider_iwatch.items import IwatchBrandItem, IwatchSerieItem, IwatchwatchItem
from scrapy.exceptions import DropItem, NotConfigured
from functools import reduce
from spider_iwatch.common.iwatch_sql import DBHelper
from spider_iwatch.common import config
from spider_iwatch.utils.redis_filter.request_filter import RequestFilter


class IwatchDataMysqlPipeline(object):
    def __init__(self):
        self.rf = RequestFilter()

    def process_item(self, item, spider):
        # Save brand information
        if isinstance(item, IwatchBrandItem):
            brand_id = DBHelper().get_brand_id(item['brand_chname'])
            if brand_id == False:
                image_paths = reduce(lambda x, y: x + ';' + y, item['image_paths'])