scrapy的管道设计

最新推荐文章于 2023-02-01 07:49:50 发布
qh0526wy
最新推荐文章于 2023-02-01 07:49:50 发布
阅读量503
点赞数
分类专栏： Python
本文链接：https://blog.csdn.net/qh0526wy/article/details/122275055
版权
python 开发语言后端
Python 专栏收录该内容
51 篇文章 8 订阅
订阅专栏
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
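# NOTE: all data scraped by one Scrapy crawl must be stored in the same database; storing into several databases is not supported.  -- 阙辉 (Que Hui)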
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os
import QH_Sctock_001_Scrapy.qh_py_file.qh_sqlite_ORM as qh_db_orm  #引入excel配置
import QH_Sctock_001_Scrapy.qh_py_file.qh_get_excel_settings as qh_ges  #引入excel配置
import QH_Sctock_001_Scrapy.qh_py_file.qh_spider_tool as qh_tool  #引入工具

class QhSctock001ScrapyPipeline:
    """Buffer scraped rows per spider id and store them in SQLite in batches.

    Every item is expected to carry two keys: ``"qh_spider_id"`` (the id of
    the logical spider that produced the item) and ``"qh_new_fied"`` (the row
    to store).  Rows are accumulated in ``self.qh_oo`` for as long as the
    spider id stays the same.  When the id changes, or when the spider
    closes, the buffered rows are handed to the storage helper and committed.

    The database connection is opened once, using the configuration of the
    first spider id seen, and is reused afterwards.  All spider ids handled
    in one crawl must therefore target the same database.
    """

    def open_spider(self, spider):
        """Reset the per-crawl state when the spider starts."""
        self.qh_oo = []              # rows buffered for the current spider id
        self.qh_spider_id = ""       # spider id carried by the most recent item
        self.qh_spider_id_bool = ""  # spider id the buffered rows belong to
        # Stays None until the first item arrives; doubles as the
        # "not initialised yet" marker so close_spider can tell that there
        # is nothing to flush or close.
        self.qh_cunshu_db = None

    def process_item(self, item, spider):
        """Buffer the item's row, flushing first if the spider id changed.

        Args:
            item: mapping with the keys ``"qh_spider_id"`` and
                ``"qh_new_fied"``.
            spider: the running spider (unused).

        Returns:
            The item, unchanged, so later pipelines can process it.
        """
        current_id = item["qh_spider_id"]
        self.qh_spider_id = current_id

        if self.qh_cunshu_db is None:
            # First item of the crawl: load the storage settings and open
            # the database (the ORM helper is expected to create it when it
            # does not exist yet -- per the original author's note).
            self.qh_spider_id_bool = current_id
            self._load_db_config()
            self.qh_cunshu_db = qh_db_orm.QH_SQLite_DB_CunChu(self.qh_db_path)
        elif current_id != self.qh_spider_id_bool:
            # The spider id changed: persist what the previous spider
            # collected (still using its table settings), then switch the
            # settings over to the new spider.  The connection is reused.
            self._flush_buffer()
            self.qh_oo = []
            self.qh_spider_id_bool = current_id
            self._load_db_config()

        self.qh_oo.append(item["qh_new_fied"])
        return item

    def close_spider(self, spider):
        """Store the remaining buffered rows and close the database."""
        if self.qh_cunshu_db is None:
            # No item was ever processed, so no connection was opened and
            # no table settings were loaded: nothing to store or close.
            return
        try:
            self._flush_buffer()
        finally:
            # Release the connection even when the final write fails.
            self.qh_cunshu_db.QH_DB_Close()

    def _load_db_config(self):
        """Load the storage settings for ``self.qh_spider_id_bool``."""
        # The settings lookup is rooted at the current working directory.
        qh_path = os.path.abspath(os.path.join(os.getcwd(), "."))
        qh_db_lujing = qh_ges.QH_Get_DB_CanShu(qh_path, self.qh_spider_id_bool)

        self.qh_db_path = qh_db_lujing["QH_DB_Path"]              # 1. database path
        self.qh_db_name = qh_db_lujing["QH_DB_Name"]              # 2. database name
        self.qh_table_name = qh_db_lujing["QH_Table_Name"]        # 3. table name
        self.qh_filed_set = qh_db_lujing["qh_Filed_Set"]          # 4. table field settings list
        self.qh_filed_value = qh_db_lujing["qh_Filed_Value"]      # 5. table field names
        self.qh_id_type = qh_db_lujing["QH_Id_Type"]              # 6. how the row ID is assembled (type)
        self.qh_id_type_filed = qh_db_lujing["QH_Id_Type_Filed"]  # 7. what the row ID is assembled from

    def _flush_buffer(self):
        """Hand the buffered rows to the storage helper and commit them."""
        self.qh_cunshu_db.QH_DT_ChuanCan(
            Qh_Table_Name=self.qh_table_name,
            QH_Table_Set=self.qh_filed_set,
            qh_filed_key=self.qh_filed_value,
            qh_spider_id=str(self.qh_spider_id_bool),
            Qh_ShuJu_Data=self.qh_oo,
            Qh_id_lie_c=self.qh_id_type_filed,
            Qh_ID_Type=self.qh_id_type,
        )
        self.qh_cunshu_db.QH_CunChu_Data_Main()
        self.qh_cunshu_db.QH_Cimmit()