scrapy百度POI爬虫实战项目代码(三)

pipelines.py -------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import sqlite3
from copy import deepcopy
from twisted.enterprise import adbapi
class MapbarspiderPipeline:
    """Scrapy item pipeline that buffers scraped POI rows and bulk-inserts
    them into a SQLite database through a Twisted adbapi connection pool.

    Rows accumulate on the spider itself (``spider.resp_items_list``) and are
    flushed in batches of ``BULK_SIZE`` during the crawl; any remainder is
    flushed when the spider closes.
    """

    # Number of buffered items that triggers a bulk flush.
    BULK_SIZE = 1000

    def __init__(self, sqlite_file, sqlite_table):
        # Path to the SQLite database file and the target table name.
        self.sqlite_file = sqlite_file
        self.sqlite_table = sqlite_table
        # Twisted adbapi pool so inserts run off the reactor thread.
        # check_same_thread=False is required because the pool's worker
        # threads differ from the thread that opened the connection.
        self.dbpool = adbapi.ConnectionPool('sqlite3',
                                            database=self.sqlite_file,
                                            check_same_thread=False)
        # Plain connection used only for one-off table creation in
        # open_spider; None until then (was a meaningless object() before).
        self.conn = None
        self.cur = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from crawler settings.

        Reads SQLITE_DB_NAME (required) and SQLITE_TABLE (default 'items')
        from settings.py.
        """
        return cls(
            sqlite_file=crawler.settings.get('SQLITE_DB_NAME'),
            sqlite_table=crawler.settings.get('SQLITE_TABLE', 'items')
        )

    def open_spider(self, spider):
        """Create the target table if it does not already exist.

        BUGFIX: the table name was hard-coded as ``mapbar`` while the insert
        statements use ``self.sqlite_table`` (default ``items``); the two are
        now the same table.
        """
        create_tb_cmd = '''
            CREATE TABLE IF NOT EXISTS {0}
            (city_code varchar(255),
             city_name varchar(255),
             root_sort varchar(255),
             item_sort varchar(255),
             item_href varchar(255),
             uid varchar(255),
             area_name varchar(255),
             address_name varchar(255),
             address_addr varchar(255),
             address_lon double,
             address_lat double
            );
        '''.format(self.sqlite_table)
        try:
            self.conn = sqlite3.connect(self.sqlite_file)
            try:
                self.conn.execute(create_tb_cmd)
                self.conn.commit()
            finally:
                # Close even if the DDL fails; the pool handles all inserts.
                self.conn.close()
        except sqlite3.Error:
            print("Create table failed")

    def close_spider(self, spider):
        """Flush any remaining buffered items when the spider finishes."""
        if spider.resp_items_list:
            # Deep-copy so clearing the buffer below cannot race with the
            # asynchronous insert running in the pool's worker thread.
            bulkdata = deepcopy(spider.resp_items_list)
            self.dbpool.runInteraction(self.bulk_insert_to_sqlite,
                                       spider.itemkeys, bulkdata)
            # Clear the buffer in place.
            del spider.resp_items_list[:]

    def process_item(self, item, spider):
        """Flush the spider's buffer once it reaches BULK_SIZE items.

        Items themselves are appended to spider.resp_items_list by the
        spider; this pipeline only triggers the periodic flush.
        Uses >= (not ==) so an overshoot cannot disable flushing forever.
        """
        if len(spider.resp_items_list) >= self.BULK_SIZE:
            bulkdata = deepcopy(spider.resp_items_list)
            self.dbpool.runInteraction(self.bulk_insert_to_sqlite,
                                       spider.itemkeys, bulkdata)
            # Clear the buffer in place.
            del spider.resp_items_list[:]
        return item

    def bulk_insert_to_sqlite(self, tx, itemkeys, bulkdata):
        """Run inside the pool's transaction: executemany one batch.

        tx        -- adbapi transaction (cursor-like, has executemany)
        itemkeys  -- ordered column names matching each row tuple
        bulkdata  -- list of row tuples to insert
        """
        insert_sql = "insert into {0}({1}) values ({2})".format(
            self.sqlite_table,
            ', '.join(itemkeys),
            ', '.join(['?'] * len(itemkeys)))
        try:
            tx.executemany(insert_sql, bulkdata)
        except sqlite3.Error as why:
            # Best-effort logging; do not kill the crawl over one bad batch.
            print(why.args[0])
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值