scrapy保存数据到excel:利用openpyxl创建多张表,设置Excel行数限制

在处理excel数据时发现了xlwt的局限性–不能写入超过65535行、256列的数据(因为它只支持Excel 2003及之前的版本,在这些版本的Excel中行数和列数有此限制)如果数据量超过65535就会遇到:ValueError: row index was 65536, not allowed by .xls format,openpyx支持07/10/13版本Excel的,最大行数达到1048576。 

 使用Excel导出数据,即时保存数据是一个问题,每插入一条数据,先加载,然后保存,操作次数频繁,影响爬虫效率,但是scrapy中途停止时,数据损失小,如果每一张表,数据达到限定的数据量保存,相比前一种方法,爬虫运行效率高,但是需要停止爬虫时,最后一张表的数据没有达到最大数据量时,整个表的数据会丢失,虽然使用了关闭爬虫是保存文件,或异常时保存文件,但是没有效果,后续有时间再继续探索

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy import Item
from .items import QuestionItem, AnswerItem
from openpyxl.workbook import Workbook
from openpyxl import load_workbook
import os
import random
import string


class BaiduZhidaoPipeline(object):
    def process_item(self, item, spider):
        return item

class MongoDBPipeline(object):
    def open_spider(self, spider):
        db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
        db_name = spider.settings.get('MONGODB_DB_NAME', 'baiduzhidao')
        self.db_client = MongoClient(db_uri)
        self.db = self.db_client[db_name]


    def close_spider(self, spider):
        self.db_client.close()

    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    def insert_db(self, item):
        if isinstance(item, Item):
            if isinstance(item, QuestionItem):
                item = dict(item)
                self.db['baiduzhidao_question'].insert_one(item)
            elif isinstance(item, AnswerItem):
                item = dict(item)
                self.db['baiduzhidao_answer'].insert_one(item)


class ExcelPipeline(object):
    def open_spider(self, spider):
        ##设置Excel数据条数
        self.item_num_max = 65535
        self.item_num = 0
        self.file = ''


    def close_spider(self, spider):
        print("Excel数据保存结束--------------------------------------------------")
        self.wb.save(self.file)

    def process_item(self, item, spider):
        try:
            if self.item_num == 0:
                self.create_excel()
            elif self.item_num > self.item_num_max:
                ## 数据大于最大行数,保存Excel,重新创建
                self.wb.save(self.file)
                self.create_excel()
                self.item_num = 0
            # self.load_sheet()
            if isinstance(item, Item):
                if isinstance(item, QuestionItem):
                    item = dict(item)
                    print("Pipline-----------------------", item)
                    data = [item["question_id"], item["question"], item["question_all"], item["url"], " ", " ", " ",
                            " ", " ", " ", " ", item["create_time"]]
                elif isinstance(item, AnswerItem):
                    item = dict(item)
                    print("Pipline-----------------------", item)
                    data = [item["question_id"], " ", " ", " ", item["answer"], item["evaluate_num"],
                            item["keyword_nums"], item["best_answer"], item["username"], item["published_time"],
                            item["answer_id"], item["create_time"]]
                self.ws.append(['%s' % i for i in data])
                self.item_num += 1
                print(self.item_num,"数据导出--------------------------------------------------------------------")
            # self.wb.save(self.file)
            return item
        except:
            print("Excel异常结束-------------------------------------------")
            self.ws.save(self.file)
            return item

    def create_excel(self):
        self.file = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + ".xlsx"
        item = [u'问题id', u'问题', '答案个数', u'链接', u'答案排序', u'答案', u'点赞数', u'关键词出现次数', u'是否是最佳答案', u'用户名称', u'发布时间', u'答案id', u'创建时间']
        self.wb = Workbook(write_only=True)
        self.ws = self.wb.create_sheet()
        self.ws.append(['%s' % i for i in item])
        # self.wb.save(self.file)


    def load_sheet(self):
        self.wb = load_workbook(self.file)
        self.ws = self.wb['Sheet']








创建一张表,创建多个工作簿

 

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy import Item
from .items import QuestionItem, AnswerItem
from openpyxl.workbook import Workbook
from openpyxl import load_workbook
import os
import random
import string


class BaiduZhidaoPipeline(object):
    def process_item(self, item, spider):
        return item

class MongoDBPipeline(object):
    def open_spider(self, spider):
        db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
        db_name = spider.settings.get('MONGODB_DB_NAME', 'baiduzhidao')
        self.db_client = MongoClient(db_uri)
        self.db = self.db_client[db_name]


    def close_spider(self, spider):
        self.db_client.close()

    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    def insert_db(self, item):
        if isinstance(item, Item):
            if isinstance(item, QuestionItem):
                item = dict(item)
                self.db['baiduzhidao_question'].insert_one(item)
            elif isinstance(item, AnswerItem):
                item = dict(item)
                self.db['baiduzhidao_answer'].insert_one(item)


class ExcelPipeline(object):
    def open_spider(self, spider):
        ##设置Excel数据条数
        self.item_num_max = 20
        self.item_num = 0
        self.file = ''


    def close_spider(self, spider):
        print("数据保存结束--------------------------------------------------")
        self.wb.save(self.file)

    def process_item(self, item, spider):
        try:
            if self.item_num == 0:
                self.create_excel()
            elif self.item_num > self.item_num_max:
                ## 数据大于最大行数,保存Excel,重新创建
                self.wb.save(self.file)
                self.create_excel()
                self.item_num = 0
            # self.load_sheet()
            if isinstance(item, Item):
                if isinstance(item, QuestionItem):
                    item = dict(item)
                    print("Pipline-----------------------", item)
                    data = [item["question_id"], item["question"], item["question_all"], item["url"], " ", " ", " ",
                            " ", " ", " ", " ", item["create_time"]]
                elif isinstance(item, AnswerItem):
                    item = dict(item)
                    print("Pipline-----------------------", item)
                    data = [item["question_id"], " ", " ", " ", item["answer"], item["evaluate_num"],
                            item["keyword_nums"], item["best_answer"], item["username"], item["published_time"],
                            item["answer_id"], item["create_time"]]
                self.ws.append(['%s' % i for i in data])
                self.item_num += 1
                print(self.item_num,"数据导出--------------------------------------------------------------------")
            # self.wb.save(self.file)
            return item
        except:
            print("结束111111111111111")
            self.ws.save(self.file)
            return item

    def create_excel(self):
        self.file = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + ".xlsx"
        item = [u'问题id', u'问题', '答案个数', u'链接', u'答案排序', u'答案', u'点赞数', u'关键词出现次数', u'是否是最佳答案', u'用户名称', u'发布时间', u'答案id', u'创建时间']
        self.wb = Workbook(write_only=True)
        self.ws = self.wb.create_sheet()
        self.ws.append(['%s' % i for i in item])
        # self.wb.save(self.file)


    def load_sheet(self):
        self.wb = load_workbook(self.file)
        self.ws = self.wb['Sheet']








 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值