在处理excel数据时发现了xlwt的局限性–不能写入超过65535行、256列的数据(因为它只支持Excel 2003及之前的版本,在这些版本的Excel中行数和列数有此限制)如果数据量超过65535就会遇到:ValueError: row index was 65536, not allowed by .xls format,openpyx支持07/10/13版本Excel的,最大行数达到1048576。
使用Excel导出数据,即时保存数据是一个问题,每插入一条数据,先加载,然后保存,操作次数频繁,影响爬虫效率,但是scrapy中途停止时,数据损失小,如果每一张表,数据达到限定的数据量保存,相比前一种方法,爬虫运行效率高,但是需要停止爬虫时,最后一张表的数据没有达到最大数据量时,整个表的数据会丢失,虽然使用了关闭爬虫是保存文件,或异常时保存文件,但是没有效果,后续有时间再继续探索
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy import Item
from .items import QuestionItem, AnswerItem
from openpyxl.workbook import Workbook
from openpyxl import load_workbook
import os
import random
import string
class BaiduZhidaoPipeline(object):
def process_item(self, item, spider):
return item
class MongoDBPipeline(object):
def open_spider(self, spider):
db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
db_name = spider.settings.get('MONGODB_DB_NAME', 'baiduzhidao')
self.db_client = MongoClient(db_uri)
self.db = self.db_client[db_name]
def close_spider(self, spider):
self.db_client.close()
def process_item(self, item, spider):
self.insert_db(item)
return item
def insert_db(self, item):
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
self.db['baiduzhidao_question'].insert_one(item)
elif isinstance(item, AnswerItem):
item = dict(item)
self.db['baiduzhidao_answer'].insert_one(item)
class ExcelPipeline(object):
def open_spider(self, spider):
##设置Excel数据条数
self.item_num_max = 65535
self.item_num = 0
self.file = ''
def close_spider(self, spider):
print("Excel数据保存结束--------------------------------------------------")
self.wb.save(self.file)
def process_item(self, item, spider):
try:
if self.item_num == 0:
self.create_excel()
elif self.item_num > self.item_num_max:
## 数据大于最大行数,保存Excel,重新创建
self.wb.save(self.file)
self.create_excel()
self.item_num = 0
# self.load_sheet()
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], item["question"], item["question_all"], item["url"], " ", " ", " ",
" ", " ", " ", " ", item["create_time"]]
elif isinstance(item, AnswerItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], " ", " ", " ", item["answer"], item["evaluate_num"],
item["keyword_nums"], item["best_answer"], item["username"], item["published_time"],
item["answer_id"], item["create_time"]]
self.ws.append(['%s' % i for i in data])
self.item_num += 1
print(self.item_num,"数据导出--------------------------------------------------------------------")
# self.wb.save(self.file)
return item
except:
print("Excel异常结束-------------------------------------------")
self.ws.save(self.file)
return item
def create_excel(self):
self.file = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + ".xlsx"
item = [u'问题id', u'问题', '答案个数', u'链接', u'答案排序', u'答案', u'点赞数', u'关键词出现次数', u'是否是最佳答案', u'用户名称', u'发布时间', u'答案id', u'创建时间']
self.wb = Workbook(write_only=True)
self.ws = self.wb.create_sheet()
self.ws.append(['%s' % i for i in item])
# self.wb.save(self.file)
def load_sheet(self):
self.wb = load_workbook(self.file)
self.ws = self.wb['Sheet']
创建一张表,创建多个工作簿
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy import Item
from .items import QuestionItem, AnswerItem
from openpyxl.workbook import Workbook
from openpyxl import load_workbook
import os
import random
import string
class BaiduZhidaoPipeline(object):
def process_item(self, item, spider):
return item
class MongoDBPipeline(object):
def open_spider(self, spider):
db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
db_name = spider.settings.get('MONGODB_DB_NAME', 'baiduzhidao')
self.db_client = MongoClient(db_uri)
self.db = self.db_client[db_name]
def close_spider(self, spider):
self.db_client.close()
def process_item(self, item, spider):
self.insert_db(item)
return item
def insert_db(self, item):
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
self.db['baiduzhidao_question'].insert_one(item)
elif isinstance(item, AnswerItem):
item = dict(item)
self.db['baiduzhidao_answer'].insert_one(item)
class ExcelPipeline(object):
def open_spider(self, spider):
##设置Excel数据条数
self.item_num_max = 20
self.item_num = 0
self.file = ''
def close_spider(self, spider):
print("数据保存结束--------------------------------------------------")
self.wb.save(self.file)
def process_item(self, item, spider):
try:
if self.item_num == 0:
self.create_excel()
elif self.item_num > self.item_num_max:
## 数据大于最大行数,保存Excel,重新创建
self.wb.save(self.file)
self.create_excel()
self.item_num = 0
# self.load_sheet()
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], item["question"], item["question_all"], item["url"], " ", " ", " ",
" ", " ", " ", " ", item["create_time"]]
elif isinstance(item, AnswerItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], " ", " ", " ", item["answer"], item["evaluate_num"],
item["keyword_nums"], item["best_answer"], item["username"], item["published_time"],
item["answer_id"], item["create_time"]]
self.ws.append(['%s' % i for i in data])
self.item_num += 1
print(self.item_num,"数据导出--------------------------------------------------------------------")
# self.wb.save(self.file)
return item
except:
print("结束111111111111111")
self.ws.save(self.file)
return item
def create_excel(self):
self.file = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + ".xlsx"
item = [u'问题id', u'问题', '答案个数', u'链接', u'答案排序', u'答案', u'点赞数', u'关键词出现次数', u'是否是最佳答案', u'用户名称', u'发布时间', u'答案id', u'创建时间']
self.wb = Workbook(write_only=True)
self.ws = self.wb.create_sheet()
self.ws.append(['%s' % i for i in item])
# self.wb.save(self.file)
def load_sheet(self):
self.wb = load_workbook(self.file)
self.ws = self.wb['Sheet']