scrapy-csv存储常见问题及解决方案
在学习scrapy的时候碰到一些存储数据的时候使用csv文件时出现的问题,这边做一个记录
scrapy输出csv文件数据多空行
查看源码scrapy.exporters.CsvItemExporter,在io.TextIOWrapper加入参数newline=''
class CsvItemExporter(BaseItemExporter):
    """Export scraped items as rows of a CSV file.

    On Python 3 the output file is wrapped in an ``io.TextIOWrapper``
    created with ``newline=''``, which hands line-ending control to the
    ``csv`` module and eliminates the blank row that otherwise appears
    between records on Windows.
    """

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        # Split off exporter-level options; remaining kwargs go to csv.writer.
        self._configure(kwargs, dont_fail=True)
        # Default to UTF-8 when no encoding was configured.
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        # NOTE(review): join_multivalued is accepted but not stored in this
        # snippet — presumably handled by the base class; confirm upstream.
        if six.PY3:
            # newline='' is the documented requirement of the csv module
            # and the actual fix for the extra-blank-line symptom.
            self.stream = io.TextIOWrapper(
                file,
                newline='',
                line_buffering=False,
                write_through=True,
                encoding=self.encoding,
            )
        else:
            # Python 2: the csv module writes bytes, use the file directly.
            self.stream = file
        self.csv_writer = csv.writer(self.stream, **kwargs)
        # Header row is emitted lazily, before the first exported item.
        self._headers_not_written = True
scrapy数据存储至csv,指定字段排序的问题
方案1、继承重写CsvItemExporter,指定写入的顺序
class CSVPipeline(object):
    """Item pipeline that writes scraped items to a CSV file with a
    fixed column order.

    Fix: the original declared ``class CSVPipeline(CsvItemExporter)``.
    A pipeline is not an exporter — it *uses* one — and the exporter's
    ``__init__`` was never called, so the inheritance was both wrong and
    broken. The pipeline now composes a ``CsvItemExporter`` instead.
    The leftover debug ``print(type(item))`` was also removed.
    """

    def __init__(self):
        # One open file handle per spider, keyed by the spider object.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy hook: build the pipeline and subscribe to the
        spider open/close signals so the file's lifetime matches the run."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start a CsvItemExporter over it."""
        # Binary mode: CsvItemExporter wraps the raw file itself.
        savefile = open('douban_top250_export.csv', 'wb+')
        self.files[spider] = savefile
        self.exporter = CsvItemExporter(savefile)
        # Fixed column order for the exported CSV.
        self.exporter.fields_to_export = [
            'rank',
            'score',
            'title_CN',
            'title_EN',
            'url',
            'detail',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close this spider's file."""
        self.exporter.finish_exporting()
        savefile = self.files.pop(spider)
        savefile.close()

    def process_item(self, item, spider):
        """Export one item and pass it on unchanged."""
        self.exporter.export_item(item)
        return item
方案2
1、在scrapy的spiders同层目录,新建csv_item_exporter.py文件(与下方FEED_EXPORTERS中引用的模块路径保持一致)
from scrapy.conf import settings
from scrapy.contrib.exporter import CsvItemExporter
class MyProjectCsvItemExporter(CsvItemExporter):
    """CsvItemExporter that takes its delimiter and column order from
    the project settings.

    Fix: the original unconditionally overwrote ``kwargs['delimiter']``
    and ``kwargs['fields_to_export']``, silently clobbering values a
    caller passed explicitly. ``dict.setdefault`` lets explicit keyword
    arguments win over the settings-derived defaults while keeping the
    same behavior for the normal no-kwargs case.
    """

    def __init__(self, *args, **kwargs):
        # CSV_DELIMITER: column separator; falls back to a comma.
        kwargs.setdefault('delimiter', settings.get('CSV_DELIMITER', ','))
        # FIELDS_TO_EXPORT: explicit column order. Only applied when
        # non-empty, so the exporter default (all item fields) survives.
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs.setdefault('fields_to_export', fields_to_export)
        super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
2、在同层目录,settings.py文件新增如下内容(指定item,field顺序)
# Route Scrapy's built-in 'csv' feed format to the custom exporter class.
FEED_EXPORTERS = {
'csv': 'jsuser.spiders.csv_item_exporter.MyProjectCsvItemExporter',
} # 'jsuser' is the project name
# Column order the custom exporter reads via settings.get('FIELDS_TO_EXPORT').
FIELDS_TO_EXPORT = [
'author',
'title',
'url',
'reads',
'comments',
'likes',
'rewards'
]