存储到mysql中
首先我们需要在mysql中创建好对应的数据库以及表
之后在pipelines.py中创建好对应的管道类
import pymysql
class mysqlPipeLine(object):
    """Item pipeline that persists scraped items into a MySQL table via pymysql."""

    # Connection/cursor handles; populated once the spider opens.
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Open one connection for the whole crawl (called exactly once).
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            db='指定数据库',
            charset='utf8',
        )

    def process_item(self, item, spider):
        # Fresh cursor per item; commit on success, roll back on any failure.
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(执行sql语句)  # placeholder: the actual INSERT statement
            self.conn.commit()
            print('成功插入', 某某, '的工作信息到数据库中!')  # placeholder: item identifier
        except Exception as e:
            # Best-effort tutorial style: report the error and undo the write.
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Release database resources when the spider shuts down.
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
存储照片
我们在pipelines.py中定义如下类
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class imgsPipeLine(ImagesPipeline):
    """Image-downloading pipeline: requests each item's img_src URL and
    stores the downloaded file under the IMAGES_STORE directory.

    Fix: the original snippet used ``scrapy.Request`` without importing
    ``scrapy`` (only ``ImagesPipeline`` was imported), which raises a
    NameError at crawl time; ``import scrapy`` is added at the top of
    the file.
    """

    def get_media_requests(self, item, info):
        # Issue one download request per item, driven by the scraped image URL.
        yield scrapy.Request(item['img_src'])

    def file_path(self, request, response=None, info=None, *, item):
        # File name (relative to IMAGES_STORE) comes straight from the item.
        imgName = item['img_name']
        return imgName

    def item_completed(self, results, item, info):
        # Pass the unchanged item on to the next pipeline class.
        return item
在settings.py中添加如下配置信息
# Directory where ImagesPipeline saves downloaded files (relative path; created if missing)
IMAGES_STORE = './img_lib'
存储为csv文件类型
class CSVPipeline(object):
    """Persist scraped items to ./fileName.csv, one comma-joined row per item.

    NOTE(review): fields are joined with bare commas, so values containing
    commas or newlines would corrupt rows; the stdlib ``csv`` module would
    add proper quoting (left unchanged here to preserve behavior).
    """

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        print('开始存储!')
        self.f = open('./fileName.csv', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        # Build one row from the item's fields, write it, and hand the item on.
        row = f"{item['xxx']},{(item['xx'])},{item['xx']},...,\n"
        self.f.write(row)
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes: close the file handle.
        print('存储完毕!')
        if self.f:
            self.f.close()
存储为txt文件
class TXTPipeline(object):
    """Persist scraped items to ./fileName.txt, one "xx:xxx" line per item."""

    # Output file handle; opened in open_spider, closed in close_spider.
    fp = None

    def open_spider(self, spider):
        # Overridden hook: runs exactly once, before the first item arrives.
        print('开始爬虫!')
        self.fp = open('./fileName.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Persist one item as "<xx>:<xxx>" and pass it to the next pipeline class.
        line = ':'.join((item['xx'], item['xxx'])) + '\n'
        self.fp.write(line)
        return item

    def close_spider(self, spider):
        # Overridden hook: runs exactly once, after the last item is processed.
        print('结束爬虫!')
        self.fp.close()