JSON
Method 1: run a command in cmd (the command line)
scrapy crawl novel -o novel.json -s FEED_EXPORT_ENCODING=UTF-8
Here, novel is the spider's name (its name attribute).
To have Scrapy save the data as a CSV file instead: scrapy crawl novel -o novel.csv -s FEED_EXPORT_ENCODING=UTF-8
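If you prefer to keep the export configuration inside the project rather than on the command line, newer Scrapy versions (2.1+) support the FEEDS setting. A minimal sketch, assuming the same spider and output names as above:

# settings.py -- requires Scrapy 2.1 or newer
FEEDS = {
    'novel.json': {'format': 'json', 'encoding': 'utf8'},
    'novel.csv': {'format': 'csv', 'encoding': 'utf8'},
}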
Method 2: implement it with pipelines
1. Define your own pipeline
pipelines.py
import json

# Write each item as one JSON line to a local file
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('jobbole.json', 'wb')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        # Close the file so buffered data is flushed to disk
        self.file.close()
2. Enable your pipeline in settings
settings.py
ITEM_PIPELINES = {
    # 'Jobbole.pipelines.JobbolePipeline': 300,
    # Save the data as JSON
    'Jobbole.pipelines.JsonWriterPipeline': 1,
}
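All of the pipelines below read the same item fields. A minimal items.py sketch for reference; the field names are taken from the pipeline code in this article, while the class name JobboleItem and the meaning of num are assumptions:

import scrapy

class JobboleItem(scrapy.Item):          # class name is an assumption
    artitle_item = scrapy.Field()        # article title
    release_time = scrapy.Field()        # publish time
    fenlei = scrapy.Field()              # category
    dianzan = scrapy.Field()             # number of likes
    num = scrapy.Field()                 # a count field (unlabeled in the original)
    comment = scrapy.Field()             # number of comments
    content = scrapy.Field()             # article body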
MongoDB
1. Define your own pipeline in pipelines.py
import pymongo

# Save items into a MongoDB database
class MongoPipeline(object):
    def __init__(self, client, db):
        self.client = pymongo.MongoClient(client)
        self.db = self.client[db]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings.py
        obj = cls(
            client=crawler.settings.get('MONGOCLIENT', 'localhost'),
            db=crawler.settings.get('DB', 'jobbole')
        )
        return obj

    def process_item(self, item, spider):
        # Upsert keyed on the article title, so re-crawls update instead of duplicating
        self.db['jobbole'].update_one(
            {'artitle_item': item['artitle_item']},
            {'$set': dict(item)},
            upsert=True
        )
        return item
2. Enable your pipeline in settings.py
ITEM_PIPELINES = {
    # Save to the MongoDB database
    'Jobbole.pipelines.MongoPipeline': 1,
}
MONGOCLIENT = 'localhost'
DB = 'jobbole'
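The upsert in process_item does a lookup per item, so an index on the key helps. A one-off sketch of creating a unique index, assuming the same host and database names as above:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['jobbole']
# Unique index on the upsert key: speeds up update_one() lookups
# and guarantees one document per article title
db['jobbole'].create_index('artitle_item', unique=True)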
MySQL
1. Define your own pipeline in pipelines.py
import pymysql

# Save items into a MySQL database
class DBPipeline(object):
    def __init__(self, host, port, db, user, passwd, charset):
        self.db = pymysql.connect(host=host, port=port, db=db, user=user,
                                  passwd=passwd, charset=charset)
        self.cursor = self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Connect to the database using the settings defined in settings.py
        obj = cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            port=3306,
            db=crawler.settings.get('MYSQL_DBNAME', 'jobbole'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            passwd=crawler.settings.get('MYSQL_PASSWD', '123456'),
            charset='utf8')
        return obj

    def process_item(self, item, spider):
        # Insert one row per item
        try:
            self.cursor.execute(
                """INSERT INTO jobbole
                   (artitle_item, release_time, fenlei, dianzan, num, comment, content)
                   VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                (item['artitle_item'], item['release_time'], item['fenlei'],
                 item['dianzan'], item['num'], item['comment'], item['content']))
            # Commit the SQL statement
            self.db.commit()
            return item
        except Exception as e:
            print(e)
            self.db.rollback()

    def close_spider(self, spider):
        # Release the cursor and the connection when the crawl ends
        self.cursor.close()
        self.db.close()
2. Enable your pipeline in settings
ITEM_PIPELINES = {
    # Save to the MySQL database
    'Jobbole.pipelines.DBPipeline': 1,
}
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'jobbole'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
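The insert statement assumes the jobbole table already exists. A one-off sketch of creating it with the same pymysql connection; the column types and lengths are assumptions, so adjust them to your data:

import pymysql

db = pymysql.connect(host='localhost', db='jobbole', user='root',
                     passwd='123456', charset='utf8')
cursor = db.cursor()
# Column types are guesses; only the column names come from the pipeline code
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole (
        artitle_item VARCHAR(255),
        release_time VARCHAR(64),
        fenlei       VARCHAR(64),
        dianzan      VARCHAR(32),
        num          VARCHAR(32),
        comment      VARCHAR(32),
        content      TEXT
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()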
Excel
1. Define your own pipeline in pipelines.py
from openpyxl import Workbook

# Save items into an Excel workbook
class ExcelPipeline(object):
    wb = Workbook()  # create the workbook
    ws = wb.active   # activate the worksheet
    # Header row: title, publish time, category, likes, ('num' had no label
    # in the original), comments, content -- one label per item field
    ws.append(['文章标题', '发布时间', '分类', '点赞次数', 'num', '评论次数', '内容'])

    def process_item(self, item, spider):
        # Collect the item's fields into one row
        line = [item['artitle_item'], item['release_time'], item['fenlei'],
                item['dianzan'], item['num'], item['comment'], item['content']]
        self.ws.append(line)          # append the row to the worksheet
        self.wb.save('jobbole.xlsx')  # save the xlsx file
        return item
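Note that wb.save() rewrites the whole file on every item. If that becomes slow on large crawls, one option is to save once when the spider closes; a sketch showing only the two methods that change:

    def process_item(self, item, spider):
        line = [item['artitle_item'], item['release_time'], item['fenlei'],
                item['dianzan'], item['num'], item['comment'], item['content']]
        self.ws.append(line)
        return item

    def close_spider(self, spider):
        # Write the workbook once, after the whole crawl has finished
        self.wb.save('jobbole.xlsx')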
2. Enable your pipeline in settings

ITEM_PIPELINES = {
    # Save to Excel
    'Jobbole.pipelines.ExcelPipeline': 1,
}

Downloading images (by category)
1. Configure settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': None,  # disable Scrapy's built-in pipeline
    'chinaz.pipelines.ChinazPipeline': 1,
}
IMAGES_STORE = 'images'           # folder the images are saved under
IMAGES_URLS_FIELD = 'image_urls'  # item field that holds the image URLs

Note: Scrapy's image pipeline requires the Pillow library to be installed.
2. Implement the download in pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request

class ChinazPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Take the image URLs from the item, build a Request for each, and yield it
        image_urls = item['image_urls']
        for image_url in image_urls:
            yield Request(image_url, meta={'item': item, 'img': image_url})

    def file_path(self, request, response=None, info=None):
        # Customize the download path for each image
        item = request.meta['item']
        names = item['names']
        url = request.meta['img'].split('/')[-1]
        path = names + '/' + url  # <category>/<filename>
        return path

    def item_completed(self, results, item, info):
        # results holds the outcome of the finished downloads for this item
        print(results)
        return item
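For the path logic above to work, each item needs the two fields the pipeline reads. A minimal items.py sketch; the class name ChinazItem is an assumption:

import scrapy

class ChinazItem(scrapy.Item):   # class name is an assumption
    image_urls = scrapy.Field()  # list of image URLs (matches IMAGES_URLS_FIELD)
    names = scrapy.Field()       # category name, used as the subfolder in file_path()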
Downloading files

1. qishu.py

# The field holding the file URL to download must be a list.
# If you only store the address in the database and do not download it,
# it does not need to be a list.
qishu['download_url'] = [download_url]

2. Define your own pipeline in pipelines.py

from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request

class QishuxiazaiPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Build a Request for the file URL stored on the item
        file_url = item['download_url'][0]
        yield Request(file_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of its URL
        item = request.meta['item']
        novel_name = item['download_url'][0].split('/')[-1]
        return '%s' % novel_name

    def item_completed(self, results, item, info):
        # results holds the outcome of the finished downloads for this item
        print(results)
        return item

3. Enable your pipeline in settings.py

ITEM_PIPELINES = {
    'Qishu.pipelines.QishuxiazaiPipeline': 3,
    # When using a custom pipeline, set the built-in FilesPipeline to None.
    'scrapy.pipelines.files.FilesPipeline': None,
}
FILES_STORE = 'files'              # folder the files are saved under
FILES_URLS_FIELD = 'download_url'  # item field that holds the file URLs
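The results argument passed to item_completed is a list of (success, info) two-tuples, one per download. Instead of just printing it, a sketch of using it to keep the saved path and drop items whose download failed; the file_path item field here is an assumption:

from scrapy.exceptions import DropItem

    def item_completed(self, results, item, info):
        # Each entry in results is a (success, file_info_or_failure) tuple
        file_paths = [file_info['path'] for ok, file_info in results if ok]
        if not file_paths:
            raise DropItem('file download failed')
        item['file_path'] = file_paths[0]  # 'file_path' field is an assumption
        return item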