JSON
Method 1: run a command in cmd (the command line)
scrapy crawl novel -o novel.json -s FEED_EXPORT_ENCODING=UTF-8
Here, novel is the spider's name (its name attribute).
To have Scrapy save the data as a CSV file instead: scrapy crawl novel -o novel.csv -s FEED_EXPORT_ENCODING=UTF-8
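If you prefer to keep the export configuration inside the project rather than on the command line, newer Scrapy versions (2.1+) support the FEEDS setting. A minimal sketch, assuming the same spider and output names as above:

# settings.py -- requires Scrapy 2.1 or newer
FEEDS = {
    'novel.json': {'format': 'json', 'encoding': 'utf8'},
    'novel.csv': {'format': 'csv', 'encoding': 'utf8'},
}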
Method 2: implement it with pipelines
1. Define your own pipeline
pipelines.py
import json

# Write each item as one JSON line to a local file
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('jobbole.json', 'wb')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        # Close the file so buffered data is flushed to disk
        self.file.close()
2. Enable your pipeline in settings
settings.py
ITEM_PIPELINES = {
    # 'Jobbole.pipelines.JobbolePipeline': 300,
    # Save the data as JSON
    'Jobbole.pipelines.JsonWriterPipeline': 1,
}
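All of the pipelines below read the same item fields. A minimal items.py sketch for reference; the field names are taken from the pipeline code in this article, while the class name JobboleItem and the meaning of num are assumptions:

import scrapy

class JobboleItem(scrapy.Item):          # class name is an assumption
    artitle_item = scrapy.Field()        # article title
    release_time = scrapy.Field()        # publish time
    fenlei = scrapy.Field()              # category
    dianzan = scrapy.Field()             # number of likes
    num = scrapy.Field()                 # a count field (unlabeled in the original)
    comment = scrapy.Field()             # number of comments
    content = scrapy.Field()             # article body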
MongoDB
1. Define your own pipeline in pipelines.py
import pymongo

# Save items into a MongoDB database
class MongoPipeline(object):
    def __init__(self, client, db):
        self.client = pymongo.MongoClient(client)
        self.db = self.client[db]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings.py
        obj = cls(
            client=crawler.settings.get('MONGOCLIENT', 'localhost'),
            db=crawler.settings.get('DB', 'jobbole')
        )
        return obj

    def process_item(self, item, spider):
        # Upsert keyed on the article title, so re-crawls update instead of duplicating
        self.db['jobbole'].update_one(
            {'artitle_item': item['artitle_item']},
            {'$set': dict(item)},
            upsert=True
        )
        return item
2. Enable your pipeline in settings.py
ITEM_PIPELINES = {
    # Save to the MongoDB database
    'Jobbole.pipelines.MongoPipeline': 1,
}
MONGOCLIENT = 'localhost'
DB = 'jobbole'
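The upsert in process_item does a lookup per item, so an index on the key helps. A one-off sketch of creating a unique index, assuming the same host and database names as above:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['jobbole']
# Unique index on the upsert key: speeds up update_one() lookups
# and guarantees one document per article title
db['jobbole'].create_index('artitle_item', unique=True)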
MySQL
1. Define your own pipeline in pipelines.py
import pymysql

# Save items into a MySQL database
class DBPipeline(object):
    def __init__(self, host, port, db, user, passwd, charset):
        self.db = pymysql.connect(host=host, port=port, db=db, user=user,
                                  passwd=passwd, charset=charset)
        self.cursor = self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Connect to the database using the settings defined in settings.py
        obj = cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            port=3306,
            db=crawler.settings.get('MYSQL_DBNAME', 'jobbole'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            passwd=crawler.settings.get('MYSQL_PASSWD', '123456'),
            charset='utf8')
        return obj

    def process_item(self, item, spider):
        # Insert one row per item
        try:
            self.cursor.execute(
                """INSERT INTO jobbole
                   (artitle_item, release_time, fenlei, dianzan, num, comment, content)
                   VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                (item['artitle_item'], item['release_time'], item['fenlei'],
                 item['dianzan'], item['num'], item['comment'], item['content']))
            # Commit the SQL statement
            self.db.commit()
            return item
        except Exception as e:
            print(e)
            self.db.rollback()

    def close_spider(self, spider):
        # Release the cursor and the connection when the crawl ends
        self.cursor.close()
        self.db.close()
2. Enable your pipeline in settings
ITEM_PIPELINES = {
    # Save to the MySQL database
    'Jobbole.pipelines.DBPipeline': 1,
}
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'jobbole'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
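The insert statement assumes the jobbole table already exists. A one-off sketch of creating it with the same pymysql connection; the column types and lengths are assumptions, so adjust them to your data:

import pymysql

db = pymysql.connect(host='localhost', db='jobbole', user='root',
                     passwd='123456', charset='utf8')
cursor = db.cursor()
# Column types are guesses; only the column names come from the pipeline code
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole (
        artitle_item VARCHAR(255),
        release_time VARCHAR(64),
        fenlei       VARCHAR(64),
        dianzan      VARCHAR(32),
        num          VARCHAR(32),
        comment      VARCHAR(32),
        content      TEXT
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()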
Excel
1. Define your own pipeline in pipelines.py
from openpyxl import Workbook

# Save items into an Excel workbook
class ExcelPipeline(object):
    wb = Workbook()  # create the workbook
    ws = wb.active   # activate the worksheet
    # Header row: title, publish time, category, likes, ('num' had no label
    # in the original), comments, content -- one label per item field
    ws.append(['文章标题', '发布时间', '分类', '点赞次数', 'num', '评论次数', '内容'])

    def process_item(self, item, spider):
        # Collect the item's fields into one row
        line = [item['artitle_item'], item['release_time'], item['fenlei'],
                item['dianzan'], item['num'], item['comment'], item['content']]
        self.ws.append(line)          # append the row to the worksheet
        self.wb.save('jobbole.xlsx')  # save the xlsx file
        return item
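Note that wb.save() rewrites the whole file on every item. If that becomes slow on large crawls, one option is to save once when the spider closes; a sketch showing only the two methods that change:

    def process_item(self, item, spider):
        line = [item['artitle_item'], item['release_time'], item['fenlei'],
                item['dianzan'], item['num'], item['comment'], item['content']]
        self.ws.append(line)
        return item

    def close_spider(self, spider):
        # Write the workbook once, after the whole crawl has finished
        self.wb.save('jobbole.xlsx')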
2. Enable your pipeline in settings

ITEM_PIPELINES = {
    # Save to Excel
    'Jobbole.pipelines.ExcelPipeline': 1,
}

Downloading images (by category)
1. Configure settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': None,  # disable Scrapy's built-in pipeline
    'chinaz.pipelines.ChinazPipeline': 1,
}
IMAGES_STORE = 'images'           # folder the images are saved under
IMAGES_URLS_FIELD = 'image_urls'  # item field that holds the image URLs

Note: Scrapy's image pipeline requires the Pillow library to be installed.
2. Implement the download in pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request

class ChinazPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Take the image URLs from the item, build a Request for each, and yield it
        image_urls = item['image_urls']
        for image_url in image_urls:
            yield Request(image_url, meta={'item': item, 'img': image_url})

    def file_path(self, request, response=None, info=None):
        # Customize the download path for each image
        item = request.meta['item']
        names = item['names']
        url = request.meta['img'].split('/')[-1]
        path = names + '/' + url  # <category>/<filename>
        return path

    def item_completed(self, results, item, info):
        # results holds the outcome of the finished downloads for this item
        print(results)
        return item
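For the path logic above to work, each item needs the two fields the pipeline reads. A minimal items.py sketch; the class name ChinazItem is an assumption:

import scrapy

class ChinazItem(scrapy.Item):   # class name is an assumption
    image_urls = scrapy.Field()  # list of image URLs (matches IMAGES_URLS_FIELD)
    names = scrapy.Field()       # category name, used as the subfolder in file_path()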
Downloading files

1. qishu.py

# The field holding the file URL to download must be a list.
# If you only store the address in the database and do not download it,
# it does not need to be a list.
qishu['download_url'] = [download_url]

2. Define your own pipeline in pipelines.py

from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request

class QishuxiazaiPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Build a Request for the file URL stored on the item
        file_url = item['download_url'][0]
        yield Request(file_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of its URL
        item = request.meta['item']
        novel_name = item['download_url'][0].split('/')[-1]
        return '%s' % novel_name

    def item_completed(self, results, item, info):
        # results holds the outcome of the finished downloads for this item
        print(results)
        return item

3. Enable your pipeline in settings.py

ITEM_PIPELINES = {
    'Qishu.pipelines.QishuxiazaiPipeline': 3,
    # When using a custom pipeline, set the built-in FilesPipeline to None.
    'scrapy.pipelines.files.FilesPipeline': None,
}
FILES_STORE = 'files'              # folder the files are saved under
FILES_URLS_FIELD = 'download_url'  # item field that holds the file URLs
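The results argument passed to item_completed is a list of (success, info) two-tuples, one per download. Instead of just printing it, a sketch of using it to keep the saved path and drop items whose download failed; the file_path item field here is an assumption:

from scrapy.exceptions import DropItem

    def item_completed(self, results, item, info):
        # Each entry in results is a (success, file_info_or_failure) tuple
        file_paths = [file_info['path'] for ok, file_info in results if ok]
        if not file_paths:
            raise DropItem('file download failed')
        item['file_path'] = file_paths[0]  # 'file_path' field is an assumption
        return item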