在 Scrapy 的 pipelines.py 中用代码将抓取到的数据以 JSON 形式保存到本地文件

最新推荐文章于 2021-05-24 17:31:42 发布

原创最新推荐文章于 2021-05-24 17:31:42 发布 · 472 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #scrapy

本文介绍了使用Python爬虫抓取小说数据，并将其存储到JSON文件及SQLite数据库的方法。涉及内容包括：通过不同方式打开文件以避免乱码、将爬取的数据格式化为JSON字符串保存、处理文件的末尾多余字符以及实现自定义的数据管道用于存储图片。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >


import codecs
import os
import json

class HongxiuPipeline(object):
    """Item pipeline that dumps every scraped item into ``book.json``.

    The file is written incrementally as a single JSON object of the form
    ``{"book_list":[item, item, ...]}``.
    """

    def __init__(self):
        # codecs.open lets us write non-ASCII text with an explicit encoding.
        self.file = codecs.open(filename='book.json', mode='w', encoding='utf-8')
        # Open the enclosing object as well as the array so the finished
        # file is a valid JSON document.
        self.file.write('{"book_list":[')
        # Tracks whether a separator is needed before the next item. Writing
        # the separator *before* each item (instead of after) means nothing
        # has to be truncated at the end, and an empty crawl stays valid.
        self._has_items = False

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON, append it to the file and pass it on.

        Args:
            item: The scraped item; anything ``dict()`` can convert.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipeline stages still receive it.
        """
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        serialized = json.dumps(dict(item), ensure_ascii=False)
        if self._has_items:
            self.file.write(',\n')
        self.file.write(serialized)
        self._has_items = True
        return item

    def close_spider(self, spider):
        """Terminate the JSON array/object and close the file."""
        try:
            self.file.write(']}')
        finally:
            # Release the handle even if the final write fails.
            self.file.close()

# codecs opens the output file with an explicit encoding so that non-ASCII text is not garbled
import codecs
import json
import os

class XiaoshuoPipeline(object):
    """Item pipeline that logs every scraped item and stores it in ``book.json``.

    The file is written incrementally as a single JSON object of the form
    ``{"list":[item, item, ...]}``.
    """

    def __init__(self):
        # File modes: 'w' writes; 'w+' reads and writes, creating the file
        # if it does not exist; 'r+' reads and writes but raises if the file
        # does not exist.
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        # Open the enclosing object as well as the array so the finished
        # file is a valid JSON document.
        self.file.write('{"list":[')
        # Tracks whether a separator is needed before the next item, so no
        # trailing characters ever have to be truncated afterwards.
        self._has_items = False

    def process_item(self, item, spider):
        """Print ``item`` as JSON, append it to the file and pass it on.

        Args:
            item: The scraped item; anything ``dict()`` can convert.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipeline stages still receive it.
        """
        print('--------------------------------')
        # A dict cannot be written to a text file directly, so it is first
        # serialized to a string. ensure_ascii=False keeps non-ASCII
        # characters readable instead of escaping them.
        serialized = json.dumps(dict(item), ensure_ascii=False)
        print(serialized)
        print('-----------------------------------')
        if self._has_items:
            self.file.write(',\n')
        self.file.write(serialized)
        self._has_items = True
        return item

    def open_spider(self, spider):
        """Announce that the crawl has started."""
        print('爬虫开始了')

    def close_spider(self, spider):
        """Announce the end of the crawl, finish the JSON document and close the file."""
        print('爬虫结束了')
        try:
            self.file.write(']}')
        finally:
            # Release the handle even if the final write fails.
            self.file.close()

3. 有些情况下，保存到本地时需要两层文件夹（文件夹内部还有文件夹），这时就需要重写 file_path 方法来设置第二层路径

import scrapy

# ImagesPipeline系统中下载图片的管道
from scrapy.pipelines.images import ImagesPipeline
# 系统管道有下载图片的功能 我们的管道继承了系统通道 也有了下载图片的功能
class ZhanzhangPipeline(ImagesPipeline):
    """Image pipeline that saves each image under a folder named after the item's title.

    Downloading is inherited from ``ImagesPipeline``; this subclass only
    decides which URL to request and where the file is stored.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's first image URL.

        The item is attached to the request's ``meta`` so that ``file_path``
        can read its title when it builds the storage path.
        """
        yield scrapy.Request(url=item['img'][0], meta={'item': item})

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path ``<title>/<image file name>`` for a request.

        Args:
            request: The request created in ``get_media_requests``.
            response: Unused.
            info: Unused.
            item: The item being processed. Optional so that callers which
                do not supply it keep working; when omitted, the item
                stored in ``request.meta`` is used instead.

        Returns:
            A relative path whose first segment is the item's title and
            whose second segment is the last path component of the image URL.
        """
        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        if item is None:
            item = request.meta['item']
        print(item['title'])
        print(item['img'])
        # The file name is whatever follows the last '/' of the image URL.
        img_name = item['img'][0].split('/')[-1]
        # Always join with '/', not '\'.
        # NOTE(review): a title containing '/' would add extra directory
        # levels — confirm whether titles need sanitizing.
        path = '%s/%s' % (item['title'], img_name)
        return path

4.在存储到本地时也可以将信息存入sqlite3 数据库中，同样在管道文件pipelines.py中操作

#存入数据库要先引入数据库
import sqlite3
class HongixuPipeline(object):
    """No-op pipeline stage: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` untouched; ``spider`` is not used."""
        return item
class HongXiuDBPipeline(object):
    """Item pipeline that stores each scraped book in a SQLite table.

    Rows go into ``bookTable`` (columns ``name``, ``author``, ``img``,
    ``intro``) of the ``hongxiuDB`` database file.
    """

    # Item fields written by process_item, in the INSERT's column order.
    _FIELDS = ('name', 'author', 'intro', 'img')

    def open_spider(self, spider):
        """Open the database and create ``bookTable`` if it does not exist."""
        self.connect = sqlite3.connect('hongxiuDB')
        self.cursor = self.connect.cursor()
        # Every column is declared as text so values are stored exactly as
        # given (e.g. '007' is not coerced to the number 7).
        self.cursor.execute(
            'create table if not exists bookTable('
            'name text,author text,img text,intro text)'
        )
        self.connect.commit()

    def process_item(self, item, spider):
        """Insert ``item`` as one row, commit, and pass the item on.

        Args:
            item: Mapping providing ``name``, ``author``, ``intro`` and ``img``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipeline stages still receive it.
        """
        # Values are bound through placeholders rather than formatted into
        # the SQL text, so quotes inside a value can neither break nor alter
        # the statement. str() keeps non-string values (e.g. a list of URLs)
        # stored as their text form.
        values = tuple(str(item[field]) for field in self._FIELDS)
        self.cursor.execute(
            'insert into bookTable(name,author,intro,img) VALUES (?,?,?,?)',
            values,
        )
        # Changes are not persisted until they are committed.
        self.connect.commit()
        return item

    def close_spider(self, spider):
        """Close the cursor first, then the database connection."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor fails.
            self.connect.close()