python h5文件的读写

这篇文章提供两个工具类,用来辅助《医学图像分割实战 unet实现(二)》中“4、数据存储”这一小节的内容。

2019/5/2 更新:HDF5DatasetWriter 可以动态扩展储存大小

文件: HDF5DatasetGenerator.py

# -*- coding: utf-8 -*-
import h5py
import os
import numpy as np

class HDF5DatasetGenerator:
    """Yield mini-batches of (images, masks) read from an HDF5 file.

    The file must contain two datasets named ``"images"`` and ``"masks"``.
    Both are indexed with four axes, the first being the sample axis.
    """

    def __init__(self, dbPath, batchSize, preprocessors=None,
                 aug=None, binarize=True, classes=2):
        """Open the HDF5 file and work out how many batches make one epoch.

        Args:
            dbPath: path of the HDF5 file to read.
            batchSize: maximum number of samples per yielded batch.
            preprocessors: optional iterable of objects exposing
                ``preprocess(image)``; they are applied to every image in
                the given order.
            aug: optional augmenter exposing
                ``flow(images, labels, batch_size=...)`` that returns an
                iterator (presumably a Keras ``ImageDataGenerator`` -- confirm
                against the caller).
            binarize: stored on the instance; nothing in this class reads it.
            classes: stored on the instance; nothing in this class reads it.

        Raises:
            ValueError: if ``batchSize`` is smaller than 1.
        """
        if batchSize < 1:
            raise ValueError("batchSize must be a positive integer")

        self.batchSize = batchSize
        self.preprocessors = preprocessors
        self.aug = aug
        self.binarize = binarize
        self.classes = classes

        # Open read-only explicitly. Without a mode, older h5py releases open
        # in append mode, which creates an empty file when the path is wrong.
        self.db = h5py.File(dbPath, "r")
        self.numImages = self.db["images"].shape[0]
        print("total images:",self.numImages)
        # Ceiling division: the last batch of an epoch may be short, and an
        # empty dataset produces zero batches.
        self.num_batches_per_epoch = (self.numImages + batchSize - 1) // batchSize

    def generator(self, shuffle=True, passes=np.inf):
        """Generate ``(images, labels)`` batches for ``passes`` epochs.

        Args:
            shuffle: when true, samples are assigned to batches through a
                fresh random permutation every epoch; when false, batches
                follow the on-disk order.
            passes: number of epochs to run; the default never stops.

        Yields:
            A tuple ``(images, labels)`` for one batch. Inside a batch the
            samples are always in increasing on-disk index order.
        """
        epochs = 0

        while epochs < passes:
            order = np.arange(self.numImages)
            if shuffle:
                order = np.random.permutation(order)

            for batch_num in range(self.num_batches_per_epoch):
                start_index = batch_num * self.batchSize
                end_index = min(start_index + self.batchSize, self.numImages)

                # h5py index-list selection requires the indices to be in
                # increasing order, hence the sort.
                batch_indices = sorted(order[start_index:end_index].tolist())

                images = self.db["images"][batch_indices, :, :, :]
                labels = self.db["masks"][batch_indices, :, :, :]

                if self.preprocessors is not None:
                    procImages = []
                    for image in images:
                        for p in self.preprocessors:
                            image = p.preprocess(image)
                        procImages.append(image)

                    images = np.array(procImages)

                if self.aug is not None:
                    # The batch is already formed; flow() is called only to
                    # run the augmenter over it, and a single next() takes
                    # one augmented batch from the returned iterator.
                    (images, labels) = next(self.aug.flow(images, labels,
                                                          batch_size=self.batchSize))
                yield (images, labels)

            epochs += 1

    def close(self):
        """Close the underlying HDF5 file."""
        self.db.close()

文件: HDF5DatasetWriter.py

# -*- coding: utf-8 -*-
import h5py
import os

class HDF5DatasetWriter:
    """Buffer image/mask samples in memory and write them to an HDF5 file.

    Two datasets, ``"images"`` and ``"masks"``, are created with an unlimited
    first axis so they can grow when more samples arrive than were allocated.
    """

    def __init__(self, image_dims, mask_dims, outputPath, bufSize=200):
        """Create the output file and its two resizable datasets.

        Args:
            image_dims: initial shape of the ``"images"`` dataset; the first
                entry is the initial number of sample slots.
            mask_dims: initial shape of the ``"masks"`` dataset.
            outputPath: path of the HDF5 file to create.
            bufSize: number of buffered samples that triggers a flush to disk.

        Raises:
            ValueError: if ``outputPath`` already exists.
        """
        if os.path.exists(outputPath):
            raise ValueError("The supplied 'outputPath' already "
                             "exists and cannot be overwritten. Manually delete "
                             "the file before continuing", outputPath)

        self.db = h5py.File(outputPath, "w")
        self.data = self.db.create_dataset(
            "images", image_dims,
            maxshape=(None,) + tuple(image_dims[1:]), dtype="float")
        self.masks = self.db.create_dataset(
            "masks", mask_dims,
            maxshape=(None,) + tuple(mask_dims[1:]), dtype="int")
        self.dims = image_dims
        self.bufSize = bufSize
        self.buffer = {"data": [], "masks": []}
        self.idx = 0  # number of samples already written to disk

    def add(self, rows, masks):
        """Queue samples and flush once ``bufSize`` samples are buffered.

        Args:
            rows: iterable of image samples.
            masks: iterable of mask samples, one per entry of ``rows``.

        Raises:
            ValueError: if ``rows`` and ``masks`` hold different numbers of
                samples.
        """
        rows = list(rows)
        masks = list(masks)
        # Unequal counts would misalign images and masks, and at flush time a
        # single buffered mask could be broadcast over several slots.
        if len(rows) != len(masks):
            raise ValueError("rows and masks must contain the same number of samples")

        # extend() appends the individual samples rather than the container.
        self.buffer["data"].extend(rows)
        self.buffer["masks"].extend(masks)
        print("len ",len(self.buffer["data"]))

        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    def _ensure_capacity(self, dataset, needed):
        """Grow ``dataset`` along its first axis to hold ``needed`` samples."""
        capacity = dataset.shape[0]
        if needed <= capacity:
            return
        # Doubling amortises the resize cost; max() covers the cases where one
        # doubling is not enough for the pending write or capacity is 0.
        new_shape = (max(needed, capacity * 2),) + tuple(dataset.shape[1:])
        print("resize to new_shape:",new_shape)
        dataset.resize(new_shape)

    def flush(self):
        """Write all buffered samples to disk and empty the buffer."""
        pending = len(self.buffer["data"])
        if pending == 0:
            return

        i = self.idx + pending
        # Each dataset is grown using its own trailing dimensions, so masks
        # may have a different per-sample shape than images.
        self._ensure_capacity(self.data, i)
        self._ensure_capacity(self.masks, i)

        self.data[self.idx:i,:,:,:] = self.buffer["data"]
        self.masks[self.idx:i,:,:,:] = self.buffer["masks"]
        print("h5py have writen %d data"%i)
        self.idx = i
        self.buffer = {"data": [], "masks": []}

    def close(self):
        """Flush pending samples, drop unused slots and close the file.

        After this call the first axis of both datasets equals the number of
        samples actually written, so a reader that takes the sample count
        from the dataset shape does not see unwritten filler rows.
        """
        # An h5py File evaluates false once closed (per the h5py docs -- confirm
        # for the installed version), which makes a second close() a no-op.
        if not self.db:
            return

        if len(self.buffer["data"]) > 0:
            self.flush()

        for dataset in (self.data, self.masks):
            if dataset.shape[0] > self.idx:
                dataset.resize((self.idx,) + tuple(dataset.shape[1:]))

        self.db.close()
  • 3
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 6
    评论
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值