[批量导入数据](https://blog.csdn.net/amao1998/article/details/81236900)
def create_dataset(batch_size,name_list,label_list):
    """Build a reinitializable TF1 tf.data pipeline over file_readline.

    Parameters:
        batch_size: number of generator elements per batch.
        name_list / label_list: forwarded unchanged to the file_readline
            generator via from_generator's ``args``.
    Returns:
        (iterator, training_init_op): a from_structure iterator plus the op
        that (re)binds the iterator to this dataset at session time.
    """
    # Each generator element is an (18, 4096) float32 data block paired with
    # an (18, 10) int32 one-hot label block.
    create_data = tf.data.Dataset.from_generator(file_readline, (tf.float32, tf.int32),
                                                 (tf.TensorShape([18,4096]), tf.TensorShape([18,10])),
                                                 args=(name_list,label_list))  # fetch the data
    create_data = create_data.batch(batch_size)
    # Structure-based iterator (TF1 API): it can later be re-initialized
    # against any dataset with matching output types/shapes.
    iterator = tf.data.Iterator.from_structure(create_data.output_types,
                                               create_data.output_shapes)  # dataset iterator
    training_init_op = iterator.make_initializer(create_data)
    return iterator,training_init_op
根据学习链接中的知识,更改后的读取方式:
def file_readline(name_list,label_list):
    """Generator yielding one (data, one_hot_labels) pair per file pair.

    Parameters:
        name_list: paths of data files; each holds lines of space-separated
            numbers whose transposed contents reshape to (18, 4096).
        label_list: paths of label files; each holds comma-separated integer
            class ids in [0, 10) — 18 of them per file.
    Yields:
        (np.ndarray float32 shape (18, 4096), np.ndarray shape (18, 10)).
    """
    NUM_CLASSES = 10
    # Renamed loop variables: the original rebound the parameters
    # (label_list / name_list) inside the loop, shadowing them.
    for label_path, data_path in zip(label_list, name_list):
        # Read the label file. As in the original, if the file has several
        # lines only the last one is used.
        with open(label_path, 'r') as label_file:
            last_line = ""
            for raw in label_file:
                last_line = raw
            labels = np.array([int(tok) for tok in last_line.replace("\n", "").split(",")])
        # Vectorized one-hot encoding (replaces the manual flat-index fill).
        labels_onehot = np.zeros((labels.shape[0], NUM_CLASSES))
        labels_onehot[np.arange(labels.shape[0]), labels] = 1
        # Read the data file: one token row per line, then transpose so each
        # of the 18 output rows is one column of the file.
        with open(data_path, 'r') as data_file:
            rows = [line.replace("\n", "").split(" ") for line in data_file]
        data = np.array(rows, dtype=np.float32).T.reshape(18, 4096)
        yield (data, labels_onehot.reshape(18, 10))
成行读取方式:
def read_name_list(path):
    """Walk three directory levels below ``path`` and pair up files.

    Every leaf directory holding exactly two entries contributes its first
    entry (os.listdir order) to label_list and its second to name_list.

    Returns:
        (name_list, label_list): two parallel lists of file paths.
    """
    name_list = []
    label_list = []
    for level1 in os.listdir(path):
        level1_dir = os.path.join(path, level1)
        for level2 in os.listdir(level1_dir):
            level2_dir = os.path.join(level1_dir, level2)
            for level3 in os.listdir(level2_dir):
                leaf_dir = os.path.join(level2_dir, level3)
                entries = list(os.listdir(leaf_dir))
                # Only well-formed leaves (label file + data file) are kept.
                if len(entries) == 2:
                    label_list.append(os.path.join(leaf_dir, entries[0]))
                    name_list.append(os.path.join(leaf_dir, entries[1]))
    return name_list, label_list
def file_readline(num_start,num_end):
    """Generator over per-patch samples from file pairs [num_start, num_end).

    For every (label file, data file) pair found under the hard-coded path,
    yields 18 samples: sample k is column k of the data file reshaped to
    (1, 64, 64, 1), paired with the k-th label one-hot encoded to (1, 10).

    Parameters:
        num_start, num_end: slice bounds into the file lists returned by
            read_name_list.
    """
    NUM_CLASSES = 10
    lines1 = []
    path = r'/home/cwx/anaconda3/envs/GTK_Files/Labels_every_single_patch_0924'
    name_lists, label_lists = read_name_list(path)
    for label_path, data_path in zip(label_lists[num_start:num_end], name_lists[num_start:num_end]):
        # Read the label file; as in the original, only the last line is
        # kept if the file contains several.
        with open(label_path, 'r') as op1:
            for la in op1.readlines():
                lines1 = str(la).replace("\n", "").split(",")
        with open(data_path, 'r') as op2:
            # Split each line once, up front — the original re-split every
            # line 18 times (once per patch index).
            rows = [str(line).replace("\n", "").split(" ") for line in op2.readlines()]
        for index in range(18):
            # Column `index` of the file, one (string) token per line,
            # assumed to hold 64*64 = 4096 lines — TODO confirm.
            column = [row[index:index + 1] for row in rows]
            data = np.array(column).reshape(1, 64, 64, 1)
            label = int(lines1[index])
            # One-hot with numpy instead of tf.one_hot + a fresh tf.Session
            # per sample: building graph nodes inside a loop grows the graph
            # and leaks memory (see the note at the bottom of this file).
            label1 = np.zeros((1, NUM_CLASSES), dtype=np.int32)
            label1[0, label] = 1
            yield (data, label1)
def create_train_dataset(batch_size):
    """Build a one-shot TF1 dataset over file_readline(0, 954).

    Parameters:
        batch_size: number of generator samples per batch.
    Returns:
        A one-shot iterator whose elements batch ((1, 64, 64, 1) float32
        patches, (1, 10) int32 one-hot labels).
    """
    # Samples 0..953 of the generator form the training set.
    data = tf.data.Dataset.from_generator(file_readline, (tf.float32, tf.int32),
                                          (tf.TensorShape([1,64,64,1]), tf.TensorShape([1,10])),
                                          args=(0,954))
    data = data.batch(batch_size)
    # One-shot iterator: no explicit initializer op needed.
    train_data = data.make_one_shot_iterator()
    #data=data.make_initializer()
    return train_data
成文本读取方式:
# Pair label/data files listed in the pre-shuffled CSV, then split the pairs
# into train / development / test DataFrames and plain path lists.
label_data_dir = ('/home/cwx/anaconda3/envs/GTK_Files/Labels_every_single_patch_0924/')
#label_data_dir = ('/home/a204/cwx/3D data/Neuron Testing Images/Testing Label_Bigneuron/')
#label_data_dir = ('/media/a204/E0D490E80EC13EE0/cwx/labels_3d_block/')
point_class_name = ('Neuron Termination/','Neuron Branch/','Neuron Non_Critical/','Neuron Cross/')
# Column 1 of the shuffled CSV holds per-sample directory paths.
z=pd.read_csv(label_data_dir + 'train_data_shuffled_MouseBrain.csv',header=None)
child_list=z[1].values
name_list = []
label_list= []
for child_list1 in child_list:
    b=[]
    for child_list2 in os.listdir(child_list1):
        b.append(child_list2)
    # Keep only directories with exactly two files: the first entry becomes
    # the label path and the second the data path.
    # NOTE(review): os.listdir order is filesystem-dependent — confirm the
    # file naming guarantees this pairing.
    if len(b)==2:
        label_list.append(os.path.join(child_list1,b[0]))
        name_list.append(os.path.join(child_list1,b[1]))
data_dict={'name_list':name_list,'label_list':label_list}
data_dict_df = pd.DataFrame(data_dict)
# Positional 80/10/10 train/development/test split.
cut_idx0 = int(0.8 * data_dict_df.shape[0])
cut_idx1 = int(0.9 * data_dict_df.shape[0])
df_train,df_development,df_test = data_dict_df.iloc[:cut_idx0], data_dict_df.iloc[cut_idx0:cut_idx1],data_dict_df.iloc[cut_idx1:]#845,106,106
len_train=len(df_train)
len_development=len(df_development)
len_test=len(df_test)
# Flatten each split into plain Python lists of paths for downstream use.
df_train_name_list=list(df_train['name_list'])
df_train_label_list=list(df_train['label_list'])
df_development_name_list=list(df_development['name_list'])
df_development_label_list=list(df_development['label_list'])
df_test_name_list=list(df_test['name_list'])
df_test_label_list=list(df_test['label_list'])
def file_readline(name_list,label_list):
    """Generator yielding one (data, one_hot_labels) pair per file pair.

    Parameters:
        name_list: paths of data files; each holds lines of space-separated
            numbers whose transposed contents reshape to (18, 4096).
        label_list: paths of label files; each holds comma-separated integer
            class ids in [0, 10) — 18 of them per file.
    Yields:
        (np.ndarray float32 shape (18, 4096), np.ndarray shape (18, 10)).
    """
    NUM_CLASSES = 10
    # Renamed loop variables: the original rebound the parameters
    # (label_list / name_list) inside the loop, shadowing them.
    for label_path, data_path in zip(label_list, name_list):
        # Read the label file. As in the original, if the file has several
        # lines only the last one is used.
        with open(label_path, 'r') as label_file:
            last_line = ""
            for raw in label_file:
                last_line = raw
            labels = np.array([int(tok) for tok in last_line.replace("\n", "").split(",")])
        # Vectorized one-hot encoding (replaces the manual flat-index fill).
        labels_onehot = np.zeros((labels.shape[0], NUM_CLASSES))
        labels_onehot[np.arange(labels.shape[0]), labels] = 1
        # Read the data file: one token row per line, then transpose so each
        # of the 18 output rows is one column of the file.
        with open(data_path, 'r') as data_file:
            rows = [line.replace("\n", "").split(" ") for line in data_file]
        data = np.array(rows, dtype=np.float32).T.reshape(18, 4096)
        yield (data, labels_onehot.reshape(18, 10))
复杂文本转csv方式:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import numpy as np
import tensorflow as tf
import pandas as pd
# Scan the four class directories, append one CSV row per data path with its
# numeric class label, then shuffle the accumulated CSV into the training CSV.
label_data_dir = ('/home/cwx/anaconda3/envs/GTK_Files/Labels_every_single_patch_0924/')
#label_data_dir = ('/home/a204/cwx/3D data/Neuron Testing Images/Testing Label_Bigneuron/')
#label_data_dir = ('/media/a204/E0D490E80EC13EE0/cwx/labels_3d_block/')
point_class_name = ('Neuron Termination/','Neuron Branch/','Neuron Non_Critical/','Neuron Cross/')
# Numeric label assigned to each class directory, in point_class_name order.
label_ind = list([2.0, 0.0, 1.0, 3.0])
label_len = []
for jj in range(len(point_class_name)):
    all_data_dir = label_data_dir + point_class_name[jj]
    train_path_list=os.listdir(all_data_dir)
    #for kk in range(len(train_path_list)):
    # img_name = train_path_list[kk]
    # if img_name == 'suplement':
    # tmp_path = all_data_dir + 'suplement/'
    # del train_path_list[kk]
    # break
    #train_path_list2 = os.listdir(tmp_path)
    #for jj in train_path_list2:
    # train_path_list.append('suplement/' + jj)
    train_data_paths = []
    train_data_paths2 = []
    train_data_paths_temp=[]
    train_label=[]
    labels = label_ind[jj]
    # First level: case directories under the class directory.
    for case in train_path_list:
        train_data_paths.append(os.path.join(all_data_dir,case))
    # Second level: one label value recorded per file found inside a case.
    for case1 in train_data_paths:
        train_data_list2=os.listdir(case1)
        for case2 in train_data_list2:
            train_data_paths_temp.append(os.path.join(case1,case2))
            train_label.append(labels) #train: 0.0=branch(501), 1.0=non_critical(1724), 2.0=termination(505), 3.0=cross(300) total=3030
            #test: 0.0=branch(331), 1.0=non_critical(261), 2.0=termination(381), 3.0=cross(138) total=1111
    df = pd.DataFrame(train_label)
    df.rename(columns={0: 'label'}, inplace = True)
    df['data path']=train_data_paths_temp
    # NOTE(review): mode='a' appends per class AND per script run — re-running
    # this script duplicates every row in the origin CSV; delete it first.
    df.to_csv(label_data_dir + 'train_data_origin_MouseBrain.csv', mode='a',header=None, index=None)
    label_len.append(len(train_label))
# Shuffle all accumulated rows (sample(frac=1) is a full random permutation)
# and write the shuffled training CSV.
z=pd.read_csv(label_data_dir + 'train_data_origin_MouseBrain.csv',header=None)
z=z.sample(frac=1)
z.to_csv(label_data_dir + 'train_data_shuffled_MouseBrain.csv',header=None, index=None)
生成器方式
注意事项:
1.变量初始化要注意位置,要放在变量的后面
2.复杂文本结果,可用pandas和csv解决
3.正常代码不会占用太多内存,即内存不会上升,若爆满,请注意是否是循环时使用tf类似的函数,会不停的生成节点,产生无用数据