from methods import *
from c_config import *

if __name__ == "__main__":

    # generate the augmented datasets
    # for size_folder in size_folders:

    dataset_folders = ['../data/train/' + s for s in datasets]

    # for each dataset
    for dataset_folder in dataset_folders:

        train_orig = dataset_folder + '/train.txt'

        # for each num_aug value
        for num_aug in num_aug_list:

            output_file = dataset_folder + '/train_' + str(num_aug) + '.txt'

            # generate the augmented data
            if num_aug > 4 and '4_full/pc' in train_orig:
                gen_standard_aug(train_orig, output_file, num_aug=4)
            else:
                gen_standard_aug(train_orig, output_file, num_aug=num_aug)

        # generate the vocab dictionary
        word2vec_pickle = dataset_folder + '/word2vec.p'
        gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
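gen_standard_aug is imported from methods and not shown here. As a rough guide, here is a minimal sketch of what it could look like, assuming each line of train.txt is formatted as "label\tsentence" and that an eda(sentence, num_aug) helper (as in the eda_nlp repo) returns a list of augmented sentences; both are assumptions, not the repo's verified code:

# Hypothetical sketch -- the real implementation lives in methods.py.
# Assumes "label\tsentence" lines and an eda() helper that returns
# num_aug augmented variants of a sentence.
def gen_standard_aug(train_orig, output_file, num_aug=9):
    writer = open(output_file, 'w')
    for line in open(train_orig, 'r'):
        label, sentence = line.rstrip('\n').split('\t')
        for aug_sentence in eda(sentence, num_aug=num_aug):  # assumed helper
            writer.write(label + '\t' + aug_sentence + '\n')
    writer.close()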
Model training code: modify c_2_train_eval.py as follows:
from c_config import *
from methods import *
from numpy.random import seed
import sys
import os
import tensorflow as tf

# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# sd = int(sys.argv[1])
# random.seed(sd)

################################### run model and get acc ###################################

def run_model(train_file, test_file, num_classes, percent_dataset):

    # initialize model
    model = build_cnn(input_size, word2vec_len, num_classes)

    # load data
    train_x, train_y = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset)
    test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)

    # implement early stopping
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

    # train model
    model.fit(train_x,
              train_y,
              epochs=100000,
              callbacks=callbacks,
              validation_split=0.1,
              batch_size=64,
              shuffle=True,
              verbose=0)

    # model.save('checkpoints/lol')
    # model = load_model('checkpoints/lol')

    # evaluate model
    y_pred = model.predict(test_x)
    test_y_cat = one_hot_to_categorical(test_y)
    y_pred_cat = one_hot_to_categorical(y_pred)
    acc = accuracy_score(test_y_cat, y_pred_cat)

    # clean memory
    train_x, train_y = None, None
    gc.collect()

    # return the accuracy
    # print("data with shape:", train_x.shape, train_y.shape, 'train=', train_file, 'test=', test_file, 'with fraction', percent_dataset, 'had acc', acc)
    return acc
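one_hot_to_categorical also comes from methods. Judging by how it is used above (both on one-hot labels and on predicted probabilities before accuracy_score), it most likely collapses each row to its argmax class index; a plausible sketch under that assumption:

import numpy as np

# Hypothetical sketch of the helper used above: map each row of a
# one-hot / probability matrix to its argmax class index.
def one_hot_to_categorical(y):
    return np.argmax(y, axis=1)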
########################################### main ############################################

if __name__ == "__main__":

    writer = open('./outputs_f3/' + get_now_str() + '.txt', 'w')

    # for each size dataset
    # for size_folder in size_folders:
    #     writer.write(size_folder + '\n')

    # get all six datasets
    dataset_folders = ['../data/train/' + s for s in datasets]

    # for storing the performances
    performances = {num_aug: [] for num_aug in num_aug_list}

    # for each dataset
    for i in range(len(dataset_folders)):

        print('dataset_folders: ', i)

        # initialize all the variables
        dataset_folder = dataset_folders[i]
        dataset = datasets[i]
        num_classes = num_classes_list[i]
        input_size = input_size_list[i]
        word2vec_pickle = dataset_folder + '/word2vec.p'
        word2vec = load_pickle(word2vec_pickle)

        # test each num_aug value
        for num_aug in num_aug_list:

            # train_path = dataset_folder + '/train_' + str(num_aug) + '.txt'
            train_path = dataset_folder + '/train.txt'
            test_path = '../data/test/' + dataset + '/test.txt'
            acc = run_model(train_path, test_path, num_classes, percent_dataset=1)
            performances[num_aug].append(acc)
            writer.write(train_path + ',' + str(acc) + '\n')

    writer.write(str(performances) + '\n')
    for num_aug in performances:
        line = str(num_aug) + ' : ' + str(sum(performances[num_aug]) / len(performances[num_aug]))
        writer.write(line + '\n')
        print(line)
    print(performances)
    writer.close()
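build_cnn is defined in methods.py and not reproduced in this post. For orientation, here is a minimal sketch of a Keras model compatible with how it is called above (inputs of shape (input_size, word2vec_len), softmax over num_classes); the filter count and layer widths are illustrative assumptions, not the repo's exact values:

from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense

# Hypothetical sketch: a 1D-CNN text classifier over padded word2vec
# sequences. Layer sizes are assumptions.
def build_cnn(sentence_length, word2vec_len, num_classes):
    model = Sequential()
    model.add(Conv1D(128, 5, activation='relu',
                     input_shape=(sentence_length, word2vec_len)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(20, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model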
Data preprocessing code: d_0_preprocess.py
from methods import *

def generate_short(input_file, output_file, alpha):
    # keep roughly alpha evenly spaced lines from the input file
    # (note: assumes alpha <= number of lines, otherwise increment is 0)
    lines = open(input_file, 'r').readlines()
    increment = int(len(lines) / alpha)
    lines = lines[::increment]
    writer = open(output_file, 'w')
    for line in lines:
        writer.write(line)

if __name__ == "__main__":

    # global params
    huge_word2vec = 'word2vec/glove.840B.300d.txt'
    datasets = ['pc']  # , 'trec']

    for dataset in datasets:

        dataset_folder = '../data/train/' + dataset
        test_short = '../data/test/' + dataset + '/test.txt'
        test_aug_short = dataset_folder + '/test.txt'
        word2vec_pickle = dataset_folder + '/word2vec.p'

        # augment the data
        # gen_tsne_aug(test_short, test_aug_short)

        # generate the vocab dictionaries
        gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
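gen_vocab_dicts builds the per-dataset word2vec.p pickle that the training scripts later read with load_pickle. A sketch of the idea, assuming it scans the .txt files in the folder for the vocabulary and keeps only the matching GloVe vectors; the file walking and line parsing here are assumptions (glove.840B even has a few multi-token keys that a robust version would special-case):

import os, pickle
import numpy as np

# Hypothetical sketch: collect the dataset vocabulary, keep only the
# GloVe vectors for words that actually appear, and pickle the dict.
def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
    vocab = set()
    for filename in os.listdir(folder):
        if filename.endswith('.txt'):
            for line in open(os.path.join(folder, filename), 'r'):
                vocab.update(line.split())

    word2vec = {}
    with open(huge_word2vec, 'r') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts[0] in vocab:
                # naive parsing: token first, then 300 floats
                word2vec[parts[0]] = np.asarray(parts[-300:], dtype='float32')

    pickle.dump(word2vec, open(output_pickle_path, 'wb'))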
Training code: d_1_train_models.py
from methods import *
from numpy.random import seed
seed(0)

################################### run model and get acc ###################################

def run_model(train_file, test_file, num_classes, model_output_path):

    # initialize model
    model = build_model(input_size, word2vec_len, num_classes)

    # load data
    train_x, train_y = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, 1)
    test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)

    # implement early stopping
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

    # train model
    model.fit(train_x,
              train_y,
              epochs=100000,
              callbacks=callbacks,
              validation_split=0.1,
              batch_size=1024,
              shuffle=True,
              verbose=0)

    # save the model
    model.save(model_output_path)
    # model = load_model('checkpoints/lol')

    # evaluate model
    y_pred = model.predict(test_x)
    test_y_cat = one_hot_to_categorical(test_y)
    y_pred_cat = one_hot_to_categorical(y_pred)
    acc = accuracy_score(test_y_cat, y_pred_cat)

    # clean memory
    train_x, train_y = None, None

    # return the accuracy
    # print("data with shape:", train_x.shape, train_y.shape, 'train=', train_file, 'test=', test_file, 'had acc', acc)
    return acc
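Note that this script calls build_model rather than the build_cnn used in c_2_train_eval.py, so it trains a different architecture from methods.py. As a rough guide, a compatible recurrent classifier could look like the sketch below; the bi-LSTM choice and all layer sizes are assumptions for illustration:

from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dropout, Dense

# Hypothetical sketch: a bidirectional-LSTM classifier over padded
# word2vec sequences. Layer sizes are assumptions.
def build_model(sentence_length, word2vec_len, num_classes):
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True),
                            input_shape=(sentence_length, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model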
if __name__ == "__main__":

    # parameters
    dataset_folders = ['../data/train/pc/']  # ['increment_datasets_f2/trec', 'increment_datasets_f2/pc']
    output_paths = ['outputs_f4/pc_aug.h5']  # ['outputs_f4/trec_aug.h5', 'outputs_f4/pc_aug.h5']
    num_classes_list = [2]  # [6, 2]
    input_size_list = [25]  # [25, 25]

    # word2vec dictionary
    word2vec_len = 300

    for i, dataset_folder in enumerate(dataset_folders):

        num_classes = num_classes_list[i]
        input_size = input_size_list[i]
        output_path = output_paths[i]
        train_orig = dataset_folder + 'train.txt'  # dataset_folder already ends with '/'
        test_path = '../data/test/pc/test.txt'
        word2vec_pickle = dataset_folder + 'word2vec.p'
        word2vec = load_pickle(word2vec_pickle)

        # train model and save
        acc = run_model(train_orig, test_path, num_classes, output_path)
        print(dataset_folder, acc)
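Because the trained model is saved to outputs_f4/pc_aug.h5, it can be reloaded later for inference without retraining. A short usage sketch, assuming the same get_x_y preprocessing and the pc settings from this script (num_classes=2, word2vec_len=300, input_size=25):

from keras.models import load_model

# Reload the classifier saved above and score the pc test set.
model = load_model('outputs_f4/pc_aug.h5')
test_x, test_y = get_x_y('../data/test/pc/test.txt', 2, 300, 25, word2vec, 1)
print(model.predict(test_x).argmax(axis=1)[:10])  # first ten predicted labels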