The CNN-related code is shown below.

Code for part c

Data processing: modify the code in c_1_data_process.py as follows:

from methods import *
from c_config import *

if __name__ == "__main__":

	#generate the augmented data sets

	# for size_folder in size_folders:

	dataset_folders = ['../data/train/' + s for s in datasets]

	#for each dataset
	for dataset_folder in dataset_folders:
		train_orig = dataset_folder + '/train.txt'

		#for each n_aug value
		for num_aug in num_aug_list:

			output_file = dataset_folder + '/train_' + str(num_aug) + '.txt'

			#generate the augmented data
			if num_aug > 4 and '4_full/pc' in train_orig:
				gen_standard_aug(train_orig, output_file, num_aug=4)
			else:
				gen_standard_aug(train_orig, output_file, num_aug=num_aug)

		#generate the vocab dictionary
		word2vec_pickle = dataset_folder + '/word2vec.p'
		gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
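
For reference, gen_standard_aug and gen_vocab_dicts both come from methods.py, which is not shown in this post. A minimal sketch of what gen_vocab_dicts might look like, assuming it scans every txt file in the dataset folder and pickles only the GloVe vectors for words that actually appear (the file layout and exact behavior here are assumptions, not the real implementation):

import os
import pickle
import numpy as np

def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):

	#collect the vocabulary of every txt file in the dataset folder (assumed layout)
	vocab = set()
	for txt_file in [f for f in os.listdir(folder) if f.endswith('.txt')]:
		for line in open(os.path.join(folder, txt_file), 'r'):
			for word in line.split():
				vocab.add(word)

	#keep only the glove vectors for words that actually occur in the data
	word2vec = {}
	for line in open(huge_word2vec, 'r'):
		parts = line.rstrip().split(' ')
		if parts[0] in vocab:
			word2vec[parts[0]] = np.asarray(parts[1:], dtype=float)

	#pickle the filtered dictionary so training only loads what it needs
	pickle.dump(word2vec, open(output_pickle_path, 'wb'))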

Model training code: modify c_2_train_eval.py as follows:

from c_config import * 
from methods import *
from numpy.random import seed


import sys
import os
import gc
import tensorflow as tf
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# sd = int(sys.argv[1])
# random.seed(sd)

###############################
#### run model and get acc ####
###############################

def run_model(train_file, test_file, num_classes, percent_dataset):

	#initialize model
	model = build_cnn(input_size, word2vec_len, num_classes)

	#load data
	train_x, train_y = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset)
	test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)

	#implement early stopping
	callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

	#train model
	model.fit(	train_x, 
				train_y, 
				epochs=100000, 
				callbacks=callbacks,
				validation_split=0.1, 
				batch_size=64, 
				shuffle=True, 
				verbose=0)
	#model.save('checkpoints/lol')
	#model = load_model('checkpoints/lol')

	#evaluate model
	y_pred = model.predict(test_x)
	test_y_cat = one_hot_to_categorical(test_y)
	y_pred_cat = one_hot_to_categorical(y_pred)
	acc = accuracy_score(test_y_cat, y_pred_cat)

	#free the training data to reclaim memory
	train_x, train_y = None, None
	gc.collect()

	#return the accuracy
	#print("data with shape:", train_x.shape, train_y.shape, 'train=', train_file, 'test=', test_file, 'with fraction', percent_dataset, 'had acc', acc)
	return acc

###############################
############ main #############
###############################

if __name__ == "__main__":
	
	os.makedirs('./outputs_f3', exist_ok=True)  #make sure the output folder exists
	writer = open('./outputs_f3/' + get_now_str() + '.txt', 'w')

	#for each size dataset
	# for size_folder in size_folders:

	# 	writer.write(size_folder + '\n')

	#get all six datasets
	dataset_folders = ['../data/train/' + s for s in datasets]

	#for storing the performances
	performances = {num_aug:[] for num_aug in num_aug_list}
	#for each dataset
	for i in range(len(dataset_folders)):
		print('dataset_folders: ', i)
		#initialize all the variables
		dataset_folder = dataset_folders[i]
		dataset = datasets[i]
		num_classes = num_classes_list[i]
		input_size = input_size_list[i]
		word2vec_pickle = dataset_folder + '/word2vec.p'
		word2vec = load_pickle(word2vec_pickle)

		#test each num_aug value
		for num_aug in num_aug_list:
			# train_path = dataset_folder + '/train_' + str(num_aug) + '.txt'

			#note: this trains on the original file for every num_aug value; use the commented line above to train on the augmented files instead
			train_path = dataset_folder + '/train.txt'
			test_path = '../data/test/' + dataset + '/test.txt'
			acc = run_model(train_path, test_path, num_classes, percent_dataset=1)
			performances[num_aug].append(acc)
			writer.write(train_path + ',' + str(acc) + '\n')

	writer.write(str(performances) + '\n')

	for num_aug in performances:
		line = str(num_aug) + ' : ' + str(sum(performances[num_aug])/len(performances[num_aug]))
		writer.write(line + '\n')
		print(line)
	print(performances)

	writer.close()
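
The training script relies on build_cnn, get_x_y, and one_hot_to_categorical from methods.py. A minimal sketch of what build_cnn might look like, assuming a Keras Conv1D model over padded sequences of word vectors (the layer widths and kernel size are assumptions, not necessarily the exact architecture):

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense

def build_cnn(input_size, word2vec_len, num_classes):
	model = Sequential()
	#convolve over the sequence of word vectors
	model.add(Conv1D(128, 5, activation='relu', input_shape=(input_size, word2vec_len)))
	#collapse the sequence dimension into a fixed-size feature vector
	model.add(GlobalMaxPooling1D())
	model.add(Dense(20, activation='relu'))
	#softmax over the num_classes labels
	model.add(Dense(num_classes, activation='softmax'))
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

Sentences shorter than input_size would be zero-padded by get_x_y, so the fixed input_shape still works for variable-length text.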

Data processing code for part d: d_0_preprocess.py

from methods import *

def generate_short(input_file, output_file, alpha):
	lines = open(input_file, 'r').readlines()
	increment = max(1, int(len(lines)/alpha))  #guard against a zero step when alpha exceeds the line count
	lines = lines[::increment]  #keep roughly alpha evenly spaced lines
	writer = open(output_file, 'w')
	for line in lines:
		writer.write(line)
	writer.close()

if __name__ == "__main__":

	#global params
	huge_word2vec = 'word2vec/glove.840B.300d.txt'
	datasets = ['pc']#, 'trec']

	for dataset in datasets:

		dataset_folder = '../data/train/' + dataset
		test_short = '../data/test/' + dataset + '/test.txt'
		test_aug_short = dataset_folder + '/test.txt'
		word2vec_pickle = dataset_folder + '/word2vec.p' 

		#augment the data
		# gen_tsne_aug(test_short, test_aug_short)

		#generate the vocab dictionaries
		gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
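
Note that generate_short is defined but never called here (the gen_tsne_aug call is commented out). It keeps roughly alpha evenly spaced lines from the input file, so a hypothetical invocation to cut the pc training set down to about 100 examples would be (the output path is made up for illustration):

generate_short('../data/train/pc/train.txt', '../data/train/pc/train_short.txt', alpha=100)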

Training code: d_1_train_models.py


from methods import *
from numpy.random import seed
import os
import gc
seed(0)

###############################
#### run model and get acc ####
###############################

def run_model(train_file, test_file, num_classes, model_output_path):

	#initialize model
	model = build_model(input_size, word2vec_len, num_classes)

	#load data
	train_x, train_y = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, 1)
	test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)

	#implement early stopping
	callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

	#train model
	model.fit(	train_x, 
				train_y, 
				epochs=100000, 
				callbacks=callbacks,
				validation_split=0.1, 
				batch_size=1024, 
				shuffle=True, 
				verbose=0)

	#save the model
	model.save(model_output_path)
	#model = load_model('checkpoints/lol')

	#evaluate model
	y_pred = model.predict(test_x)
	test_y_cat = one_hot_to_categorical(test_y)
	y_pred_cat = one_hot_to_categorical(y_pred)
	acc = accuracy_score(test_y_cat, y_pred_cat)

	#free the training data to reclaim memory
	train_x, train_y = None, None
	gc.collect()

	#return the accuracy
	#print("data with shape:", train_x.shape, train_y.shape, 'train=', train_file, 'test=', test_file, 'with fraction', percent_dataset, 'had acc', acc)
	return acc

if __name__ == "__main__":

	#parameters
	os.makedirs('outputs_f4', exist_ok=True)  #make sure the model output folder exists
	dataset_folders = ['../data/train/pc/'] #['increment_datasets_f2/trec', 'increment_datasets_f2/pc']
	output_paths = ['outputs_f4/pc_aug.h5'] #['outputs_f4/trec_aug.h5', 'outputs_f4/pc_aug.h5']
	num_classes_list = [2] # [6, 2]
	input_size_list = [25] # [25, 25]

	#word2vec dictionary
	word2vec_len = 300

	for i, dataset_folder in enumerate(dataset_folders):

		num_classes = num_classes_list[i]
		input_size = input_size_list[i]
		output_path = output_paths[i]
		train_orig = dataset_folder + 'train.txt'
		test_path = '../data/test/pc/test.txt'
		word2vec_pickle = dataset_folder + 'word2vec.p'
		word2vec = load_pickle(word2vec_pickle)

		#train model and save
		acc = run_model(train_orig, test_path, num_classes, output_path)
		print(dataset_folder, acc)
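
Both training scripts lean on get_x_y from methods.py to turn a labelled text file into arrays. A minimal sketch, assuming each line has the form "<label>\t<sentence>" and that percent_dataset keeps a fraction of the lines from the front of the file (both assumptions):

import numpy as np

def get_x_y(file_path, num_classes, word2vec_len, input_size, word2vec, percent_dataset):

	#assumed line format: "<label>\t<sentence>"
	lines = open(file_path, 'r').readlines()
	lines = lines[:int(len(lines) * percent_dataset)]  #subsample a fraction of the data

	x = np.zeros((len(lines), input_size, word2vec_len))
	y = np.zeros((len(lines), num_classes))

	for i, line in enumerate(lines):
		label, sentence = line.strip().split('\t')
		#embed up to input_size tokens; unknown words stay as zero vectors
		for j, word in enumerate(sentence.split(' ')[:input_size]):
			if word in word2vec:
				x[i, j, :] = word2vec[word]
		y[i, int(label)] = 1.0  #one-hot label

	return x, y

one_hot_to_categorical is then presumably just the inverse mapping, something like np.argmax(y, axis=1), so accuracy_score can compare integer class labels.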