Text Classification with Four Fused Systems (TextClassify_FourSystems)

Overview

This project implements an NLP text-classification task. The current system fuses four models: a character-level (char) CNN and RCNN, and a word-level (word2vec) CNN and RCNN. The four outputs are fused by weighted averaging.
GitHub: https://github.com/Tian14267/TextClassify_FourSystems
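
As a minimal sketch of the fusion rule (the weights here are illustrative; the training code below effectively uses equal weights by summing the four logits, and a constant factor does not change the predicted class):

# Hypothetical weighted-average fusion of the four models' logits;
# variable names follow the training code further down.
w = [0.25, 0.25, 0.25, 0.25]
logits = (w[0] * logits_cnn + w[1] * logits_rcnn
          + w[2] * logits_RCNN + w[3] * logits_CNN)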

Environment

python3
tensorflow 1.x (the code uses tf.placeholder and tensorflow.contrib.keras, which require TensorFlow 1.x)

Data

The data is the public cnews dataset, with 10 classes, split into training, validation, and test sets. The classes are:

体育 (sports), 财经 (finance), 房产 (real estate), 家居 (home & living), 教育 (education), 科技 (technology), 时尚 (fashion), 时政 (politics), 游戏 (games), 娱乐 (entertainment)

The training set has 5,000 examples per class, 50,000 in total; the validation set has 500 per class and the test set 1,000 per class. The files are as follows:
cnews.train.txt: training set (50,000 lines)
cnews.val.txt: validation set (5,000 lines)
cnews.test.txt: test set (10,000 lines)
Data download:
Link: https://pan.baidu.com/s/1vEWbsIcBW_vgRJuYeLtj9Q
Extraction code: 8czc
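
Each line of these files holds one example, with the label and the news text separated by a tab; this is what the split('\t') in the training code below assumes. Schematically (the article text here is illustrative):

体育	<news article text ...>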

Train

Here is the training part of the code:

#coding: utf-8
import numpy as np
import tensorflow as tf
from tools import tools
import jieba
import os
import time
import logging
from model_net import TextMix_C_W_model as TM
import tensorflow.contrib.keras as kr
import config
os.environ['CUDA_VISIBLE_DEVICES']='1'

flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_float("learning_rate", 0.001, "learning rate")
flags.DEFINE_string("stop_word", "./data_pre/stop_word.txt", "stop word data path")
flags.DEFINE_string("word2vec", "./data_pre/word2voc_all.bin", "word2vec data path")
flags.DEFINE_boolean("model_restore", False, "whether to restore model")

flags.DEFINE_string("vocab_dir", "./data_pre/cnews_vocab.txt", "vocab data path")
flags.DEFINE_string("train_data", "../data/cnews.train.txt", "training data path")
flags.DEFINE_string("val_data", "../data/cnews.val.txt", "val data path")
flags.DEFINE_string("test_data", "../data/cnews.test.txt", "test data path")


def preprocess_data(vocab_data, word2vec, stop_word_dir, train_data, sequence_length):
	######  build the char vocabulary dictionary ###################
	vocab_list = tools.read_file(vocab_data)
	vocabulary_word2index_char = {}  ### char -> index
	for i, vocab in enumerate(vocab_list):
		vocabulary_word2index_char[vocab] = i

	######  load the pre-trained word2vec vectors ###################
	word2vec_list = tools.read_file(word2vec)
	vocabulary_word2index_word = {}  ### word -> vector
	len_vec = 0
	for i, one_word2vec in enumerate(word2vec_list):
		if i == 0:
			continue  # first line of a text-format word2vec file is the header (vocab size, dimension)
		word2vec_split = one_word2vec.split(' ')
		words = word2vec_split[0]
		vecs = np.array(word2vec_split[1:])
		vocabulary_word2index_word[words] = vecs
		len_vec = len(vecs)  # vector dimension

	######## encode the text  ##############################
	train_data_lines = tools.read_file(train_data)
	categories, cat_to_id = tools.label_dict()
	X_char = []
	Y = []
	total_data = []
	for line in train_data_lines:
		label, content = line.strip().split('\t')
		contents = [vocabulary_word2index_char.get(e, 0) for e in content]
		label = cat_to_id[label]
		X_char.append(contents)
		Y.append(label)

		content_cut = jieba.cut(content, cut_all=False)  ## jieba word segmentation
		content_cut_smooth = tools.data_smooth(content_cut, stop_word_dir)  ## stop-word filtering
		vec_line = np.zeros([sequence_length, len_vec])  ## pre-allocate a zero matrix
		i = 0
		for word_content in content_cut_smooth:
			if word_content in vocabulary_word2index_word.keys():
				i += 1
				if i > sequence_length:  ## fixed-length truncation ("> sequence_length - 1" would leave the last row always zero)
					break
				vec_content = vocabulary_word2index_word[word_content]  ### look up the vector for one word
				vec_content_np = np.array(vec_content, dtype=float)
				vec_line[i - 1] = vec_content_np
		total_data.append(vec_line)
	X = kr.preprocessing.sequence.pad_sequences(X_char, sequence_length)  ### pad/truncate char ids to fixed length
	X_word = np.array(total_data)
	Y_all = kr.utils.to_categorical(Y, num_classes=len(cat_to_id))  ### one-hot labels

	return X, X_word, Y_all

def TextMix_train():
	###########  load data  ###################
	if not os.path.exists(FLAGS.vocab_dir):
		words = tools.build_vocab(train_data=FLAGS.train_data, vocab_dir=FLAGS.vocab_dir)  ### build the vocabulary
	else:
		words = tools.read_file(FLAGS.vocab_dir)
	vocab_size = len(words)

	train_char_X, train_word_X,train_Y = preprocess_data(vocab_data=FLAGS.vocab_dir,word2vec=FLAGS.word2vec,
					stop_word_dir=FLAGS.stop_word,train_data=FLAGS.train_data,sequence_length=config.seq_length)

	val_char_X, val_word_X, val_Y = preprocess_data(vocab_data=FLAGS.vocab_dir, word2vec=FLAGS.word2vec,
														  stop_word_dir=FLAGS.stop_word, train_data=FLAGS.val_data,
														  sequence_length=config.seq_length)

	print("Data deal down!")
	###########################################

	input_x_char = tf.placeholder(tf.int32, [None, config.seq_length], name='input_x_char')
	input_x_word = tf.placeholder(tf.float32, [None, config.seq_length, config.word2vec_size], name='input_x_word')  ### give the two inputs distinct names (both were 'input_x')
	input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')

	#########  Char Model  ####################################################
	textcnn = TM.TextCNN_Char(config, vocab_size, keep_prob=config.dropout_keep_prob)
	textrcnn = TM.TextRCNN_Char(config, vocab_size, keep_prob=config.dropout_keep_prob)
	logits_cnn = textcnn.cnn_inference(input_x_char)  ### (?,10)
	logits_rcnn = textrcnn.RCNN_inference(input_x_char)  ### (?,10)

	########  Word Model  ########################################################
	textrcnn_word = TM.TextRCNN_word(config, keep_prob=config.dropout_keep_prob)
	logits_RCNN = textrcnn_word.RCNN_inference(input_x_word)  ### (?,10)
	textcnn_word = TM.Text_word_CNN(config, keep_prob=config.dropout_keep_prob)
	logits_CNN = textcnn_word.cnn(input_x_word)  ### (?,10)

	###### model fusion: summing the four logits equals an equal-weight average up to a constant factor
	logits = logits_cnn + logits_rcnn + logits_RCNN + logits_CNN

	############# compute loss and acc ######################################
	loss = TM.Loss(logits=logits, label=input_y)
	acc = TM.Acc(logits=logits, labels=input_y)

	global_step = tf.Variable(0, name='global_step', trainable=False)
	#learning_rate = FLAGS.learning_rate
	learning_rate = tf.train.exponential_decay(
		learning_rate=FLAGS.learning_rate,
		global_step=global_step,
		decay_steps=2500,
		decay_rate=0.5,
		staircase=True)

	optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss=loss,global_step=global_step)
	#optim = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=0.9).minimize(loss=loss,global_step=global_step)

	tensorboard_dir = 'tensorboard/textcnn_word'
	tf.summary.scalar("loss", loss)
	merged_summary = tf.summary.merge_all()
	writer = tf.summary.FileWriter(tensorboard_dir)

	saver = tf.train.Saver(max_to_keep=3)  ### keep at most the 3 most recent checkpoints
	model_save_dir = 'checkpoints/'
	train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
	model_name = 'TextMixNet_{:s}.ckpt'.format(str(train_start_time))
	model_save_path = os.path.join(model_save_dir, model_name)

	logging.basicConfig(filename='./checkpoints/' + model_name + '.log',
						format='%(asctime)s - %(pathname)s - %(levelname)s: %(message)s',
						level=logging.DEBUG, filemode='a', datefmt='%Y-%m-%d %I:%M:%S %p')

	sess_config = tf.ConfigProto(allow_soft_placement=True)
	sess_config.gpu_options.allow_growth = True
	sess = tf.Session(config=sess_config)

	with sess.as_default():
		if FLAGS.model_restore:
			weights_path = './checkpoints/TextWordRCNNnet_2019-11-06-16-07-55.ckpt-12500'
			saver.restore(sess=sess, save_path=weights_path)
			step = sess.run(global_step)  # resume the step counter from the restored checkpoint
			writer.add_graph(sess.graph)
			infor = '##### Restore model : ' + weights_path + '  ########'
			logging.info(infor)
			print(infor)
		else:
			step = 0
			init = tf.global_variables_initializer()
			sess.run(init)
			writer.add_graph(sess.graph)
		print('First step is:', step)
		num_batch = int((len(train_char_X) - 1) / config.batch_size) + 1  ### total number of batches
		best_acc = 0  ### best validation accuracy so far
		for epoch in range(config.epochs):
			batch_train = tools.batch_iter_Mix(train_char_X, train_word_X, train_Y, config.batch_size)  ### generate training batches
			last_learn_rate = FLAGS.learning_rate
			for x_char_batch,x_word_batch, y_batch in batch_train:
				step += 1
				_, learn_rate, train_loss_value, train_pred, train_acc, merge_summary_value = sess.run(
					[optim, learning_rate, loss, logits, acc, merged_summary],
					feed_dict={input_x_char: x_char_batch,input_x_word:x_word_batch, input_y: y_batch})
				############# report learning-rate decay
				if last_learn_rate != learn_rate:
					information = '############ New Learning_Rate {:6f} in step {:d}  ###########'.format(learn_rate, step)
					logging.info(information)
					print(information)
					last_learn_rate = learn_rate
				if step % 10 == 0:
					information = '## Epoch {:d} Step_Train / Total_Batch: {:d} / {:d}  train_loss= {:5f} train_acc={:5f}'. \
						format(epoch, step, num_batch, train_loss_value, train_acc)
					print(information)  ### print to screen
					logging.info(information)  ### write to the log file

				if step % 500 == 0:  ### validate every 500 steps and save the best model
					val_acc_all = 0
					val_loss_all = 0
					val_step = 0
					batch_val = tools.batch_iter_Mix(val_char_X, val_word_X, val_Y, config.batch_size)  ### generate validation batches
					for x_char_val, x_word_val, y_val in batch_val:
						if x_char_val.shape[0] < config.batch_size:
							pass  ### skip incomplete batches
						else:
							### evaluation only: do not run the optimizer here,
							### or the model would also train on the validation set
							val_loss_value, val_pred, val_acc, merge_summary_value = sess.run(
								[loss, logits, acc, merged_summary],
								feed_dict={input_x_char: x_char_val, input_x_word: x_word_val, input_y: y_val})
							writer.add_summary(merge_summary_value, step)
							val_acc_all = val_acc_all + val_acc
							val_loss_all = val_loss_all + val_loss_value
							val_step += 1
					ave_acc = val_acc_all / val_step
					ave_loss = val_loss_all / val_step
					information = "############## Val_loss = {:5f}   Val_acc = {:5f}   #####################". \
						format(ave_loss, ave_acc)
					print(information)  ### print to screen
					logging.info(information)  ### write to the log file

					if (ave_acc - best_acc) > 0.0005:  ### save only when validation accuracy improves
						saver.save(sess, model_save_path, global_step=step)
						best_acc = ave_acc
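
Note that the excerpt above only defines TextMix_train without calling it; a minimal entry point (my assumption, since the full script lives in the repository) would be:

if __name__ == '__main__':
	TextMix_train()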

For the full code, head over to my GitHub. Remember to leave a star!

Here is the word2vec model used by the code; you can also build your own word-vector model from your own dataset. Link:
https://pan.baidu.com/s/1dYRIPVH1N0y0gNOCw8rAuQ Extraction code: g889
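
If you build your own vectors, here is a minimal sketch with gensim 3.x (my assumption; this is not the repository's own script, and newer gensim versions use vector_size= instead of size=). It writes the text word2vec format (header line, then one "word vector..." line per word) that preprocess_data() above parses:

# Train word vectors on the cnews training text, segmented with jieba.
import jieba
from gensim.models import Word2Vec

sentences = []
with open('../data/cnews.train.txt', encoding='utf-8') as f:
	for line in f:
		_, content = line.strip().split('\t')
		sentences.append(list(jieba.cut(content)))

# size must match the config.word2vec_size used by the training code
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.wv.save_word2vec_format('./data_pre/word2voc_all.bin', binary=False)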

The training run looked like this:
[training-log screenshot omitted]

Final Notes

Compared with a single CNN (or any other single system), my four-system fusion improves accuracy by roughly 2 points or more, depending on the application scenario. Correspondingly, it runs somewhat slower than a single system; there is always a trade-off between effectiveness and efficiency.
Finally, the GitHub address:
https://github.com/Tian14267/TextClassify_FourSystems

Everyone is welcome to drop by to exchange ideas, learn, and improve together.
