#!/usr/bin/env python
# encoding: utf-8
import numpy as np
import tensorflow as tf
from utils import load_w2v, batch_index, load_inputs_twitter, load_word_id_mapping
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('embedding_size', 100, 'dimension of word embedding')
tf.app.flags.DEFINE_integer('batch_size', 8, 'number of example per batch')
tf.app.flags.DEFINE_integer('hidden_dim', 128, 'number of hidden unit')
tf.app.flags.DEFINE_float('learning_rate', 0.001, 'learning rate')
tf.app.flags.DEFINE_integer('num_classes', 2, 'number of distinct class')
tf.app.flags.DEFINE_integer('max_sentence_len', 100, 'max number of tokens per sentence')
tf.app.flags.DEFINE_float('l2_reg', 0.001, 'l2 regularization')
tf.app.flags.DEFINE_integer('display_step', 4, 'number of test display step')
tf.app.flags.DEFINE_integer('num_epochs', 35, 'number of train iter')
tf.app.flags.DEFINE_string('train_file_path', 'data/COAE2015/train.raw', 'training file')
tf.app.flags.DEFINE_string('validate_file_path', 'data/COAE2015/validate.raw', 'validating file')
tf.app.flags.DEFINE_string('test_file_path', 'data/COAE2015/test.raw', 'testing file')
tf.app.flags.DEFINE_string('embedding_file_path', 'data/COAE2015/COAE2015_word_embedding_partial_100.txt', 'embedding file')
tf.app.flags.DEFINE_string('word_id_file_path', 'data/COAE2015/word_id.txt', 'word-id mapping file')
tf.app.flags.DEFINE_string('type', '', "model type: '' (default), 'TD' or 'TC'")
'''
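Reference copies of the data-loading helpers imported from utils above
(load_w2v and load_inputs_twitter), kept here commented out, together with
a small standalone snippet that was used to exercise them: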
def load_w2v(w2v_file, embedding_dim, is_skip=False):
    fp = open(w2v_file, 'r', encoding='utf-8', errors='ignore')
    if is_skip:
        fp.readline()
    w2v = []
    word_dict = dict()
    # index 0 is an all-zero vector that represents absent (out-of-vocabulary) words
    w2v.append([0.] * embedding_dim)
    cnt = 0
    for line in fp:
        cnt += 1
        line = line.split()
        if len(line) != embedding_dim + 1:
            print(cnt)
            print(len(line))
            print('a bad word embedding: {}'.format(line[0]))
            continue
        w2v.append([float(v) for v in line[1:]])
        word_dict[line[0]] = cnt
    w2v = np.asarray(w2v, dtype=np.float32)
    # append the mean of all vectors as the embedding of the target placeholder '$t$'
    w2v = np.row_stack((w2v, np.sum(w2v, axis=0) / cnt))
    print(np.shape(w2v))
    word_dict['$t$'] = (cnt + 1)
    print(word_dict['$t$'], len(w2v))
    return word_dict, w2v
def load_inputs_twitter(input_file, word_id_file, sentence_len, type_='', encoding='utf8'):
    if type(word_id_file) is str:
        word_to_id = load_word_id_mapping(word_id_file)
    else:
        word_to_id = word_id_file
    print('load word-to-id done!')
    x, y, sen_len = [], [], []
    x_r, sen_len_r = [], []
    target_words = []
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()
        # samples come in groups of three lines: sentence (with '$t$'), target phrase, label
        for i in range(0, len(lines), 3):
            target_word = lines[i + 1].lower().split()
            # map(func, seq) applies the function to every element of the sequence
            # and builds a list from the return values
            target_word = list(map(lambda w: word_to_id.get(w, 0), target_word))
            target_words.append([target_word[0]])  # only the first target token is kept; this line was flagged as problematic
            y.append(lines[i + 2].strip().split()[0])
            words = lines[i].lower().split()
            words_l, words_r = [], []
            # split the sentence into left/right context around the '$t$' placeholder
            flag = True
            for word in words:
                if word == '$t$':
                    flag = False
                    continue
                if flag:
                    if word in word_to_id:
                        words_l.append(word_to_id[word])
                else:
                    if word in word_to_id:
                        words_r.append(word_to_id[word])
            if type_ == 'TD' or type_ == 'TC':
                words_l.extend(target_word)
                sen_len.append(len(words_l))
                x.append(words_l + [0] * (sentence_len - len(words_l)))
                tmp = target_word + words_r
                tmp.reverse()
                sen_len_r.append(len(tmp))
                x_r.append(tmp + [0] * (sentence_len - len(tmp)))
            else:
                words = words_l + target_word + words_r
                sen_len.append(len(words))
                x.append(words + [0] * (sentence_len - len(words)))
    y = change_y_to_onehot(y)
    if type_ == 'TD':
        return np.asarray(x), np.asarray(sen_len), np.asarray(x_r), \
            np.asarray(sen_len_r), np.asarray(y)
    elif type_ == 'TC':
        return np.asarray(x), np.asarray(sen_len), np.asarray(x_r), \
            np.asarray(sen_len_r), np.asarray(y), np.asarray(target_words)
    else:
        return np.asarray(x), np.asarray(sen_len), np.asarray(y)
w2v_file = '/home/liangtianxin/folder/TextCnn_boe/TD-textcnn/data/COAE2015_tc/COAE2015_word_embedding_partial_100.txt'
embedding_dim = 100
word_id_mapping, w2v = load_w2v(w2v_file, embedding_dim)
# input_file = '/home/liangtianxin/folder/TextCnn_boe/TD-textcnn/data/COAE2015_tc/test.raw'
input_file = '/home/liangtianxin/folder/TextCnn_boe/TD-textcnn/data/COAE2015_tc/train.raw'
word_id_file = word_id_mapping
sentence_len = 100
type_ = ''
encoding = 'utf8'
tr_x, tr_sen_len, tr_y = load_inputs_twitter(input_file, word_id_file, sentence_len, type_='', encoding='utf8')
'''
class TextCNN(object):
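    """Word-level TextCNN for sentence sentiment classification.

    Sentences are mapped to pre-trained word embeddings, passed through parallel
    convolution + max-over-time pooling branches (one per filter size), and
    classified by a dropout-regularized softmax output layer.
    """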
    def __init__(self, embedding_size=100, max_sentence_len=100, num_classes=2,
                 num_filters=128, filter_sizes=[2, 3, 4], hidden_dim=128,
                 keep_prob=0.5, learning_rate=0.001, batch_size=8,
                 num_epochs=35, type_=''):
self.embedding_size = embedding_size
self.num_classes = num_classes
self.num_filters = num_filters
self.filter_sizes = filter_sizes
self.hidden_dim = hidden_dim
self.learning_rate = learning_rate
        self.max_sentence_len = max_sentence_len
self.batch_size = batch_size
self.num_epochs = num_epochs
self.type_ = type_
self.word_id_mapping, self.w2v = load_w2v(FLAGS.embedding_file_path, self.embedding_size)
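        # the pre-trained embedding matrix is wrapped in tf.constant, so it is not updated during training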
self.word_embedding = tf.constant(self.w2v, name='word_embedding')
# Placeholders for input, output and dropout
self.drop_keep_prob = tf.placeholder(tf.float32, name="keep_prob")
with tf.name_scope('inputs'):
self.input_x = tf.placeholder(tf.int32, [None, self.max_sentence_len], name="input_x")
self.input_y = tf.placeholder(tf.int32, [None, self.num_classes], name="input_y")
self.sen_len = tf.placeholder(tf.int32, None)
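            # sentence lengths are fed alongside the inputs but are not consumed by the CNN graph itself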
def cnn(self,input_x):
"""CNN模型"""
with tf.device('/cpu:0'), tf.name_scope("embedding"):
#W = tf.Variable(tf.random_uniform([self.vocab_size,self.embedding_size],-1.0,1.0),name="W")
#self.embedded_chars = tf.nn.embedding_lookup(W,input_x)
self.embedded_chars = tf.nn.embedding_lookup(self.word_embedding, input_x)
self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars,-1)
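            # shape: [batch_size, max_sentence_len, embedding_size, 1] -- a single channel for conv2d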
pooled_outputs = []
for i,filter_size in enumerate(self.filter_sizes):
with tf.name_scope("conv-maxpool-%s" %filter_size):
# Convolution Layer
filter_shape = [filter_size,self.embedding_size,1,self.num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape,stddev=0.1),name="W")
b = tf.Variable(tf.constant(0.1,shape=[self.num_filters]),name="b")
conv = tf.nn.conv2d(
self.embedded_chars_expanded,W,strides=[1,1,1,1],padding="VALID",
name="conv"
)
# Apply nonlinearity
h = tf.nn.relu(tf.nn.bias_add(conv,b),name="relu")
# Max-pooling over the outputs
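                # the pooling window spans all max_sentence_len - filter_size + 1 positions,
                # so each filter contributes a single max-activation value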
pooled = tf.nn.max_pool(
h,ksize=[1,self.max_sentence_len - filter_size +1,1,1],
strides=[1,1,1,1],padding="VALID",name="pool"
)
pooled_outputs.append(pooled)
# Combine all the pooled features
num_filters_total = self.num_filters * len(self.filter_sizes)
self.h_pool = tf.concat(pooled_outputs,3)
self.h_pool_flat = tf.reshape(self.h_pool,[-1,num_filters_total])
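        # h_pool_flat has shape [batch_size, num_filters * len(filter_sizes)]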
#dropout
with tf.name_scope("dropout"):
self.h_drop = tf.nn.dropout(self.h_pool_flat,self.drop_keep_prob)
        # final output: fully connected layer producing class scores
with tf.name_scope("output"):
W = tf.Variable(tf.truncated_normal([num_filters_total,self.num_classes],stddev=0.1),name="W")
b = tf.Variable(tf.constant(0.1,shape=[self.num_classes]),name="b")
scores = tf.nn.xw_plus_b(self.h_drop,W,b,name="scores")
y_pred_cls = tf.argmax(scores,1,name="prediction")
return scores,y_pred_cls
def run(self):
scores,y_pred_cls = self.cnn(self.input_x)
# CalculateMean cross-entropy loss
with tf.name_scope("loss"):
            # labels are cast to float32 as required by softmax_cross_entropy_with_logits
            losses = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=scores, labels=tf.cast(self.input_y, tf.float32)))
global_step = tf.Variable(0, name="tr_global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(losses,global_step=global_step)
        # Accuracy: `accuracy` is the *count* of correct predictions in a batch, so that
        # test accuracy can be accumulated across batches; `_acc` is the per-batch mean.
        with tf.name_scope("accuracy"):
            correct_pred = tf.equal(tf.argmax(scores, 1), tf.argmax(self.input_y, 1))
            accuracy = tf.reduce_sum(tf.cast(correct_pred, tf.float32))
            _acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
with tf.Session() as sess:
summary_loss = tf.summary.scalar('loss', losses)
            summary_acc = tf.summary.scalar('acc', _acc)
train_summary_op = tf.summary.merge([summary_loss, summary_acc])
validate_summary_op = tf.summary.merge([summary_loss, summary_acc])
test_summary_op = tf.summary.merge([summary_loss, summary_acc])
            import time
            timestamp = str(int(time.time()))
            _dir = ('logs/' + str(timestamp) + '_' + self.type_ + '_r' + str(self.learning_rate)
                    + '_b' + str(self.batch_size) + '_l' + str(FLAGS.l2_reg))
train_summary_writer = tf.summary.FileWriter(_dir + '/train', sess.graph)
test_summary_writer = tf.summary.FileWriter(_dir + '/test', sess.graph)
validate_summary_writer = tf.summary.FileWriter(_dir + '/validate', sess.graph)
print ('load train')
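            # inputs are loaded in the default (non-TD/TC) format: the full sentence with the
            # target phrase spliced back in, padded with zeros to max_sentence_len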
tr_x, tr_sen_len, tr_y = load_inputs_twitter(
FLAGS.train_file_path,
self.word_id_mapping,
self.max_sentence_len
)
print ('load test')
te_x, te_sen_len, te_y = load_inputs_twitter(
FLAGS.test_file_path,
self.word_id_mapping,
self.max_sentence_len
)
init = tf.global_variables_initializer()
sess.run(init)
max_acc = 0.
for i in range(self.num_epochs):
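                # one training epoch: every mini-batch is fed with dropout keep_prob=0.5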
for train, _ in self.get_batch_data(tr_x, tr_y, tr_sen_len, self.batch_size, 0.5):
_, step, summary = sess.run([optimizer, global_step, train_summary_op], feed_dict=train)
train_summary_writer.add_summary(summary, step)
acc, loss, cnt = 0., 0., 0
for test, num in self.get_batch_data(te_x, te_y, te_sen_len, 1000, 1.0):
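                    # evaluate with dropout disabled (keep_prob=1.0); accumulate the correct-prediction
                    # count and the loss weighted by the batch size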
_loss, _acc, summary = sess.run([losses, accuracy, test_summary_op], feed_dict=test)
acc += _acc
loss += _loss * num
cnt += num
                print(cnt)
                print(acc)
                test_summary_writer.add_summary(summary, step)
                print('Iter {}: test loss={:.6f}, test acc={:.6f}'.format(step, loss / cnt, acc / cnt))
                if acc / cnt > max_acc:
                    max_acc = acc / cnt
print ('Optimization Finished! Max acc={}'.format(max_acc))
print ('Learning_rate={}, iter_num={}, batch_size={}, hidden_num={}'.format(
self.learning_rate,
self.num_epochs,
self.batch_size,
self.hidden_dim
))
def get_batch_data(self, x, y, sen_len, batch_size, keep_prob):
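        # yield (feed_dict, actual_batch_size) pairs; batch_index (from utils) generates the index slices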
for index in batch_index(len(y), batch_size, 1):
feed_dict = {
self.input_x: x[index],
self.input_y: y[index],
self.sen_len: sen_len[index],
self.drop_keep_prob: keep_prob,
}
yield feed_dict, len(index)
def main(_):
textCNN = TextCNN(
embedding_size=FLAGS.embedding_size,
batch_size=FLAGS.batch_size,
hidden_dim=FLAGS.hidden_dim,
learning_rate=FLAGS.learning_rate,
        max_sentence_len=FLAGS.max_sentence_len,
num_classes=FLAGS.num_classes,
num_epochs=FLAGS.num_epochs,
type_=FLAGS.type
)
textCNN.run()
if __name__ == '__main__':
tf.app.run()
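
# Example invocation (the script name below is a placeholder; any of the FLAGS defined
# at the top can be overridden on the command line), e.g.:
#   python textcnn.py --batch_size 16 --num_epochs 20 --learning_rate 0.0005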