Why doesn't this work?

#!/usr/bin/env python
# encoding: utf-8

import time

import numpy as np
import tensorflow as tf

from utils import load_w2v, batch_index, load_inputs_twitter, load_word_id_mapping


FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('embedding_size', 100, 'dimension of word embedding')
tf.app.flags.DEFINE_integer('batch_size', 8, 'number of examples per batch')
tf.app.flags.DEFINE_integer('hidden_dim', 128, 'number of hidden units')
tf.app.flags.DEFINE_float('learning_rate', 0.001, 'learning rate')
tf.app.flags.DEFINE_integer('num_classes', 2, 'number of distinct classes')
tf.app.flags.DEFINE_integer('max_sentence_len', 100, 'max number of tokens per sentence')
tf.app.flags.DEFINE_float('l2_reg', 0.001, 'l2 regularization')
tf.app.flags.DEFINE_integer('display_step', 4, 'display test results every this many steps')
tf.app.flags.DEFINE_integer('num_epochs', 35, 'number of training epochs')

tf.app.flags.DEFINE_string('train_file_path', 'data/COAE2015/train.raw', 'training file')
tf.app.flags.DEFINE_string('validate_file_path', 'data/COAE2015/validate.raw', 'validating file')
tf.app.flags.DEFINE_string('test_file_path', 'data/COAE2015/test.raw', 'testing file')
tf.app.flags.DEFINE_string('embedding_file_path', 'data/COAE2015/COAE2015_word_embedding_partial_100.txt', 'embedding file')
tf.app.flags.DEFINE_string('word_id_file_path', 'data/COAE2015/word_id.txt', 'word-id mapping file')
tf.app.flags.DEFINE_string('type', '', "model type: '' (default), TD or TC")
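
# A hedged usage sketch (the script name td_textcnn.py and the flag values below are
# assumptions for illustration, not taken from the original post):
#
#     python td_textcnn.py --type TC --batch_size 16 --num_epochs 35
#
# Any flag left unset on the command line falls back to the defaults defined above.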



'''

def load_w2v(w2v_file, embedding_dim, is_skip=False):

    fp = open(w2v_file, 'r', encoding='utf-8',errors='ignore')
    if is_skip:
        fp.readline()
    w2v = []
    word_dict = dict()
    # [0,0,...,0] represent absent words
    w2v.append([0.] * embedding_dim)
    cnt = 0
    for line in fp:
        cnt += 1
        line = line.split()
        if len(line) != embedding_dim + 1:
            print (cnt)
            print (len(line))
            print ('a bad word embedding: {}'.format(line[0]))
            continue
        w2v.append([float(v) for v in line[1:]])
        word_dict[line[0]] = cnt
    w2v = np.asarray(w2v, dtype=np.float32)
    w2v = np.row_stack((w2v, np.sum(w2v, axis=0) / cnt))
    print (np.shape(w2v))
    word_dict['$t$'] = (cnt + 1)
    print (word_dict['$t$'], len(w2v))

    return word_dict, w2v
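
# Layout of the arrays returned above (a sketch with made-up numbers): for a 3-word
# vocabulary and embedding_dim = 100, w2v has shape (5, 100):
#   row 0              -> all-zero vector used for absent / unknown words (id 0)
#   rows 1..3 (1..cnt) -> the embeddings read from the file
#   row 4 (cnt + 1)    -> average of the loaded vectors, reserved for the '$t$' placeholder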


def load_inputs_twitter(input_file, word_id_file, sentence_len, type_='', encoding='utf8'):
    if type(word_id_file) is str:
        word_to_id = load_word_id_mapping(word_id_file)
    else:
        word_to_id = word_id_file
    print ('load word-to-id done!')

    x, y, sen_len = [], [], []
    x_r, sen_len_r = [], []
    target_words = []

    with open(input_file, 'r', encoding='utf-8',errors='ignore') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 3):

            target_word = lines[i + 1].lower().split()
            target_word = list(map(lambda w: word_to_id.get(w, 0), target_word))
            # map(func, seq) applies func to every element of seq; list() collects the results.
            target_words.append([target_word[0]])  # this is the line that fails
            y.append(lines[i + 2].strip().split()[0])
            words = lines[i].lower().split()
            words_l, words_r = [], []

            # split the sentence around the '$t$' placeholder, skipping the placeholder itself
            flag = True
            for word in words:
                if word == '$t$':
                    flag = False
                    continue
                if flag:
                    if word in word_to_id:
                        words_l.append(word_to_id[word])
                else:
                    if word in word_to_id:
                        words_r.append(word_to_id[word])
            if type_ == 'TD' or type_ == 'TC':
                words_l.extend(target_word)
                sen_len.append(len(words_l))
                x.append(words_l + [0] * (sentence_len - len(words_l)))
                tmp = target_word + words_r
                tmp.reverse()
                sen_len_r.append(len(tmp))
                x_r.append(tmp + [0] * (sentence_len - len(tmp)))
            else:
                words = words_l + target_word + words_r
                sen_len.append(len(words))
                x.append(words + [0] * (sentence_len - len(words)))

    y = change_y_to_onehot(y)
    if type_ == 'TD':
        return np.asarray(x), np.asarray(sen_len), np.asarray(x_r), \
               np.asarray(sen_len_r), np.asarray(y)
    elif type_ == 'TC':
        return np.asarray(x), np.asarray(sen_len), np.asarray(x_r), \
               np.asarray(sen_len_r), np.asarray(y), np.asarray(target_words)
    else:
        return np.asarray(x), np.asarray(sen_len), np.asarray(y)
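
# Record format expected by load_inputs_twitter (inferred from the indexing above; the
# concrete tokens are made-up examples, not taken from the original data files):
#   line i     : sentence with the target replaced by $t$, e.g. "the battery of $t$ is great"
#   line i + 1 : the target phrase,                        e.g. "this phone"
#   line i + 2 : the label,                                e.g. "1"
# Records come in blocks of three lines, hence range(0, len(lines), 3).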




w2v_file = '/home/liangtianxin/folder/TextCnn_boe/TD-textcnn/data/COAE2015_tc/COAE2015_word_embedding_partial_100.txt'

embedding_dim = 100

word_id_mapping, w2v = load_w2v(w2v_file, embedding_dim)


#input_file = '/home/liangtianxin/folder/TextCnn_boe/TD-textcnn/data/COAE2015_tc/test.raw'

input_file = '/home/liangtianxin/folder/TextCnn_boe/TD-textcnn/data/COAE2015_tc/train.raw'


word_id_file = word_id_mapping

sentence_len = 100
type_=''
encoding='utf8'

tr_x, tr_sen_len, tr_y = load_inputs_twitter(input_file, word_id_file, sentence_len, type_='', encoding='utf8')

'''
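
# --- Likely cause of the failure flagged above ("this is the line that fails") ---
# A hedged guess, not confirmed by the post: target_words.append([target_word[0]]) raises
# IndexError: list index out of range whenever lines[i + 1] is empty (e.g. a trailing blank
# line in train.raw, or a file whose records are not exact multiples of three lines).
# Minimal defensive sketch; the helper name and the fallback id 0 are assumptions:

def safe_first_target_id(target_line, word_to_id):
    """Return the id of the first target token, or 0 if the target line is empty."""
    tokens = target_line.lower().split()
    if not tokens:  # an empty target line would otherwise trigger the IndexError
        return 0
    return word_to_id.get(tokens[0], 0)  # unknown words fall back to id 0, matching load_w2v

# Inside load_inputs_twitter the failing line would then become:
#     target_words.append([safe_first_target_id(lines[i + 1], word_to_id)])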




class TextCNN(object):
    
    
    def __init__(self, embedding_size=100, max_sentence_len=100, num_classes=2,
                 num_filters=128, filter_sizes=[2, 3, 4], hidden_dim=128, keep_prob=0.5,
                 learning_rate=0.001, batch_size=8, num_epochs=35, type_=''):
        self.embedding_size = embedding_size
        self.num_classes = num_classes
        self.num_filters = num_filters
        self.filter_sizes = filter_sizes
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate
        self.max_sentence_len = max_sentence_len
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.type_ = type_
        
        self.word_id_mapping, self.w2v = load_w2v(FLAGS.embedding_file_path, self.embedding_size) 
        self.word_embedding = tf.constant(self.w2v, name='word_embedding')
        
        # Placeholders for input, output and dropout
        self.drop_keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        with tf.name_scope('inputs'):
            self.input_x = tf.placeholder(tf.int32, [None, self.max_sentence_len], name="input_x")  
            self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name="input_y")  # one-hot labels as float32, as required by softmax_cross_entropy_with_logits
            self.sen_len = tf.placeholder(tf.int32, None)
            

    def cnn(self,input_x):
        """CNN模型"""

        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            #W = tf.Variable(tf.random_uniform([self.vocab_size,self.embedding_size],-1.0,1.0),name="W")
            #self.embedded_chars = tf.nn.embedding_lookup(W,input_x) 
            self.embedded_chars = tf.nn.embedding_lookup(self.word_embedding, input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars,-1) 

        pooled_outputs = []
        for i,filter_size in enumerate(self.filter_sizes):
            with tf.name_scope("conv-maxpool-%s" %filter_size):
                # Convolution Layer
                filter_shape = [filter_size,self.embedding_size,1,self.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape,stddev=0.1),name="W")
                b = tf.Variable(tf.constant(0.1,shape=[self.num_filters]),name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,W,strides=[1,1,1,1],padding="VALID",
                    name="conv"
                )
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv,b),name="relu")
                # Max-pooling over the outputs
                pooled = tf.nn.max_pool(
                    h,ksize=[1,self.max_sentence_len - filter_size +1,1,1],
                    strides=[1,1,1,1],padding="VALID",name="pool"
                )
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = self.num_filters * len(self.filter_sizes)
        self.h_pool = tf.concat(pooled_outputs,3)
        self.h_pool_flat = tf.reshape(self.h_pool,[-1,num_filters_total])
        #dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat,self.drop_keep_prob) 
        # final output: fully connected layer producing class scores
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal([num_filters_total,self.num_classes],stddev=0.1),name="W")
            b = tf.Variable(tf.constant(0.1,shape=[self.num_classes]),name="b")

            scores = tf.nn.xw_plus_b(self.h_drop,W,b,name="scores")
            y_pred_cls = tf.argmax(scores,1,name="prediction")  

        return scores,y_pred_cls
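
    # Shape walk-through for cnn() with the default hyper-parameters (batch size B is
    # symbolic; the numbers follow from max_sentence_len=100, embedding_size=100,
    # num_filters=128, filter_sizes=[2, 3, 4]):
    #   embedded_chars_expanded         : [B, 100, 100, 1]
    #   conv, filter_size=3, VALID      : [B, 98, 1, 128]
    #   max_pool, ksize 100 - 3 + 1     : [B, 1, 1, 128]
    #   h_pool (concat over 3 branches) : [B, 1, 1, 384]
    #   h_pool_flat                     : [B, 384]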

    def run(self):

        scores,y_pred_cls = self.cnn(self.input_x)
        # CalculateMean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.input_y))
            global_step = tf.Variable(0, name="tr_global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(losses,global_step=global_step)
        # Accuracy
        with tf.name_scope("accuracy"):
            correct_pred = tf.equal(tf.argmax(scores, 1), tf.argmax(self.input_y, 1))
            accuracy = tf.reduce_sum(tf.cast(correct_pred, tf.float32))  # count of correct predictions in the batch; divided by cnt below
            _acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))     # per-batch mean accuracy (currently unused)
            
        with tf.Session() as sess:
            summary_loss = tf.summary.scalar('loss', losses)
            summary_acc = tf.summary.scalar('acc', accuracy)
            train_summary_op = tf.summary.merge([summary_loss, summary_acc])
            validate_summary_op = tf.summary.merge([summary_loss, summary_acc])
            test_summary_op = tf.summary.merge([summary_loss, summary_acc])
            timestamp = str(int(time.time()))
            _dir = ('logs/' + timestamp + '_' + self.type_ + '_r' + str(self.learning_rate)
                    + '_b' + str(self.batch_size) + '_l' + str(FLAGS.l2_reg))
            train_summary_writer = tf.summary.FileWriter(_dir + '/train', sess.graph)
            test_summary_writer = tf.summary.FileWriter(_dir + '/test', sess.graph)
            validate_summary_writer = tf.summary.FileWriter(_dir + '/validate', sess.graph)
            
            print ('load train')
            tr_x, tr_sen_len, tr_y = load_inputs_twitter(
                FLAGS.train_file_path,
                self.word_id_mapping,
                self.max_sentence_len
            )
            print ('load test')
            te_x, te_sen_len, te_y = load_inputs_twitter(
                FLAGS.test_file_path,
                self.word_id_mapping,
                self.max_sentence_len
            )

            init = tf.global_variables_initializer()
            sess.run(init)

            max_acc = 0.
            for i in range(self.num_epochs):
                for train, _ in self.get_batch_data(tr_x, tr_y, tr_sen_len, self.batch_size, 0.5):
                    _, step, summary = sess.run([optimizer, global_step, train_summary_op], feed_dict=train)
                    train_summary_writer.add_summary(summary, step)

                acc, loss, cnt = 0., 0., 0
                
                for test, num in self.get_batch_data(te_x, te_y, te_sen_len, 1000, 1.0):
                    _loss, _acc, summary = sess.run([losses, accuracy, test_summary_op], feed_dict=test)
                    acc += _acc
                    loss += _loss * num
                    cnt += num
                     
                print('test correct predictions: {} / {}'.format(acc, cnt))
                test_summary_writer.add_summary(summary, step)
                print('Iter {}: mini-batch loss={:.6f}, test acc={:.6f}'.format(step, loss / cnt, acc / cnt))
                if acc / cnt > max_acc:
                    max_acc = acc / cnt
                
                  
            print ('Optimization Finished! Max acc={}'.format(max_acc))

            print ('Learning_rate={}, iter_num={}, batch_size={}, hidden_num={}'.format(
                self.learning_rate,
                self.num_epochs,
                self.batch_size,
                self.hidden_dim
            ))

    def get_batch_data(self, x, y, sen_len, batch_size, keep_prob):
        for index in batch_index(len(y), batch_size, 1):
            feed_dict = {
                self.input_x: x[index],
                self.input_y: y[index],
                self.sen_len: sen_len[index],
                self.drop_keep_prob: keep_prob,
            }
            yield feed_dict, len(index)



def main(_):
    textCNN = TextCNN(
        embedding_size=FLAGS.embedding_size,
        batch_size=FLAGS.batch_size,
        hidden_dim=FLAGS.hidden_dim,
        learning_rate=FLAGS.learning_rate,
        max_sentence_len=FLAGS.max_sentence_len,
        num_classes=FLAGS.num_classes,
        num_epochs=FLAGS.num_epochs,
        type_=FLAGS.type
    )
    textCNN.run()


if __name__ == '__main__':
    tf.app.run()


 
