Emotion-Cause Pair Extraction: A New Task to Emotion Analysis in Texts (Code Notes)

Part 1. Loading the corpus and pre-trained word vectors (load_w2v)

Step 1. Load the training corpus (clause_keywords.csv) and collect all Chinese words appearing in its emotion and clause fields into words
    words = []
    inputFile1 = open(train_file_path, 'r', encoding='utf-8')
    for line in inputFile1.readlines():
        line = line.strip().split(',')
        emotion, clause = line[2], line[-1]
        words.extend([emotion] + clause.split())

(Screenshot omitted: sample rows of clause_keywords.csv)

Step 2. De-duplicate words and build the word-to-id and id-to-word index dictionaries
    words = set(words)  # set of all unique words
    word_idx = dict((c, k + 1) for k, c in enumerate(words))  # word -> index (index 0 is reserved for padding)
    word_idx_rev = dict((k + 1, c) for k, c in enumerate(words))  # index -> word

(Screenshots omitted: word_idx and word_idx_rev)

Step 3. Load the word2vec file (w2v_200.txt) and build w2v, a dictionary mapping each word to its pre-trained vector
    w2v = {}
    inputFile2 = open(embedding_path, 'r', encoding='utf-8')
    inputFile2.readline()  # skip the first line of the file
    for line in inputFile2.readlines():
        line = line.strip().split(' ')
        w, ebd = line[0], line[1:]
        w2v[w] = ebd

(Screenshot omitted: the w2v_200.txt file)

(Screenshot omitted: the resulting w2v dictionary)

Step 4. Convert the de-duplicated corpus words into vectors. For each word in words, use its vector from w2v if present; otherwise draw a 200-dimensional vector uniformly from [-0.1, 0.1]
    embedding = [list(np.zeros(embedding_dim))]  # row 0 is the all-zero padding vector
    hit = 0
    for item in words:
        if item in w2v:
            vec = list(map(float, w2v[item]))
            hit += 1
        else:
            vec = list(np.random.rand(embedding_dim) / 5. - 0.1)  # uniform sample from [-0.1, 0.1)
        embedding.append(vec)
    print('w2v_file: {}\nall_words: {} hit_words: {}'.format(embedding_path, len(words), hit))

(Screenshot omitted: the embedding matrix)

Step 5. Initialize the position embedding matrix
    embedding_pos = [list(np.zeros(embedding_dim_pos))]  # row 0 is the all-zero padding vector
    embedding_pos.extend([list(np.random.normal(loc=0.0, scale=0.1, size=embedding_dim_pos)) for i in range(200)])  # 200 position vectors drawn from a normal distribution with std 0.1

    embedding, embedding_pos = np.array(embedding), np.array(embedding_pos)

    print("embedding.shape: {} embedding_pos.shape: {}".format(embedding.shape, embedding_pos.shape))
    print("load embedding done!\n")

(Screenshot omitted: the printed shapes of embedding and embedding_pos)
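The notes never show load_w2v's full signature. A plausible wrapper around Steps 1-5 (the parameter order, return values, and example arguments below are my assumptions, inferred from how the variables are used later, not the repo's exact code):

def load_w2v(embedding_dim, embedding_dim_pos, train_file_path, embedding_path):
    # ... Steps 1-5 above ...
    return word_idx_rev, word_idx, embedding, embedding_pos

# Example call (file names as mentioned in these notes):
word_idx_rev, word_id_mapping, word_embedding, pos_embedding = load_w2v(
    FLAGS.embedding_dim, FLAGS.embedding_dim_pos, 'clause_keywords.csv', 'w2v_200.txt')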

Part 2. Building the model (build_model)

Step 1. Convert the input x to word embeddings (embedding lookup) to obtain inputs
    x = tf.nn.embedding_lookup(word_embedding, x) ## (?,75,30,200)
    inputs = tf.reshape(x, [-1, FLAGS.max_sen_len, FLAGS.embedding_dim])  ## (?,30,200)
    inputs = tf.nn.dropout(inputs, keep_prob=keep_prob1)
Step 2. Run inputs through a word-level BiLSTM (the RNN call refers to the biLSTM function defined below) and compute word-level attention
with tf.name_scope('word_encode'):
    inputs = RNN(inputs, sen_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'word_layer' + name)
with tf.name_scope('word_attention'):
    sh2 = 2 * FLAGS.n_hidden
    w1 = get_weight_varible('word_att_w1' + name, [sh2, sh2])
    b1 = get_weight_varible('word_att_b1' + name, [sh2])
    w2 = get_weight_varible('word_att_w2' + name, [sh2, 1])
    s = att_var(inputs, sen_len, w1, b1, w2)
s = tf.reshape(s, [-1, FLAGS.max_doc_len, 2 * FLAGS.n_hidden])
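get_weight_varible is not defined in these notes; it is presumably a thin wrapper around tf.get_variable. A minimal sketch (the initializer and its range are my assumptions, not the repo's exact code):

def get_weight_varible(name, shape):
    # Uniform random initialization; the exact initializer used in the repo may differ.
    return tf.get_variable(name, initializer=tf.random_uniform(shape, -0.01, 0.01))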
def biLSTM(inputs, length, n_hidden, scope):
    ''' 
    input shape:[batch_size, max_len, embedding_dim]
    length shape:[batch_size]
    return shape:[batch_size, max_len, n_hidden*2]
    '''
    outputs, state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=tf.contrib.rnn.LSTMCell(n_hidden),
        cell_bw=tf.contrib.rnn.LSTMCell(n_hidden),
        inputs=inputs,
        sequence_length=length,
        dtype=tf.float32,
        scope=scope
    )

    return tf.concat(outputs, 2)
def att_var(inputs, length, w1, b1, w2):
    ''' 
    input shape:[batch_size, max_len, n_hidden]
    length shape:[batch_size]
    return shape:[batch_size, n_hidden]
    '''
    max_len, n_hidden = (tf.shape(inputs)[1], tf.shape(inputs)[2])
    tmp = tf.reshape(inputs, [-1, n_hidden])
    u = tf.tanh(tf.matmul(tmp, w1) + b1)
    alpha = tf.reshape(tf.matmul(u, w2), [-1, 1, max_len])
    alpha = softmax_by_length(alpha, length)
    return tf.reshape(tf.matmul(alpha, inputs), [-1, n_hidden])
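att_var relies on softmax_by_length, which normalizes the attention scores with a softmax restricted to valid (non-padded) positions. It is not listed in these notes; the sketch below is my own TF 1.x version consistent with the shapes documented above, not necessarily the repo's exact code:

def softmax_by_length(inputs, length):
    '''
    inputs shape:[batch_size, 1, max_len]   (raw attention scores)
    length shape:[batch_size]
    return shape:[batch_size, 1, max_len]   (normalized over the first `length` positions)
    '''
    inputs = tf.exp(tf.cast(inputs, tf.float32))
    max_len = tf.shape(inputs)[2]
    mask = tf.cast(tf.sequence_mask(tf.reshape(length, [-1]), maxlen=max_len), tf.float32)
    inputs *= tf.reshape(mask, tf.shape(inputs))  # zero out padded positions
    _sum = tf.reduce_sum(inputs, axis=2, keepdims=True) + 1e-9  # avoid division by zero
    return inputs / _sum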
Step 3. Feed the clause representations through another (clause-level) BiLSTM, then a softmax layer produces the predictions. Here get_s wraps the word-level encoding and attention of Steps 1-2 and returns the clause vectors s.

## cause prediction
s = get_s(inputs, name='cause_word_encode')
s = RNN(s, doc_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'cause_sentence_layer')
with tf.name_scope('sequence_prediction'):
    s1 = tf.reshape(s, [-1, 2 * FLAGS.n_hidden])
    s1 = tf.nn.dropout(s1, keep_prob=keep_prob2)

    w_cause = get_weight_varible('softmax_w_cause', [2 * FLAGS.n_hidden, FLAGS.n_class])
    b_cause = get_weight_varible('softmax_b_cause', [FLAGS.n_class])
    pred_cause = tf.nn.softmax(tf.matmul(s1, w_cause) + b_cause)
    pred_cause = tf.reshape(pred_cause, [-1, FLAGS.max_doc_len, FLAGS.n_class])

## emotion prediction
s = get_s(inputs, name='pos_word_encode')
s = RNN(s, doc_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'pos_sentence_layer')
with tf.name_scope('sequence_prediction'):
    s1 = tf.reshape(s, [-1, 2 * FLAGS.n_hidden])
    s1 = tf.nn.dropout(s1, keep_prob=keep_prob2)

    w_pos = get_weight_varible('softmax_w_pos', [2 * FLAGS.n_hidden, FLAGS.n_class])
    b_pos = get_weight_varible('softmax_b_pos', [FLAGS.n_class])
    pred_pos = tf.nn.softmax(tf.matmul(s1, w_pos) + b_pos)
    pred_pos = tf.reshape(pred_pos, [-1, FLAGS.max_doc_len, FLAGS.n_class])
Step 4. Compute the L2 regularization loss over the emotion and cause softmax weights and biases
reg = tf.nn.l2_loss(w_cause) + tf.nn.l2_loss(b_cause)
reg += tf.nn.l2_loss(w_pos) + tf.nn.l2_loss(b_pos)
Step 5. Compute the total loss (emotion + cause + L2) and create the Adam optimizer. Each cross-entropy term is summed over all clauses and divided by the number of valid clauses; padded clauses have all-zero labels, so they contribute nothing to the sum.
pred_pos, pred_cause, reg = build_model(word_embedding, x, sen_len, doc_len, keep_prob1, keep_prob2, y_position, y_cause)
valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
loss_pos = - tf.reduce_sum(y_position * tf.log(pred_pos)) / valid_num  ## emotion loss
loss_cause = - tf.reduce_sum(y_cause * tf.log(pred_cause)) / valid_num  ## cause loss
loss_op = loss_cause * FLAGS.cause + loss_pos * FLAGS.pos + reg * FLAGS.l2_reg  ## total loss
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate).minimize(loss_op)  ## optimizer
Step 6. Decode the results
true_y_cause_op = tf.argmax(y_cause, 2)  ## ground-truth cause labels
pred_y_cause_op = tf.argmax(pred_cause, 2)  ## predicted cause labels
true_y_pos_op = tf.argmax(y_position, 2)  ## ground-truth emotion labels
pred_y_pos_op = tf.argmax(pred_pos, 2)  ## predicted emotion labels

Part 3. Model training and evaluation (10-fold cross-validation, 15 epochs per fold)

Step 1. Load the training and test sets
train_file_name = 'fold{}_train.txt'.format(fold)
test_file_name = 'fold{}_test.txt'.format(fold)
tr_doc_id, tr_y_position, tr_y_cause, tr_y_pairs, tr_x, tr_sen_len, tr_doc_len = load_data(
    'data_combine/' + train_file_name, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)
te_doc_id, te_y_position, te_y_cause, te_y_pairs, te_x, te_sen_len, te_doc_len = load_data(
    'data_combine/' + test_file_name, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)
def load_data(input_file, word_idx, max_doc_len=75, max_sen_len=45):
    print('load data_file: {}'.format(input_file))
    y_position, y_cause, y_pairs, x, sen_len, doc_len = [], [], [], [], [], []
    doc_id = []

    n_cut = 0
    inputFile = open(input_file, 'r', encoding='utf-8')
    while True:
        line = inputFile.readline()
        if line == '': break
        line = line.strip().split()
        doc_id.append(line[0])
        d_len = int(line[1])
        pairs = eval('[' + inputFile.readline().strip() + ']')
        doc_len.append(d_len)
        y_pairs.append(pairs)
        pos, cause = zip(*pairs)
        # Per-document tensors: one-hot emotion/cause labels, clause lengths, word ids
        y_po, y_ca = np.zeros((max_doc_len, 2)), np.zeros((max_doc_len, 2))
        sen_len_tmp = np.zeros(max_doc_len, dtype=np.int32)
        x_tmp = np.zeros((max_doc_len, max_sen_len), dtype=np.int32)
        for i in range(d_len):
            y_po[i][int(i + 1 in pos)] = 1    # column 1 is set if clause i+1 is an emotion clause
            y_ca[i][int(i + 1 in cause)] = 1  # column 1 is set if clause i+1 is a cause clause
            words = inputFile.readline().strip().split(',')[-1]
            sen_len_tmp[i] = min(len(words.split()), max_sen_len)
            for j, word in enumerate(words.split()):
                if j >= max_sen_len:
                    n_cut += 1  # count clauses truncated to max_sen_len
                    break
                x_tmp[i][j] = int(word_idx[word])

        y_position.append(y_po)
        y_cause.append(y_ca)
        x.append(x_tmp)
        sen_len.append(sen_len_tmp)

    y_position, y_cause, x, sen_len, doc_len = map(np.array, [y_position, y_cause, x, sen_len, doc_len])
    for var in ['y_position', 'y_cause', 'x', 'sen_len', 'doc_len']:
        print('{}.shape {}'.format(var, eval(var).shape))
    print('n_cut {}'.format(n_cut))
    print('load data done!\n')
    return doc_id, y_position, y_cause, y_pairs, x, sen_len, doc_len
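For reference, each document block that load_data parses looks roughly like this (an illustrative reconstruction from the parsing code above, not an actual excerpt from the fold files): the first line gives the document id and the number of clauses, the second line the emotion-cause pair(s), then one comma-separated line per clause whose last field is the whitespace-segmented clause text.

<doc_id> 3
(2,3)
1, <field>, <field>, word word word
2, <field>, <field>, word word word
3, <field>, <field>, word word word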
Step 2. Train the model
# train
for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, FLAGS.keep_prob1, FLAGS.keep_prob2,
                                tr_y_position, tr_y_cause, FLAGS.batch_size):
    _, loss, pred_y_cause, true_y_cause, pred_y_pos, true_y_pos, doc_len_batch = sess.run(
        [optimizer, loss_op, pred_y_cause_op, true_y_cause_op, pred_y_pos_op, true_y_pos_op, doc_len],
        feed_dict=dict(zip(placeholders, train)))
    if step % 10 == 0:
        print('step {}: train loss {:.4f} '.format(step, loss))
        acc, p, r, f1 = acc_prf(pred_y_cause, true_y_cause, doc_len_batch)
        print('cause_predict: train acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
        acc, p, r, f1 = acc_prf(pred_y_pos, true_y_pos, doc_len_batch)
        print('position_predict: train acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
    step = step + 1
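acc_prf, which computes the metrics above (and in the test step below) over the valid clauses only, is not listed in these notes. A compatible sketch using scikit-learn (my own; the repo's version may differ in details):

from sklearn.metrics import precision_score, recall_score, f1_score

def acc_prf(pred_y, true_y, doc_len, average='binary'):
    # Flatten predictions and labels, keeping only each document's first doc_len clauses.
    y_pred, y_true = [], []
    for i in range(pred_y.shape[0]):
        for j in range(doc_len[i]):
            y_pred.append(pred_y[i][j])
            y_true.append(true_y[i][j])
    y_pred, y_true = np.array(y_pred), np.array(y_true)
    acc = precision_score(y_true, y_pred, average='micro')  # micro-averaged precision equals accuracy here
    p = precision_score(y_true, y_pred, average=average)
    r = recall_score(y_true, y_pred, average=average)
    f1 = f1_score(y_true, y_pred, average=average)
    return acc, p, r, f1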
Step 3. Evaluate the model on the test set
# test
test = [te_x, te_sen_len, te_doc_len, 1., 1., te_y_position, te_y_cause]
loss, pred_y_cause, true_y_cause, pred_y_pos, true_y_pos, doc_len_batch = sess.run(
    [loss_op, pred_y_cause_op, true_y_cause_op, pred_y_pos_op, true_y_pos_op, doc_len],
    feed_dict=dict(zip(placeholders, test)))
print('\nepoch {}: test loss {:.4f} cost time: {:.1f}s\n'.format(i, loss, time.time() - start_time))

acc, p, r, f1 = acc_prf(pred_y_cause, true_y_cause, doc_len_batch)
result_avg_cause = [acc, p, r, f1]
if f1 > max_f1_cause:
    max_acc_cause, max_p_cause, max_r_cause, max_f1_cause = acc, p, r, f1
print('cause_predict: test acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
print('max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(max_acc_cause, max_p_cause,
                                                                        max_r_cause, max_f1_cause))

acc, p, r, f1 = acc_prf(pred_y_pos, true_y_pos, doc_len_batch)
result_avg_pos = [acc, p, r, f1]
if f1 > max_f1_pos:
    max_acc_pos, max_p_pos, max_r_pos, max_f1_pos = acc, p, r, f1
print('position_predict: test acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
print('max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(
    max_acc_pos, max_p_pos, max_r_pos, max_f1_pos))

if (result_avg_cause[-1] + result_avg_pos[-1]) / 2. > max_f1_avg:
    max_f1_avg = (result_avg_cause[-1] + result_avg_pos[-1]) / 2.
    result_avg_cause_max = result_avg_cause
    result_avg_pos_max = result_avg_pos

    te_pred_y_cause, te_pred_y_pos = pred_y_cause, pred_y_pos
    tr_pred_y_cause, tr_pred_y_pos = [], []
    for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, 1., 1., tr_y_position, tr_y_cause, 200,
                                    test=True):
        pred_y_cause, pred_y_pos = sess.run([pred_y_cause_op, pred_y_pos_op],
                                            feed_dict=dict(zip(placeholders, train)))
        tr_pred_y_cause.extend(list(pred_y_cause))
        tr_pred_y_pos.extend(list(pred_y_pos))
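get_batch_data, used in the training loop and in the snippet above, is also not listed. A minimal sketch consistent with those call sites (assumed, not the repo's exact code): it yields feed lists in the same order as placeholders, shuffles documents in training mode, and walks the whole set in order when test=True.

def get_batch_data(x, sen_len, doc_len, keep_prob1, keep_prob2, y_position, y_cause, batch_size, test=False):
    index = np.arange(len(x))
    if not test:
        np.random.shuffle(index)  # reshuffle documents every epoch during training
    for start in range(0, len(x), batch_size):
        idx = index[start:start + batch_size]
        feed_list = [x[idx], sen_len[idx], doc_len[idx], keep_prob1, keep_prob2,
                     y_position[idx], y_cause[idx]]
        yield feed_list, len(idx)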
print('Average max cause: max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}'.format(
    result_avg_cause_max[0], result_avg_cause_max[1], result_avg_cause_max[2], result_avg_cause_max[3]))
print('Average max pos: max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(
    result_avg_pos_max[0], result_avg_pos_max[1], result_avg_pos_max[2], result_avg_pos_max[3]))
Step 4. Write the results back to files and print them
def get_pair_data(file_name, doc_id, doc_len, y_pairs, pred_y_cause, pred_y_pos, x, sen_len, word_idx_rev):
    g = open(file_name, 'w', encoding='utf-8')
    for i in range(len(doc_id)):
        g.write(doc_id[i] + ' ' + str(doc_len[i]) + '\n')
        g.write(str(y_pairs[i]) + '\n')
        for j in range(doc_len[i]):
            clause = ''
            for k in range(sen_len[i][j]):
                clause = clause + word_idx_rev[x[i][j][k]] + ' '
            g.write(str(j + 1) + ', ' + str(pred_y_pos[i][j]) + ', ' + str(
                pred_y_cause[i][j]) + ', ' + clause + '\n')
    print(f"write {file_name} done")

get_pair_data(save_dir + test_file_name, te_doc_id, te_doc_len, te_y_pairs, te_pred_y_cause, te_pred_y_pos,
                te_x, te_sen_len, word_idx_rev)
get_pair_data(save_dir + train_file_name, tr_doc_id, tr_doc_len, tr_y_pairs, tr_pred_y_cause, tr_pred_y_pos,
                tr_x, tr_sen_len, word_idx_rev)

print('Optimization Finished!\n')
print('############# fold {} end ###############'.format(fold))
# fold += 1
acc_cause_list.append(result_avg_cause_max[0])
p_cause_list.append(result_avg_cause_max[1])
r_cause_list.append(result_avg_cause_max[2])
f1_cause_list.append(result_avg_cause_max[3])
acc_pos_list.append(result_avg_pos_max[0])
p_pos_list.append(result_avg_pos_max[1])
r_pos_list.append(result_avg_pos_max[2])
f1_pos_list.append(result_avg_pos_max[3])
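The eight lists above collect one result per fold; the final aggregation is not shown in these notes, but it would typically just be the mean over the 10 folds, e.g.:

print('cause   (10-fold avg): acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(
    np.mean(acc_cause_list), np.mean(p_cause_list), np.mean(r_cause_list), np.mean(f1_cause_list)))
print('emotion (10-fold avg): acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(
    np.mean(acc_pos_list), np.mean(p_pos_list), np.mean(r_pos_list), np.mean(f1_pos_list)))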