# Transition-based dependency parsing with tf.estimator.Estimator
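# A greedy transition-based parser trained with the tf.estimator API: each parser
# configuration is encoded as WORD_NUM words plus WORD_NUM POS tags, and a
# feed-forward classifier predicts the next transition (label) to apply.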
import numpy as np
import math
import tensorflow as tf
import cjdpy  # helper library used here for loading list and CSV files
WORD_NUM = 6
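# features["x"] packs 2*WORD_NUM ids per example:
# columns [0, WORD_NUM) are word ids, columns [WORD_NUM, 2*WORD_NUM) are POS-tag ids.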
def model_fn(features, labels, mode):
    # Randomly initialized alternatives to the pretrained embeddings:
    # word_lookup = tf.truncated_normal(shape=[14871, 100], mean=0.0, stddev=1.0, dtype=tf.float32)
    # pos_lookup = tf.truncated_normal(shape=[79, 100], mean=0.0, stddev=1.0, dtype=tf.float32)
    # POS embeddings are fine-tuned; the pretrained word embeddings stay frozen.
    pos_lookup = tf.Variable(np.asarray(posEmbeddingMatrix, dtype=np.float32), trainable=True)
    word_lookup = tf.Variable(np.asarray(wordEmbeddingMatrix, dtype=np.float32), trainable=False)
print("feature x.shape: ", features["x"].shape)
out1 = tf.nn.embedding_lookup(word_lookup, features["x"][:, 0:WORD_NUM])
out2 = tf.nn.embedding_lookup(pos_lookup, features["x"][:, WORD_NUM:2*WORD_NUM])
print(out1)
print(out2)
    # Concatenate all features into a single vector per example
    out3 = tf.concat([tf.layers.flatten(out1), tf.layers.flatten(out2)], axis=1)
    out4 = tf.layers.dense(out3, 500, activation=tf.nn.relu)
    logits = tf.layers.dense(out4, LABEL_SIZE)
    # Variant: LSTM over the concatenated word/POS embeddings
    # out3 = tf.concat([out1, out2], axis=2)
    # out4 = tf.keras.layers.LSTM(256)(out3)
    # out5 = tf.layers.dense(tf.layers.flatten(out4), 500, activation=tf.nn.relu)
    # logits = tf.layers.dense(out5, LABEL_SIZE)
    # Variant: POS features only
    # out3 = tf.layers.dense(tf.layers.flatten(out2), 200, activation=tf.nn.relu)
    # logits = tf.layers.dense(out3, LABEL_SIZE)
    # Variant: POS features only + LSTM
    # out3 = tf.keras.layers.LSTM(64)(out2)
    # out4 = tf.layers.dense(tf.layers.flatten(out3), 200, activation=tf.nn.relu)
    # logits = tf.layers.dense(out4, LABEL_SIZE)
    if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
        loss_ori = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        # Weighted loss (uses the per-example class weights computed below):
        # loss = tf.reduce_mean(tf.multiply(features["weight"], loss_ori))
        loss = tf.reduce_mean(loss_ori)
        # Fetch the global training step
        global_step = tf.train.get_global_step()
        # Plain gradient descent with a learning rate of 0.01
        optimizer = tf.train.GradientDescentOptimizer(0.01)
        # Group the weight update and the global-step increment into a single op,
        # so both always run together, like a small transaction
        train = tf.group(optimizer.minimize(loss), tf.assign_add(global_step, 1))
        # Wrap everything into the EstimatorSpec that tf.estimator.Estimator expects
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train)
if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {  # used by predict() and by the exported SavedModel
"y": tf.argmax(logits, axis=1)
}
print("tf.argmax(logits, axis=1): ", tf.argmax(logits, axis=1))
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions)
def serving_input_fn():
x = tf.placeholder(tf.int64, [None, 2*WORD_NUM], name='x')
input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
'x': x,
})()
return input_fn
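# serving_input_fn is only needed by the (commented-out) export_savedmodel call at the end.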
wordVocab = []
posVocab = []
label = []
x_train_pre = []
y_train_pre = []
train_weight = []
eval_weight = []
# vocab
# wordVocab = cjdpy.load_list("data/wordVocab.txt")
data = cjdpy.load_csv("data/train.txt")
posVocab = cjdpy.load_list("data/posVocab.txt")
label = list(set([item[-1] for item in data]))
LABEL_SIZE = len(label)
label.sort()
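# Sorting makes the label -> id mapping deterministic across runs.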
# print("word vocab size: ", len(wordVocab))
print("pos vocab size: ", len(posVocab))
print("label size: ", len(label))
# w2id = {wordVocab[i]: i for i in range(len(wordVocab))}
p2id = {posVocab[i]: i for i in range(len(posVocab))}
label2id = {label[i]: i for i in range(len(label))}
def create_class_weight(labels_dict, mu=0.15):
    total = sum(labels_dict.values())
    print(total)
    class_weight = dict()
    for key in labels_dict:
        score = math.log(mu * total / float(labels_dict[key]))
        # Rare labels keep their (higher) score; frequent labels are clipped to 1.0
        class_weight[key] = score if score > 1.0 else 1.0
    return class_weight
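# Worked example with hypothetical counts: labels_dict = {"shift": 900, "left-arc": 50, "right-arc": 50}
# gives total = 1000; "shift": log(0.15 * 1000 / 900) ≈ -1.79, clipped to 1.0;
# "left-arc"/"right-arc": log(0.15 * 1000 / 50) = log(3) ≈ 1.10, so rare labels are up-weighted.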
labels_dict = {}
for item in data:
    labels_dict[item[-1]] = labels_dict.get(item[-1], 0) + 1
class_weight = create_class_weight(labels_dict)
print("class weight: ", class_weight)
# load embedding
posWord2vec = cjdpy.load_csv("posWord2vec.txt", ' ')
posWord2vec_dict = {}
for item in posWord2vec:
posWord2vec_dict[item[0]] = [float(x) for x in item[1:]]
posEmbeddingMatrix = []
for pos in posVocab:
if pos in posWord2vec_dict:
posEmbeddingMatrix.append(posWord2vec_dict[pos])
    else:
        # POS tags missing from the word2vec file fall back to a zero vector
        posEmbeddingMatrix.append([0.0] * 100)
wordEmbedding = cjdpy.load_csv("word_embedding", ' ')
wordVocab = ['UNK', 'PAD']
wordEmbeddingMatrix = []
wordEmbeddingMatrix.append([0.0] * 300)  # UNK
wordEmbeddingMatrix.append([0.0] * 300)  # PAD
for i, item in enumerate(wordEmbedding):
    if i == 0: continue  # skip the first line (presumably the word2vec "count dim" header)
    wordEmbeddingMatrix.append([float(x) for x in item[1:]])
    wordVocab.append(item[0])
print("word vocab size: ", len(wordVocab))
w2id = {wordVocab[i]: i for i in range(len(wordVocab))}
# make trainX, trainY, evalX, evalY
for item in data:
    tmp = []
    for i in range(WORD_NUM):
        tmp.append(w2id.get(item[i], w2id['UNK']))
    for i in range(WORD_NUM, 2 * WORD_NUM):
        tmp.append(p2id.get(item[i], p2id['UNK']))
    x_train_pre.append(tmp)
    y_train_pre.append(label2id[item[-1]])
    train_weight.append(class_weight[item[-1]])
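# Note: train_weight/eval_weight are only consumed if the weighted loss in model_fn is re-enabled.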
x_train = np.array(x_train_pre)
y_train = np.array(y_train_pre)
print("trainX examples", x_train_pre[:5])
print("trainY examples ", y_train_pre[:5])
print("trainX shape: ", x_train.shape)
print("trainY shape: ", y_train.shape)
data = cjdpy.load_csv("data/dev.txt")
x_dev_pre = []
y_dev_pre = []
for item in data:
    tmp = []
    for i in range(WORD_NUM):
        tmp.append(w2id.get(item[i], w2id['UNK']))
    for i in range(WORD_NUM, 2 * WORD_NUM):
        tmp.append(p2id.get(item[i], p2id['UNK']))
    x_dev_pre.append(tmp)
    y_dev_pre.append(label2id[item[-1]])
    eval_weight.append(class_weight[item[-1]])
x_eval = np.array(x_dev_pre)
y_eval = np.array(y_dev_pre)
tf.logging.set_verbosity(tf.logging.INFO)
# input_fn
train_input_fn = tf.estimator.inputs.numpy_input_fn({"x": x_train, "weight": np.array(train_weight, np.float32)}, y_train, batch_size=32, num_epochs=None, shuffle=True)
eval_input_fn = tf.estimator.inputs.numpy_input_fn({"x": x_eval, "weight": np.array(eval_weight, np.float32)}, y_eval, batch_size=32, shuffle=False)
# Evaluation only runs when a checkpoint is saved, so save_checkpoints_steps sets the eval cadence
run_config = tf.estimator.RunConfig(save_checkpoints_steps=2000, keep_checkpoint_max=5, log_step_count_steps=4000)
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="output", config=run_config)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=30000)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None, start_delay_secs=0, throttle_secs=0)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
res = list(estimator.predict(input_fn=eval_input_fn))
out = [label[item['y']] for item in res]
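# Accuracy is measured over non-"shift" predictions: "shift" is typically by far the
# most frequent transition, so counting it would inflate the score.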
cnt, total = 0, 0
for i, item in enumerate(res):
# if y_dev_pre[i] == label2id["shift"]:
# continue
if item['y'] == label2id["shift"]:
continue
if y_dev_pre[i] == item['y']:
cnt += 1
total += 1
print(cnt, total)
print("acc: ", 1.0*cnt/total)
# Print the first 100 predicted transition labels
for i, o in enumerate(out[:100]):
    print(i, o)
# estimator.export_savedmodel("output/model_fn", serving_input_fn)
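# A minimal export-and-serve sketch (an assumption, not from the original run;
# tf.contrib.predictor is TF 1.x only):
# export_dir = estimator.export_savedmodel("output/model_fn", serving_input_fn)
# from tensorflow.contrib import predictor
# predict_fn = predictor.from_saved_model(export_dir)
# print(predict_fn({"x": x_eval[:1]}))  # -> {"y": array([...])}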