BERT Deployment
Server side
Saving the model
"""=========================EXPORT MODEL========================"""
def serving_input_receiver_fn():
"""An input receiver that expects a serialized tf.Example."""
reciever_tensors = {
"input_ids": tf.placeholder(dtype=tf.int64,
shape=[1, FLAGS.max_seq_length])
}
features = {
"input_ids": reciever_tensors['input_ids'],
"input_mask": 1 - tf.cast(tf.equal(reciever_tensors['input_ids'], 0), dtype=tf.int64),
"segment_ids": tf.zeros(dtype=tf.int64,
shape=[1, FLAGS.max_seq_length]),
'label_ids': tf.zeros(dtype=tf.int64, shape=[1, 1])
}
return tf.estimator.export.ServingInputReceiver(features, reciever_tensors)
estimator._export_to_tpu = False
estimator.export_savedmodel(FLAGS.export_model_dir,serving_input_receiver_fn)
"""=========================EXPORT MODEL========================"""
The features dict above defines the keys the model graph expects, while reciever_tensors is the format the server accepts from clients: only input_ids is sent over the wire; input_mask is derived from it, and segment_ids/label_ids are filled with zeros.
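To verify the export, you can load the SavedModel back and print its serving signature. A minimal sketch (TF 1.x); the version directory below is a placeholder, since export_savedmodel names it with a timestamp:
import tensorflow as tf

# Placeholder path: export_savedmodel creates a timestamped subdirectory
# under FLAGS.export_model_dir.
export_dir = '/path/to/export_model_dir/1565000000'

with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(sess, ['serve'], export_dir)
    # Expect a single input "input_ids" with shape [1, max_seq_length].
    print(meta_graph.signature_def['serving_default'])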
Running the server
# Inside the container, 8500 is the gRPC port and 8501 the REST port; here they
# are mapped to 8700 and 8701 on the host. Note that ~ is not expanded inside
# --mount, so use $HOME (or an absolute path) instead.
docker run --runtime=nvidia \
    -e CUDA_VISIBLE_DEVICES=1 \
    -p 8700:8500 -p 8701:8501 \
    --mount type=bind,source=$HOME/saved_model,target=/models/qing \
    -e MODEL_NAME=qing \
    -t tensorflow/serving:latest-gpu
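Once the container is up, TF Serving's model-status endpoint is a quick way to confirm the model loaded; a small sketch using requests (ports as mapped above):
import requests

# The model should report state "AVAILABLE" once loading finishes.
status = requests.get('http://localhost:8701/v1/models/qing')
print(status.json())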
Client side
Connecting over HTTP
import os
import time

import numpy as np
import requests

import tokenization
from extract_features import InputExample, convert_examples_to_features

# expanduser so that a default like '~/bert_model/...' actually resolves.
vocab_file = os.path.expanduser(
    os.environ.get('vocab_file', '~/bert_model/chinese_L-12_H-768_A-12/vocab.txt'))
# Environment variables are strings, so cast explicitly.
max_token_len = int(os.environ.get('max_token_len', 128))

# Build the tokenizer once instead of reloading the vocab on every request.
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

def preprocess(text):
    """Convert raw text into the JSON payload expected by the REST API."""
    example = InputExample(unique_id=None, text_a=text, text_b=None)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}

if __name__ == '__main__':
    while True:
        text = input("Input test sentence:\n")
        start = time.time()
        resp = requests.post('http://localhost:8701/v1/models/qing:predict',
                             json=preprocess(text))
        end = time.time()
        pro_0, pro_1 = resp.json()['outputs'][0]
        print(f"negative prob: {pro_0} positive prob: {pro_1} "
              f"time: {int((end - start) * 1000)}ms")
Connecting over gRPC
import os

import grpc
import numpy as np
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc

import tokenization
from extract_features import InputExample, convert_examples_to_features

vocab_file = os.path.expanduser(
    os.environ.get('vocab_file', '~/bert_model/chinese_L-12_H-768_A-12/vocab.txt'))
max_token_len = int(os.environ.get('max_token_len', 128))

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

def preprocess(text):
    """Convert raw text into input_ids of shape (1, max_token_len)."""
    example = InputExample(unique_id=None, text_a=text, text_b=None)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {"inputs": {"input_ids": input_ids.tolist()}}

def send(txt='你好啊'):
    # 8700 is the host port mapped to the container's gRPC port 8500.
    channel = grpc.insecure_channel('localhost:8700')
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'qing'
    ids = preprocess(txt)['inputs']['input_ids']
    i = np.array(ids, dtype=np.int64)
    request.inputs['input_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(i))
    return stub.Predict(request)

if __name__ == '__main__':
    print(send('你好啊'))
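One caveat: stub.Predict blocks with no deadline by default. The stub's second positional argument is a timeout in seconds; a sketch with a deadline and error handling, reusing preprocess from the script above:
channel = grpc.insecure_channel('localhost:8700')
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = 'qing'
ids = np.array(preprocess('你好啊')['inputs']['input_ids'], dtype=np.int64)
request.inputs['input_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(ids))
try:
    result = stub.Predict(request, 10.0)  # deadline: 10 seconds
    print(result)
except grpc.RpcError as err:
    print('RPC failed:', err.code(), err.details())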
Version management and hot model updates
Earlier we saved the model under "model/1", where 1 is the version number. When a better model is trained, TensorFlow Serving does not need to be restarted: simply publish the new model under "model/<new version>", e.g. "model/2". TensorFlow Serving will detect and load the new version automatically, and clients can immediately request the API backed by it.
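A specific version can also be addressed directly over REST; a sketch that checks (and queries) version 2 after it has been dropped in, reusing preprocess from the HTTP client above:
import requests

# Status of one version; its state becomes "AVAILABLE" after the hot load.
print(requests.get('http://localhost:8701/v1/models/qing/versions/2').json())

# Predictions can be pinned to that version as well.
resp = requests.post('http://localhost:8701/v1/models/qing/versions/2:predict',
                     json=preprocess('你好啊'))
print(resp.json()['outputs'][0])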