BERT Deployment

Server Side

Saving the Model

"""=========================EXPORT MODEL========================"""

def serving_input_receiver_fn():
    """An input receiver that expects the raw input_ids tensor."""
    # Tensors the serving endpoint receives from the client.
    receiver_tensors = {
        "input_ids": tf.placeholder(dtype=tf.int64,
                                    shape=[1, FLAGS.max_seq_length])
    }
    # Features fed to the model graph; input_mask and segment_ids are derived
    # from input_ids, so the client only needs to send the token ids.
    features = {
        "input_ids": receiver_tensors['input_ids'],
        "input_mask": 1 - tf.cast(tf.equal(receiver_tensors['input_ids'], 0), dtype=tf.int64),
        "segment_ids": tf.zeros(dtype=tf.int64,
                                shape=[1, FLAGS.max_seq_length]),
        'label_ids': tf.zeros(dtype=tf.int64, shape=[1, 1])
    }
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)

# After the estimator has been built (and trained), export the SavedModel.
estimator._export_to_tpu = False
estimator.export_savedmodel(FLAGS.export_model_dir, serving_input_receiver_fn)
"""=========================EXPORT MODEL========================"""

Here, features defines which keys make up the model's input, while receiver_tensors is the format the serving endpoint accepts from the client: only input_ids is sent, and input_mask and segment_ids are derived from it on the server side.
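To sanity-check the export (a sketch: the path below is a placeholder for the timestamped directory that export_savedmodel writes under FLAGS.export_model_dir), load the SavedModel and print its serving signature; it should list input_ids as the only declared input:

import tensorflow as tf

export_dir = '/path/to/exported_model'  # placeholder: the timestamped export directory

with tf.Session(graph=tf.Graph()) as sess:
    # 'serve' is the tag that export_savedmodel attaches to the serving graph.
    meta_graph = tf.saved_model.loader.load(sess, ['serve'], export_dir)
    print(meta_graph.signature_def['serving_default'])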


Running the Server

docker run --runtime=nvidia \
    -e CUDA_VISIBLE_DEVICES=1 \
    -p 8700:8500 -p 8701:8501 \
    --mount type=bind,source=$HOME/saved_model,target=/models/qing \
    -e MODEL_NAME=qing \
    -t tensorflow/serving:latest-gpu
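Inside the container, 8500 is TensorFlow Serving's gRPC port and 8501 is its REST port, so on the host they are reachable as 8700 and 8701. A quick way to confirm the model loaded is the REST model-status endpoint (using the port mapping above):

import requests

# The model-status endpoint lists the loaded versions; a healthy model
# reports state "AVAILABLE".
print(requests.get('http://localhost:8701/v1/models/qing').json())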
Client

Connecting over HTTP
import os
import time

import numpy as np
import requests

import tokenization
from extract_features import InputExample, convert_examples_to_features

# Path to the BERT vocab; expand ~ so the tokenizer can open the file.
vocab_file = os.path.expanduser(
    os.environ.get('vocab_file', '~/bert_model/chinese_L-12_H-768_A-12/vocab.txt'))
max_token_len = int(os.environ.get('max_token_len', 128))


def preprocess(text):
    """Tokenize the text and pad/truncate it to max_token_len input ids."""
    example = InputExample(unique_id=None, text_a=text, text_b=None)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {
        "inputs": {"input_ids": input_ids.tolist()}
    }


if __name__ == '__main__':
    while True:
        text = input("Input test sentence:\n")
        start = time.time()
        # POST to the REST predict endpoint exposed on host port 8701.
        resp = requests.post('http://localhost:8701/v1/models/qing:predict', json=preprocess(text))
        end = time.time()
        pro_0, pro_1 = resp.json()['outputs'][0]
        print(f"negative pro:{pro_0} positive pro:{pro_1} time consuming:{int((end - start) * 1000)}ms")
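Assuming the fine-tuned model is the two-class classifier implied by the client above, the response body looks roughly like {"outputs": [[0.98, 0.02]]}, which is why resp.json()['outputs'][0] unpacks into the two class probabilities.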
Connecting over gRPC
import os
import time

import grpc
import numpy as np
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc

import tokenization
from extract_features import InputExample, convert_examples_to_features

# Path to the BERT vocab; expand ~ so the tokenizer can open the file.
vocab_file = os.path.expanduser(
    os.environ.get('vocab_file', '~/bert_model/chinese_L-12_H-768_A-12/vocab.txt'))
max_token_len = int(os.environ.get('max_token_len', 128))


def preprocess(text):
    """Tokenize the text and pad/truncate it to max_token_len input ids."""
    example = InputExample(unique_id=None, text_a=text, text_b=None)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)
    feature = convert_examples_to_features([example], max_token_len, tokenizer)[0]
    input_ids = np.reshape([feature.input_ids], (1, max_token_len))
    return {
        "inputs": {"input_ids": input_ids.tolist()}
    }


def send(txt='你好啊'):
    # Connect to the gRPC port (8500 in the container, mapped to 8700 on the host).
    channel = grpc.insecure_channel('localhost:8700')
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'qing'

    ids = preprocess(txt)['inputs']['input_ids']
    input_ids = np.array(ids, dtype=np.int64)
    request.inputs['input_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(input_ids))
    return stub.Predict(request)


if __name__ == '__main__':
    result = send('你好啊')
    print(result)
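If you want the probabilities as a numpy array rather than the raw protobuf, tf.make_ndarray can be added after result = send(...) in the script above (a sketch: the output key name depends on the exported signature, so check result.outputs.keys() for the actual key):

    # 'output' is an assumed key; inspect result.outputs.keys() if it differs.
    probs = tf.make_ndarray(result.outputs['output'])
    print(probs)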



Version Management and Hot Model Updates

Earlier we saved the model under "model/1", where 1 is the version number. When a better model is trained, there is no need to restart TensorFlow Serving: simply publish the new model under "model/<new version>", e.g. "model/2". TensorFlow Serving will automatically load the new version, and clients can immediately request the API backed by it.
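For example, with the directory bind-mounted in the docker command above, the layout looks roughly like this (a sketch; the version directories are whatever export_savedmodel produced or you copied in):

saved_model/
├── 1/
│   ├── saved_model.pb
│   └── variables/
└── 2/    <- publish a new version here; TF Serving loads it without a restart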
