Starting with release 21.08, Triton supports BLS (Business Logic Scripting), which lets the serving-side code that interacts with the model live inside Triton's python backend.
The chitchat model is a GPT model: each inference step yields only one token, so generation has to loop repeatedly, and every step returns a very large logits vector (expensive to transfer over the network). Moving this loop into Triton's BLS is therefore a very good fit.
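With the generation loop inside BLS, the client sends a single request per utterance instead of one request per token. A minimal client-side sketch is shown below; the model name and tensor names match the config.pbtxt that follows, while the server address and the example token ids are placeholder assumptions:
# Hypothetical client-side call (sketch): one request to the BLS model replaces
# the per-token loop; the server URL and the input token ids are assumptions.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# input_ids / attention_mask / token_type_ids would normally come from a tokenizer
input_ids = np.array([[0, 101, 102, 103, 2]], dtype=np.int64)  # made-up token ids
attention_mask = np.ones_like(input_ids)
token_type_ids = np.zeros_like(input_ids)

inputs = []
for name, data in [("INPUT__0", input_ids),
                   ("INPUT__1", attention_mask),
                   ("INPUT__2", token_type_ids)]:
    tensor = httpclient.InferInput(name, list(data.shape), "INT64")
    tensor.set_data_from_numpy(data)
    inputs.append(tensor)

outputs = [httpclient.InferRequestedOutput("OUTPUT__0"),
           httpclient.InferRequestedOutput("OUTPUT__1")]

result = client.infer("ibuddha_chitchat_bls", inputs, outputs=outputs)
generated_ids = result.as_numpy("OUTPUT__0")   # INT32 token ids of the reply
confidences = result.as_numpy("OUTPUT__1")     # FP32 per-token probabilities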
Reference code:
config.pbtxt
name: "ibuddha_chitchat_bls"
backend: "python"
max_batch_size: 64
input [
  {
    name: "INPUT__0"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "INPUT__1"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "INPUT__2"
    data_type: TYPE_INT64
    dims: [ -1 ]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "OUTPUT__1"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [{ kind: KIND_CPU }]
dynamic_batching {
}
model.py
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import from_dlpack,to_dlpack
import torch.nn.functional as F
import torch
import json
import numpy as np
class TritonPythonModel:
"""Your Python model must use the same class name. Every Python model
that is created must have "TritonPythonModel" as the class name.
"""
def initialize(self, args):
"""`initialize` is called only once when the model is being loaded.
Implementing `initialize` function is optional. This function allows
the model to initialize any state associated with this model.
Parameters
----------
args : dict
Both keys and values are strings. The dictionary keys and values are:
* model_config: A JSON string containing the model configuration
* model_instance_kind: A string containing model instance kind
* model_instance_device_id: A string containing model instance device ID
* model_repository: Model repository path
* model_version: Model version
* model_name: Model name
"""
# You must parse model_config. JSON string is not parsed here
self.model_config = json.loads(args['model_config'])
input0_config = pb_utils.get_input_config_by_name(
self.model_config, "INPUT__0")
input1_config = pb_utils.get_input_config_by_name(
self.model_config, "INPUT__1")
input2_config = pb_utils.get_input_config_by_name(
self.model_config, "INPUT__2")
output0_config = pb_utils.get_output_config_by_name(
self.model_config, "OUTPUT__0")
output1_config = pb_utils.get_output_config_by_name(
self.model_config, "OUTPUT__1")
# Convert Triton types to numpy types
self.input0_dtype = pb_utils.triton_string_to_numpy(
input0_config['data_type'])
self.input1_dtype = pb_utils.triton_string_to_numpy(
input1_config['data_type'])
self.input2_dtype = pb_utils.triton_string_to_numpy(
input2_config['data_type'])
self.output0_dtype = pb_utils.triton_string_to_numpy(
output0_config['data_type'])
self.output1_dtype = pb_utils.triton_string_to_numpy(
output1_config['data_type'])
#self.cls, self.sep, self.pad, self.speaker1, self.speaker2 = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", "[PAD]", "[speaker1]", "[speaker2]"])
#self.special_tokens_ids = [self.cls, self.sep, self.pad, self.speaker1, self.speaker2]
self.special_tokens_ids = [0, 2, 1, 13086, 13087]
self.output_min_length = 1
self.output_max_length = 64 #TODO: change
self.temperature = 0.7
self.top_p = 0.7
self.round = 1
def execute(self, requests):
"""`execute` must be implemented in every Python model. `execute`
function receives a list of pb_utils.InferenceRequest as the only
argument. This function is called when an inference request is made
for this model. Depending on the batching configuration (e.g. Dynamic
Batching) used, `requests` may contain multiple requests. Every
Python model must create one pb_utils.InferenceResponse for every
pb_utils.InferenceRequest in `requests`. If there is an error, you can
set the error argument when creating a pb_utils.InferenceResponse
Parameters
----------
requests : list
A list of pb_utils.InferenceRequest
Returns
-------
list
A list of pb_utils.InferenceResponse. The length of this list must
be the same as `requests`
"""
responses = []
# Every Python backend must iterate over every one of the requests
# and create a pb_utils.InferenceResponse for each of them.
for request in requests:
# Get INPUT0
in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT__0")
in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT__1")
in_2 = pb_utils.get_input_tensor_by_name(request, "INPUT__2")
#pytorch_tensor = from_dlpack(in_0.to_dlpack())
#print(pytorch_tensor)
# Get Model Name
#model_name = pb_utils.get_input_tensor_by_name(
# request, "MODEL_NAME")
# Model Name string
#model_name_string = model_name.as_numpy()[0]
model_name_string = "ibuddha_chitchat"
# Create inference request object
# Perform synchronous blocking inference request
# Create InferenceResponse. You can set an error here in case
# there was a problem with handling this inference request.
# Below is an example of how you can set errors in inference
# response:
#
# pb_utils.InferenceResponse(
# output_tensors=..., TritonError("An error occurred"))
#
# Because the infer_response of the models contains the final
# outputs with correct output names, we can just pass the list
# of outputs to the InferenceResponse object.
#print(type(infer_response))
output_ids = []
output_confidences = []
for i in range(self.output_max_length):
infer_request = pb_utils.InferenceRequest(
model_name=model_name_string,
requested_output_names=["OUTPUT__0"],
inputs=[in_0, in_1, in_2])
infer_response = infer_request.exec()
if infer_response.has_error():
raise pb_utils.TritonModelException(
infer_response.error().message())
output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT__0')
#_logits = output0.as_numpy()
#logits = torch.from_numpy(np.array(_logits))
logits = from_dlpack(output0.to_dlpack())
#print(pytorch_tensor)
#_logits = self.triton_infer(encoded_input)[0]
#logits = torch.from_numpy(np.array(_logits))
logits = logits[0, :] / self.temperature
top_logits = self.top_filtering(logits, self.top_p)
probs = F.softmax(top_logits, dim=-1)
prev = torch.multinomial(probs, num_samples=1)
if i < self.output_min_length and prev.item() in self.special_tokens_ids:
while prev.item() in self.special_tokens_ids:
prev = torch.multinomial(probs, num_samples=1)
output_id = prev.item()
if output_id in self.special_tokens_ids:
break
output_ids.append(output_id)
output_confidences.append(probs[output_id].item())
input_ids = torch.from_numpy(in_0.as_numpy())
attention_mask = torch.from_numpy(in_1.as_numpy())
token_type_ids = torch.from_numpy(in_2.as_numpy())
#input_ids = from_dlpack(in_0.to_dlpack())
#attention_mask = from_dlpack(in_1.to_dlpack())
#token_type_ids = from_dlpack(in_2.to_dlpack())
input_ids = torch.cat((input_ids, torch.LongTensor([[output_id]])), 1)
attention_mask = torch.cat((attention_mask, torch.LongTensor([[1]])), 1)
token_type_ids = torch.cat((token_type_ids, torch.LongTensor([[output_id]])), 1)
in_0 = pb_utils.Tensor("INPUT__0", input_ids.numpy().astype(self.input0_dtype))
in_1 = pb_utils.Tensor("INPUT__1", attention_mask.numpy().astype(self.input1_dtype))
in_2 = pb_utils.Tensor("INPUT__2", token_type_ids.numpy().astype(self.input2_dtype))
#in_0 = pb_utils.Tensor.from_dlpack("INPUT__0", to_dlpack(input_ids))
#in_1 = pb_utils.Tensor.from_dlpack("INPUT__1", to_dlpack(attention_mask))
#in_2 = pb_utils.Tensor.from_dlpack("INPUT__2", to_dlpack(token_type_ids))
#print(infer_response.output_tensors())
output_ids = torch.tensor(output_ids)
output_confidences = torch.tensor(output_confidences)
output_0 = pb_utils.Tensor("OUTPUT__0", output_ids.numpy().astype(self.output0_dtype))
output_1 = pb_utils.Tensor("OUTPUT__1", output_confidences.numpy().astype(self.output1_dtype))
#output_0 = pb_utils.Tensor.from_dlpack("OUTPUT__0", to_dlpack(output_ids))
#output_1 = pb_utils.Tensor.from_dlpack("OUTPUT__1", to_dlpack(output_confidences))
inference_response = pb_utils.InferenceResponse(
output_tensors=[output_0, output_1])
#print(type(inference_response))
responses.append(inference_response)
# You should return a list of pb_utils.InferenceResponse. Length
# of this list must match the length of `requests` list.
return responses
def top_filtering(self, logits, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')):
#assert logits.dim() == 1 # Only work for batch size 1 for now - could update but it would obfuscate a bit the code
if top_p > 0.0:
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
sorted_indices_to_remove = cumulative_probabilities > top_p
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
indices_to_remove = sorted_indices[sorted_indices_to_remove]
logits[indices_to_remove] = filter_value
indices_to_remove = logits < threshold
logits[indices_to_remove] = filter_value
return logits
def finalize(self):
"""`finalize` is called only once when the model is being unloaded.
Implementing `finalize` function is OPTIONAL. This function allows
the model to perform any necessary clean ups before exit.
"""
print('Cleaning up...')
Dockerfile
FROM hub.yun.paic.com.cn/pib-core/triton-server:21.11-py3
RUN mkdir .pip
COPY tools/pip.conf /root/.pip/
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN pip3 install torch
#CMD tritonserver --model-repository=/models --model-control-mode=poll --exit-on-error=false
Like any other model, model.py is placed in its own subdirectory inside the Triton model repository.
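For orientation, the model repository might be laid out roughly as follows; the two model names come from the config and code above, while the repository root and the file name of the GPT model's artifact are assumptions:
models/
├── ibuddha_chitchat_bls/          # python backend (BLS), the code in this section
│   ├── 1/
│   │   └── model.py
│   └── config.pbtxt
└── ibuddha_chitchat/              # the GPT model called by infer_request.exec()
    ├── 1/
    │   └── model.pt               # assumed PyTorch (libtorch) backend artifact
    └── config.pbtxt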
Because this is Python code, third-party libraries are involved; they have to be installed on top of the stock Triton image, so an additional image build is required.
One point deserves particular attention:
The python backend is configured with instance_group [{ kind: KIND_CPU }],
while the model it actually calls runs on the GPU.
As a result, once
infer_response = infer_request.exec()
completes, the inference result still lives on the GPU and cannot be used directly.
You must use to_dlpack to hand the GPU buffer over (DLPack shares the buffer rather than copying it back to the host) and from_dlpack to turn it into a PyTorch tensor:
logits = from_dlpack(output0.to_dlpack())
There are two ways to convert a Triton tensor into a PyTorch tensor:
input_ids = from_dlpack(in_0.to_dlpack())
input_ids = torch.from_numpy(in_0.as_numpy())
Using to_dlpack and from_dlpack has lower overhead.
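As a side-by-side sketch (tensor names are taken from the code above; the helper functions exist only for illustration):
import numpy as np
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack
import triton_python_backend_utils as pb_utils

def triton_to_torch(in_0):
    # 1) Copy path (CPU tensors): Triton tensor -> numpy array -> torch tensor (extra copy)
    via_numpy = torch.from_numpy(in_0.as_numpy())
    # 2) DLPack path: hands over the underlying buffer without an extra host copy;
    #    this also works when the tensor returned by a BLS call lives on the GPU
    via_dlpack = from_dlpack(in_0.to_dlpack())
    return via_numpy, via_dlpack

def torch_to_triton(output_ids):
    # Copy path: torch tensor -> numpy array -> Triton tensor
    via_numpy = pb_utils.Tensor("OUTPUT__0", output_ids.numpy().astype(np.int32))
    # DLPack path: share the buffer directly
    via_dlpack = pb_utils.Tensor.from_dlpack("OUTPUT__0", to_dlpack(output_ids))
    return via_numpy, via_dlpack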