2021SC@SDUSC
基于人工智能的多肽药物分析问题
主题:蛋白质预训练模型(2)
代码分析
根据使用方法,将按照以上顺序对每部分代码进行分析。
特征提取 : Embedding Section
ProtTrans/Embedding/Onnx/ProtBert-BFD.ipynb
ONNX简介:
Open Neural Network Exchange(ONNX,开放神经网络交换)格式,是一个用于表示深度学习模型的标准,可使模型在不同框架之间进行转移。ONNX是一种针对机器学习所设计的开放式的文件格式,用于存储训练好的模型。它使得不同的人工智能框架(如Pytorch, MXNet)可以采用相同格式存储模型数据并交互。
导入依赖库
import torch
from transformers import BertModel, BertTokenizer, pipeline
from pathlib import Path
from transformers.convert_graph_to_onnx import convert
import numpy as np
import os
from tqdm.auto import tqdm
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import re
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from tqdm import trange
from os import environ
from psutil import cpu_count
# optimize transformer-based models with onnxruntime-tools
from onnxruntime_tools import optimizer
# This is optional if you need more optimization for bert based models
from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
创建一些数据
# Demo protein sequences (single-letter amino-acid residue codes) used by
# the benchmark loops below. Each entry is one raw, unspaced sequence.
# NOTE(review): the second sequence contains an embedded space ("...QTV PAAV...")
# — looks like a paste artifact; confirm it is intentional.
proteins = ["VKAPHEHEHEDNPSVSRVFNRLSQARQNARSIIWGGDGELLTGLIRMVLQVTSPDDKKLSQRDEDGPRLNNCIFTIGFLVPYISEKLKALVTLEVGKPAWTAGDRRKVMGHCADEQIRLILTCAFVGYHTAYKEKFTSEAGSELMGLAFDKYDESYTVEMKKITPFGSSRLVIVLLQFNMQADNAPANAPLAVQSIYGIT",
"TYGASGLERPGYIIDSAQLSLRYGADADRAARFPNISAMRVGTLCTLGSLPKQTTNPINFQKIVDSGAPKDDGDGWERKWLTGCIANPYTILGTPTSGEGKDQRPQDGSRVNISEHLQTV PAAVQSQGESWEVGEEMKLKRESIIRVQPELTLSILGDKQARTLPAHSMKKSLEPAARVIMNLLPRRHEKVFCEPTDRRAQELIELAMERNNYDMLELSPHAPDTPRLAVEEAVAAINARLAFVGLGDRGNESGLFYVVDKGDKFDDEYSSYINIVGIQKLWKGSEVLKWVSGVGSMTHNEIRPGTCCNEPQDMFIQELVHLTTLFNGTVRGGVKDFKSLTFSALLEAEAVSAKSIFTRLRLHMKIYLYNISHDHFSADVLDQYKLGFAAEYKHRAALPENKKLTLDLYSAFFSTEPQSIGANIVMEASKGQVPITTMLIINKRISVLGAALALQVELKNFYNNRLRVFDQGEYPCAKLKVLWFTMEHHGCDIFVKVFVTAPDVQDEVAIVS",
"PSKSLKCTGGNAFALRFIRQDDAEVAPLGVSEIWLNDIGMKHELYQTVRT",
"IKNEIVHTVFSTLQPRDHLNGLRLQIEEATCDALRVPLGAGGLVAEEVSKKRPALYDANYFEDVLKTAGIFSPRVSGRADMEKQGFNPKMTSLQSYWSII"]
对上面的数据进行处理
用空格分隔,将UZOB替换为X
# Insert whitespace between residues (the ProtBert tokenizer expects
# space-separated amino acids) and map the rare/ambiguous residues
# U, Z, O, B to the unknown residue X, in a single pass per sequence.
proteins = [re.sub(r"[UZOB]", "X", " ".join(seq)) for seq in proteins]
导入分词器
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False)
pytorch转换为onnx
# Export the PyTorch checkpoint to an ONNX graph (opset 12). The helper
# pulls both model and tokenizer from the Hugging Face hub and writes the
# serialized graph to onnx/prot_bert_bfd.onnx.
convert(
    framework="pt",
    model="Rostlab/prot_bert_bfd",
    tokenizer="Rostlab/prot_bert_bfd",
    output=Path("onnx/prot_bert_bfd.onnx"),
    opset=12,
)
优化onnx模型
设置参数
# Keep the embedding layer-norm unfused: disabling that optimization gives
# a better model-size reduction.
bert_opts = BertOptimizationOptions('bert')
bert_opts.enable_embed_layer_norm = False

# Graph-level optimization of the exported model; head count and hidden
# size match the ProtBert-BFD architecture (16 heads, hidden size 1024).
optimized = optimizer.optimize_model(
    'onnx/prot_bert_bfd.onnx',
    'bert',
    num_heads=16,
    hidden_size=1024,
    optimization_options=bert_opts)

# Optional: cast the weights to FP16 to run the model with mixed precision.
optimized.convert_model_float32_to_float16()
optimized.save_model_to_file('onnx/prot_bert_bfd.opt.onnx')
Embed with onnx
def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    """Build an ONNX Runtime inference session pinned to one execution provider.

    Parameters
    ----------
    model_path : str
        Path to the serialized ``.onnx`` model file.
    provider : str
        Name of an ONNX Runtime execution provider, e.g.
        ``"CPUExecutionProvider"``.

    Returns
    -------
    InferenceSession
        Session with full graph optimization enabled and provider fallback
        disabled, so inference fails loudly instead of silently running on
        an unintended backend.

    Raises
    ------
    ValueError
        If *provider* is not available in this onnxruntime build.
    """
    # Validate with an explicit raise: `assert` is stripped under `python -O`.
    available = get_all_providers()
    if provider not in available:
        raise ValueError(f"provider {provider} not found, {available}")

    # Few properties that might have an impact on performance (provided by MS).
    options = SessionOptions()
    options.intra_op_num_threads = 1
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

    # Load the model as a graph and bind it to the requested backend only.
    session = InferenceSession(model_path, options, providers=[provider])
    session.disable_fallback()
    return session
@contextmanager
def track_infer_time(buffer: "list[float]"):
    """Append the wall-clock duration (seconds) of the ``with`` body to *buffer*.

    The annotation was previously ``[int]`` — a list literal, not a type;
    ``time() - time()`` yields floats, so the buffer holds floats.
    """
    start = time()
    yield
    end = time()
    buffer.append(end - start)
@dataclass
class OnnxInferenceResult:
    """Bundle of benchmark results for one ONNX model run.

    The field annotation was previously ``[int]`` — a list literal, not a
    type; the recorded durations are floats.
    """
    # Presumably per-run inference durations in seconds — confirm at call site.
    model_inference_time: "list[float]"
    # Filesystem path of the optimized .onnx model that produced the timings.
    optimized_model_path: str
ONNX 图优化方法
ONNX Runtime 提供了各种图优化来提高模型性能。图优化本质上是图级别的转换,从小型图简化和节点消除,到更复杂的节点融合和布局优化。
图优化级别Graph Optimization Levels
图形优化分为三个级别:
- Basic 基础级别
- Extended 扩展级别
- Layout Optimizations 布局优化
在应用当前级别的优化之前,会执行当前级别之前的优化(例如我们准备执行extended优化,Basic级别的优化会在执行extended优化之前先执行)。
# Benchmark the FP16 ONNX model: 50 timing passes over the demo sequences.
# NOTE(review): `proteins` holds plain strings here, so `batch` is a single
# sequence; max()/min() then iterate over characters (always length 1) and
# `batch_size` is the sequence length. These stats look written for a
# list-of-batches input — confirm against the full notebook.
# `gpu_model` and `inference_time` are created in cells not shown here.
totalDuration = 0
for itr in range(50):
    for batch in proteins:
        maxLen = len(max(batch, key=len))/2
        minLen = len(min(batch, key=len))/2
        batch_size = len(batch)
        time1 = time()
        # Inputs are fed to ONNX Runtime as numpy arrays.
        model_inputs = tokenizer(batch, return_tensors="pt", padding=True, add_special_tokens=True)
        inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}
        # Run the model (None = fetch all outputs).
        onnx_embedding, onnx_pooled = gpu_model.run(None, inputs_onnx)
        time2 = time()
        duration = time2-time1
        totalDuration += duration
        # Fixed: '{:3f}' (field width 3) -> '{:.3f}' (3 decimal places) for maxLen.
        print('{:s} model took {:.3f} ms for sequence length between {:.3f} and {:.3f} of total sequences {:d}'.format('ProtBert-BFD', duration*1000.0, minLen, maxLen, batch_size))
inference_time["Onnx FP16"] = totalDuration*1000.0
print('Total duration is {:.3f}'.format(totalDuration*1000.0))
Embed with pytorch
# Load the reference PyTorch implementation for the timing comparison,
# move it to the best available device, and put it in inference mode.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = BertModel.from_pretrained("Rostlab/prot_bert_bfd").to(device).eval()
# Benchmark the PyTorch FP32 model: same 50 timing passes as the ONNX run.
# NOTE(review): `batch` is one sequence string, so the max()/min() stats
# iterate over characters — confirm the intended input was batched lists.
totalDuration = 0
for itr in range(50):
    for batch in proteins:
        maxLen = len(max(batch, key=len))/2
        minLen = len(min(batch, key=len))/2
        batch_size = len(batch)
        time1 = time()
        ids = tokenizer(batch, return_tensors="pt", padding=True, add_special_tokens=True)
        # Fixed: ids['input_ids'] is already a tensor (return_tensors="pt");
        # re-wrapping it in torch.tensor() copies the data and warns.
        input_ids = ids['input_ids'].to(device)
        with torch.no_grad():
            pytorch_embedding, pytorch_pool = model(input_ids=input_ids)
        time2 = time()
        duration = time2-time1
        totalDuration += duration
        # Fixed: '{:3f}' -> '{:.3f}' for the maxLen placeholder.
        print('{:s} model took {:.3f} ms for sequence length between {:.3f} and {:.3f} of total sequences {:d}'.format('ProtBert-BFD', duration*1000.0, minLen, maxLen, batch_size))
inference_time["Pytorch FP32"] = totalDuration*1000.0
print('Total duration is {:.3f}'.format(totalDuration*1000.0))
计算耗费时长
# Cast the weights to FP16 and repeat the timing benchmark.
# NOTE(review): nn.Module.half() converts the module in place and returns
# it, so `model` and `model_half` alias the same FP16 model afterwards.
model_half = model.half()
totalDuration = 0
for itr in range(50):
    for batch in proteins:
        # NOTE(review): `batch` is one sequence string, so the max()/min()
        # stats iterate over characters — confirm the intended input.
        maxLen = len(max(batch, key=len))/2
        minLen = len(min(batch, key=len))/2
        batch_size = len(batch)
        time1 = time()
        ids = tokenizer(batch, return_tensors="pt", padding=True, add_special_tokens=True)
        # Fixed: ids['input_ids'] is already a tensor (return_tensors="pt");
        # re-wrapping it in torch.tensor() copies the data and warns.
        input_ids = ids['input_ids'].to(device)
        with torch.no_grad():
            pytorch_embedding_half, pytorch_pool_half = model_half(input_ids=input_ids)
        time2 = time()
        duration = time2-time1
        totalDuration += duration
        # Fixed: '{:3f}' -> '{:.3f}' for the maxLen placeholder.
        print('{:s} model took {:.3f} ms for sequence length between {:.3f} and {:.3f} of total sequences {:d}'.format('ProtBert-BFD', duration*1000.0, minLen, maxLen, batch_size))
inference_time["Pytorch FP16"] = totalDuration*1000.0
print('Total duration is {:.3f}'.format(totalDuration*1000.0))