Shandong University Class of 2019 Software Engineering Application and Practice: AI-Based Peptide Drug Analysis (8)

2021SC@SDUSC

AI-Based Peptide Drug Analysis

Topic: Protein Pre-trained Models (2)
Code Analysis

Following the usage instructions of the repository, each part of the code is analyzed in order below.

Feature Extraction: Embedding Section

ProtTrans/Embedding/Onnx/ProtBert-BFD.ipynb

About ONNX:
The Open Neural Network Exchange (ONNX) format is a standard for representing deep learning models that allows models to be moved between frameworks. ONNX is an open file format designed for machine learning and used to store trained models, so that different AI frameworks (such as PyTorch and MXNet) can save model data in the same format and interoperate.
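As a quick, hypothetical illustration of the interchange idea (not part of the notebook): a toy PyTorch model can be exported to an .onnx file and read back with the onnx package. The model tiny and the file name tiny.onnx are made up for this sketch.

import torch
import torch.nn as nn
import onnx

# A tiny stand-in model, exported to the ONNX format
tiny = nn.Sequential(nn.Linear(8, 4), nn.ReLU())
dummy_input = torch.randn(1, 8)
torch.onnx.export(tiny, dummy_input, "tiny.onnx", opset_version=12)

# Any framework or tool that understands ONNX can now read the file back
model_proto = onnx.load("tiny.onnx")
onnx.checker.check_model(model_proto)  # validates the graph structure
print(onnx.helper.printable_graph(model_proto.graph))  # human-readable dump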

Import the dependencies

import torch
from transformers import BertModel, BertTokenizer, pipeline
from pathlib import Path
from transformers.convert_graph_to_onnx import convert

import numpy as np
import os
from tqdm.auto import tqdm
import math
import matplotlib
import matplotlib.pyplot as plt
import re

from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from tqdm import trange

from os import environ
from psutil import cpu_count

# optimize transformer-based models with onnxruntime-tools
from onnxruntime_tools import optimizer
# This is optional if you need more optimization for bert based models
from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers
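environ and cpu_count are imported but never used in the cells shown; presumably the notebook also configures OpenMP threading for onnxruntime. A sketch of the usual setup (the exact values are assumptions, and these variables only take effect if set before onnxruntime is imported):

# OpenMP threading knobs read by onnxruntime on CPU;
# they must be set before onnxruntime is imported to take effect
environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
environ["OMP_WAIT_POLICY"] = "ACTIVE"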

Create some sample data

proteins = ["VKAPHEHEHEDNPSVSRVFNRLSQARQNARSIIWGGDGELLTGLIRMVLQVTSPDDKKLSQRDEDGPRLNNCIFTIGFLVPYISEKLKALVTLEVGKPAWTAGDRRKVMGHCADEQIRLILTCAFVGYHTAYKEKFTSEAGSELMGLAFDKYDESYTVEMKKITPFGSSRLVIVLLQFNMQADNAPANAPLAVQSIYGIT",
                 "TYGASGLERPGYIIDSAQLSLRYGADADRAARFPNISAMRVGTLCTLGSLPKQTTNPINFQKIVDSGAPKDDGDGWERKWLTGCIANPYTILGTPTSGEGKDQRPQDGSRVNISEHLQTV PAAVQSQGESWEVGEEMKLKRESIIRVQPELTLSILGDKQARTLPAHSMKKSLEPAARVIMNLLPRRHEKVFCEPTDRRAQELIELAMERNNYDMLELSPHAPDTPRLAVEEAVAAINARLAFVGLGDRGNESGLFYVVDKGDKFDDEYSSYINIVGIQKLWKGSEVLKWVSGVGSMTHNEIRPGTCCNEPQDMFIQELVHLTTLFNGTVRGGVKDFKSLTFSALLEAEAVSAKSIFTRLRLHMKIYLYNISHDHFSADVLDQYKLGFAAEYKHRAALPENKKLTLDLYSAFFSTEPQSIGANIVMEASKGQVPITTMLIINKRISVLGAALALQVELKNFYNNRLRVFDQGEYPCAKLKVLWFTMEHHGCDIFVKVFVTAPDVQDEVAIVS",
                 "PSKSLKCTGGNAFALRFIRQDDAEVAPLGVSEIWLNDIGMKHELYQTVRT",
                 "IKNEIVHTVFSTLQPRDHLNGLRLQIEEATCDALRVPLGAGGLVAEEVSKKRPALYDANYFEDVLKTAGIFSPRVSGRADMEKQGFNPKMTSLQSYWSII"]

Preprocess the data above:
Separate the residues with spaces and replace the amino acids U, Z, O and B with X.

proteins = [" ".join(list(item)) for item in proteins]
proteins = [re.sub(r"[UZOB]", "X", sequence) for sequence in proteins]
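ProtBert's vocabulary treats each residue as a token and maps the rare amino acids U, Z, O and B to the unknown residue X, which is why both steps are needed. On a short made-up sequence the transformation looks like this:

demo = "MKUZLT"
demo = " ".join(list(demo))          # -> "M K U Z L T"
demo = re.sub(r"[UZOB]", "X", demo)  # -> "M K X X L T"
print(demo)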

Load the tokenizer

tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False)
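A quick check (not in the original cells) of what the tokenizer produces for a short, hypothetical sequence; note the [CLS] and [SEP] special tokens wrapped around the residues:

demo_ids = tokenizer("M K X L T", add_special_tokens=True)
print(demo_ids["input_ids"])
print(tokenizer.convert_ids_to_tokens(demo_ids["input_ids"]))
# expected along the lines of: ['[CLS]', 'M', 'K', 'X', 'L', 'T', '[SEP]']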

Convert the PyTorch model to ONNX

convert(framework="pt",
        model="Rostlab/prot_bert_bfd",
        output=Path("onnx/prot_bert_bfd.onnx"),
        opset=12,
        tokenizer="Rostlab/prot_bert_bfd")

Optimize the ONNX model
Set the optimization options

# disable embedding layer norm optimization for better model size reduction
opt_options = BertOptimizationOptions('bert')
opt_options.enable_embed_layer_norm = False

opt_model = optimizer.optimize_model(
    'onnx/prot_bert_bfd.onnx',
    'bert', 
    num_heads=16,
    hidden_size=1024,
    optimization_options=opt_options)
# This is optional if you need to run the model using mixed precision
opt_model.convert_model_float32_to_float16()

opt_model.save_model_to_file('onnx/prot_bert_bfd.opt.onnx')
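A quick sanity check on the effect of the optimization pass; the FP32-to-FP16 conversion alone should roughly halve the weight storage:

from os.path import getsize

print("original : {:.0f} MB".format(getsize("onnx/prot_bert_bfd.onnx") / 1e6))
print("optimized: {:.0f} MB".format(getsize("onnx/prot_bert_bfd.opt.onnx") / 1e6))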

Embed with ONNX

def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: 
  
  assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

  # A few properties that can have an impact on performance (suggested by Microsoft)
  options = SessionOptions()
  options.intra_op_num_threads = 1
  options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

  # Load the model as a graph and prepare the backend for the requested provider
  session = InferenceSession(model_path, options, providers=[provider])
  session.disable_fallback()
    
  return session


@contextmanager
def track_infer_time(buffer: [int]):
    start = time()
    yield
    end = time()

    buffer.append(end - start)


@dataclass
class OnnxInferenceResult:
  model_inference_time: [int]  
  optimized_model_path: str
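The benchmark loops below reference a session called gpu_model and an inference_time dict that are not created in the cells shown. Presumably the notebook instantiates them with the helper above, roughly as follows (the CUDA provider is an assumption; the FP16 model effectively requires a GPU):

# Collect the timing results of each variant here
inference_time = {}

# ONNX Runtime session for the optimized FP16 model
# (assumes a CUDA-capable onnxruntime build is installed)
gpu_model = create_model_for_provider("onnx/prot_bert_bfd.opt.onnx",
                                      "CUDAExecutionProvider")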

How ONNX Runtime optimizes the graph
ONNX Runtime provides various graph optimizations to improve model performance. Graph optimizations are essentially graph-level transformations, ranging from small graph simplifications and node eliminations up to more complex node fusions and layout optimizations.

Graph Optimization Levels
The graph optimizations are grouped into three levels:

  • Basic
  • Extended
  • Layout Optimizations

Before the optimizations of the current level are applied, those of the preceding levels are performed first (for example, when requesting Extended optimizations, the Basic optimizations run before them).
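The level is selected through SessionOptions; create_model_for_provider above hard-codes ORT_ENABLE_ALL, but any level can be requested. A sketch (the output file name is made up):

opts = SessionOptions()
# Pick one of the levels exposed by onnxruntime
opts.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
# ... or ORT_ENABLE_EXTENDED (basic + extended fusions)
# ... or ORT_ENABLE_ALL (extended + layout optimizations)

# Optionally serialize the optimized graph for inspection
opts.optimized_model_filepath = "onnx/prot_bert_bfd.basic.onnx"
session = InferenceSession("onnx/prot_bert_bfd.onnx", opts,
                           providers=["CPUExecutionProvider"])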

totalDuration = 0
batches = [proteins]  # assumption: the notebook batches its inputs; here all four sequences form one batch

for itr in range(50):
  for batch in batches:
    maxLen = len(max(batch, key=len))/2
    minLen = len(min(batch, key=len))/2
    batch_size = len(batch)

    time1 = time()

    # Inputs are provided as numpy arrays
    model_inputs = tokenizer(batch, return_tensors="pt", padding=True, add_special_tokens=True)
    inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}

    # Run the model (None = return all the outputs)
    onnx_embedding, onnx_pooled = gpu_model.run(None, inputs_onnx)

    time2 = time()

    duration = time2-time1
    totalDuration += duration
    print('{:s} model took {:.3f} ms for sequence lengths between {:.1f} and {:.1f} over {:d} sequences'.format(
        'ProtBert-BFD', duration * 1000.0, minLen, maxLen, batch_size))

inference_time["Onnx FP16"] = totalDuration*1000.0
print('Total duration is {:.3f}'.format(totalDuration*1000.0))

Embed with PyTorch

model = BertModel.from_pretrained("Rostlab/prot_bert_bfd")
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model = model.eval()

totalDuration = 0

for itr in range(50):
  for batch in batches:
    maxLen = len(max(batch, key=len))/2
    minLen = len(min(batch, key=len))/2
    batch_size = len(batch)

    time1 = time()

    ids = tokenizer(batch, return_tensors="pt", padding=True, add_special_tokens=True)
    input_ids = ids['input_ids'].to(device)  # already a tensor, just move it to the device
    with torch.no_grad():
      # return_dict=False keeps the (sequence_output, pooled_output) tuple unpacking valid
      pytorch_embedding, pytorch_pool = model(input_ids=input_ids, return_dict=False)

    time2 = time()

    duration = time2-time1
    totalDuration += duration
    print('{:s} model took {:.3f} ms for sequence lengths between {:.1f} and {:.1f} over {:d} sequences'.format(
        'ProtBert-BFD', duration * 1000.0, minLen, maxLen, batch_size))

inference_time["Pytorch FP32"] = totalDuration*1000.0
print('Total duration is {:.3f}'.format(totalDuration*1000.0))

Measure the elapsed time with half precision (FP16)

model_half = model.half()
totalDuration = 0

for itr in range(50):
  for batch in batches:
    maxLen = len(max(batch, key=len))/2
    minLen = len(min(batch, key=len))/2
    batch_size = len(batch)

    time1 = time()

    ids = tokenizer(batch, return_tensors="pt", padding=True, add_special_tokens=True)
    input_ids = ids['input_ids'].to(device)
    with torch.no_grad():
      pytorch_embedding_half, pytorch_pool_half = model_half(input_ids=input_ids, return_dict=False)

    time2 = time()

    duration = time2-time1
    totalDuration += duration
    print('{:s} model took {:.3f} ms for sequence lengths between {:.1f} and {:.1f} over {:d} sequences'.format(
        'ProtBert-BFD', duration * 1000.0, minLen, maxLen, batch_size))

inference_time["Pytorch FP16"] = totalDuration*1000.0
print('Total duration is {:.3f}'.format(totalDuration*1000.0))
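matplotlib is imported at the top but never used in the cells shown; presumably the notebook closes by comparing the accumulated timings. A sketch of such a comparison:

labels = list(inference_time.keys())
values = [inference_time[k] for k in labels]

plt.figure(figsize=(8, 4))
plt.bar(labels, values)
plt.ylabel("total inference time (ms)")
plt.title("ProtBert-BFD embedding: ONNX FP16 vs PyTorch FP32/FP16")
plt.show()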
