2021SC@SDUSC
Peptide Drug Analysis Based on Artificial Intelligence
Topic: Protein Pre-trained Models (3)
Code Analysis
Fine-Tuning Section
ProtTrans/Fine-Tuning/ProtBert-BFD-FineTune-SS3.ipynb
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForTokenClassification, BertTokenizerFast, EvalPrediction
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import re
Select the model you want to fine-tune.
model_name = 'Rostlab/prot_bert_bfd'
Download the NetSurfP dataset
def downloadNetsurfpDataset():
    netsurfpDatasetTrainUrl = 'https://www.dropbox.com/s/98hovta9qjmmiby/Train_HHblits.csv?dl=1'
    casp12DatasetValidUrl = 'https://www.dropbox.com/s/te0vn0t7ocdkra7/CASP12_HHblits.csv?dl=1'
    cb513DatasetValidUrl = 'https://www.dropbox.com/s/9mat2fqqkcvdr67/CB513_HHblits.csv?dl=1'
    ts115DatasetValidUrl = 'https://www.dropbox.com/s/68pknljl9la8ax3/TS115_HHblits.csv?dl=1'

    datasetFolderPath = "dataset/"
    trainFilePath = os.path.join(datasetFolderPath, 'Train_HHblits.csv')
    casp12testFilePath = os.path.join(datasetFolderPath, 'CASP12_HHblits.csv')
    cb513testFilePath = os.path.join(datasetFolderPath, 'CB513_HHblits.csv')
    ts115testFilePath = os.path.join(datasetFolderPath, 'TS115_HHblits.csv')
    combinedtestFilePath = os.path.join(datasetFolderPath, 'Validation_HHblits.csv')

    if not os.path.exists(datasetFolderPath):
        os.makedirs(datasetFolderPath)

    def download_file(url, filename):
        response = requests.get(url, stream=True)
        with tqdm.wrapattr(open(filename, "wb"), "write", miniters=1,
                           total=int(response.headers.get('content-length', 0)),
                           desc=filename) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)

    # Save each dataset split to disk if it has not been downloaded yet
    if not os.path.exists(trainFilePath):
        download_file(netsurfpDatasetTrainUrl, trainFilePath)
    if not os.path.exists(casp12testFilePath):
        download_file(casp12DatasetValidUrl, casp12testFilePath)
    if not os.path.exists(cb513testFilePath):
        download_file(cb513DatasetValidUrl, cb513testFilePath)
    if not os.path.exists(ts115testFilePath):
        download_file(ts115DatasetValidUrl, ts115testFilePath)

    if not os.path.exists(combinedtestFilePath):
        # combine all test dataset files into a single validation set
        combined_csv = pd.concat([pd.read_csv(f) for f in [casp12testFilePath, cb513testFilePath, ts115testFilePath]])
        # export to csv
        combined_csv.to_csv(os.path.join(datasetFolderPath, "Validation_HHblits.csv"),
                            index=False,
                            encoding='utf-8-sig')
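The notebook then invokes the function defined above to actually fetch the files; a one-line sketch:

downloadNetsurfpDataset()   # downloads the splits into dataset/ if they are missing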
Load the dataset into memory
Define the load_dataset function. Its inputs are the path of the dataset file and the maximum sequence length to keep for each entry of seqs, labels, and disorder.
The data is cleaned by stripping whitespace from the strings and replacing the rare amino acids [UZOB] in the input column with X.
It returns seqs, labels, and disorder.
def load_dataset(path, max_length):
    df = pd.read_csv(path, names=['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'], skiprows=1)

    # remove whitespace and map the rare/ambiguous amino acids U, Z, O, B to X
    df['input_fixed'] = ["".join(seq.split()) for seq in df['input']]
    df['input_fixed'] = [re.sub(r"[UZOB]", "X", seq) for seq in df['input_fixed']]
    # keep max_length-2 residues to leave room for the [CLS] and [SEP] tokens
    seqs = [list(seq)[:max_length-2] for seq in df['input_fixed']]

    df['label_fixed'] = ["".join(label.split()) for label in df['dssp3']]
    labels = [list(label)[:max_length-2] for label in df['label_fixed']]

    df['disorder_fixed'] = [" ".join(disorder.split()) for disorder in df['disorder']]
    disorder = [disorder.split()[:max_length-2] for disorder in df['disorder_fixed']]

    assert len(seqs) == len(labels) == len(disorder)
    return seqs, labels, disorder
Specify max_length and use the load_dataset function defined above to load the data.
max_length = 1024
train_seqs, train_labels, train_disorder = load_dataset('dataset/Train_HHblits.csv', max_length)
val_seqs, val_labels, val_disorder = load_dataset('dataset/Validation_HHblits.csv', max_length)
casp12_test_seqs, casp12_test_labels, casp12_test_disorder = load_dataset('dataset/CASP12_HHblits.csv', max_length)
cb513_test_seqs, cb513_test_labels, cb513_test_disorder = load_dataset('dataset/CB513_HHblits.csv', max_length)
ts115_test_seqs, ts115_test_labels, ts115_test_disorder = load_dataset('dataset/TS115_HHblits.csv', max_length)
print(train_seqs[0][10:30], train_labels[0][10:30], train_disorder[0][10:30], sep='\n')
Output:
['Q', 'I', 'S', 'F', 'V', 'K', 'S', 'H', 'F', 'S', 'R', 'Q', 'L', 'E', 'E', 'R', 'L', 'G', 'L', 'I']
['H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'C', 'E', 'E']
['1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0']
Tokenize the sequences
seq_tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=False)
train_seqs_encodings = seq_tokenizer(train_seqs, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
val_seqs_encodings = seq_tokenizer(val_seqs, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
casp12_test_seqs_encodings = seq_tokenizer(casp12_test_seqs, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
cb513_test_seqs_encodings = seq_tokenizer(cb513_test_seqs, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
ts115_test_seqs_encodings = seq_tokenizer(ts115_test_seqs, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True)
This step uses the BertTokenizerFast module. BertTokenizer provides the following commonly used methods:
- from_pretrained: initialize a tokenizer from a directory containing the vocabulary file (vocab.txt);
- tokenize: split text (a word or a sentence) into a list of subwords;
- convert_tokens_to_ids: convert a list of subwords into the list of their vocabulary indices;
- convert_ids_to_tokens: the inverse of the previous method;
- convert_tokens_to_string: join a subword list back into words or a sentence using the "##" markers;
- encode: for a single sentence, tokenize it, add the special tokens to form "[CLS], x, [SEP]", and convert the result to vocabulary indices; for two sentences (only the first two are used if more are given), form "[CLS], x1, [SEP], x2, [SEP]" and convert to indices;
- decode: turn the output of encode back into a full sentence.
For more details, see: https://www.csdn.net/tags/MtTaEg5sMzI0NTI1LWJsb2cO0O0O.html
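As a quick, hedged illustration of these methods on a toy protein sequence (the exact token ids depend on the ProtBert vocabulary):

example = " ".join(list("MKTAYIAK"))               # ProtBert expects space-separated residues
tokens = seq_tokenizer.tokenize(example)           # list of subwords, one per amino acid here
ids = seq_tokenizer.convert_tokens_to_ids(tokens)  # subwords -> vocabulary indices
encoded = seq_tokenizer.encode(example)            # adds [CLS] ... [SEP] and converts to indices
print(tokens)
print(ids)
print(seq_tokenizer.decode(encoded))               # back to a readable sequence with special tokens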
Tokenize the labels
# Consider each label as a tag for each token
unique_tags = set(tag for doc in train_labels for tag in doc)
unique_tags = sorted(list(unique_tags)) # make the order of the labels unchanged
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}
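For the three-state (SS3) task the tag set is small; printing the mapping should show the three DSSP3 states (the exact values shown below are an assumption based on the alphabetical ordering):

print(unique_tags)   # expected: ['C', 'E', 'H'] (coil, strand, helix)
print(tag2id)        # e.g. {'C': 0, 'E': 1, 'H': 2}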
Define the encode_tags function, which converts each per-residue tag to its id and aligns the labels with the tokenized output: positions that correspond to special tokens or padding receive the ignore label -100.
def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels
train_labels_encodings = encode_tags(train_labels, train_seqs_encodings)
val_labels_encodings = encode_tags(val_labels, val_seqs_encodings)
casp12_test_labels_encodings = encode_tags(casp12_test_labels, casp12_test_seqs_encodings)
cb513_test_labels_encodings = encode_tags(cb513_test_labels, cb513_test_seqs_encodings)
ts115_test_labels_encodings = encode_tags(ts115_test_labels, ts115_test_seqs_encodings)
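To make the offset logic concrete, here is a small, hypothetical check: with is_split_into_words=True every single-residue word gets offset (0, 1), while [CLS], [SEP], and padding get (0, 0) and therefore keep the -100 label.

toy_enc = seq_tokenizer([list("MKT")], is_split_into_words=True,
                        return_offsets_mapping=True, padding=True)
print(toy_enc.offset_mapping[0])            # e.g. [(0, 0), (0, 1), (0, 1), (0, 1), (0, 0)]
print(encode_tags([list("HHC")], toy_enc))  # labels aligned to tokens, -100 on [CLS]/[SEP]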
Mask disorder tokens
Masking is a common operation in deep learning. In short, it lays a mask over the original tensor so that specific elements are hidden or selected, which is why it is often used to build filters over tensors. Here, residues whose disorder value is 0.0 are assigned the ignore label -100 so that they do not contribute to the loss.
def mask_disorder(labels, masks):
    for label, mask in zip(labels, masks):
        for i, disorder in enumerate(mask):
            if disorder == "0.0":
                # shift by one because of the CLS token at index 0
                label[i+1] = -100
mask_disorder(train_labels_encodings, train_disorder)
mask_disorder(val_labels_encodings, val_disorder)
mask_disorder(casp12_test_labels_encodings, casp12_test_disorder)
mask_disorder(cb513_test_labels_encodings, cb513_test_disorder)
mask_disorder(ts115_test_labels_encodings, ts115_test_disorder)
Create the SS3 dataset
Define the SS3Dataset class
It has three methods: the initializer, __getitem__, and __len__.
class SS3Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
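Before wrapping the encodings in SS3Dataset, the offset_mapping entries (which were only needed by encode_tags) have to be removed, otherwise they would be passed to the model as an unexpected input. A minimal sketch, assuming the encoding variables defined above:

# offset_mapping is not a model input, so drop it from every encoding
for enc in (train_seqs_encodings, val_seqs_encodings, casp12_test_seqs_encodings,
            cb513_test_seqs_encodings, ts115_test_seqs_encodings):
    _ = enc.pop("offset_mapping")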
Use the class defined above to build the datasets.
train_dataset = SS3Dataset(train_seqs_encodings, train_labels_encodings)
val_dataset = SS3Dataset(val_seqs_encodings, val_labels_encodings)
casp12_test_dataset = SS3Dataset(casp12_test_seqs_encodings, casp12_test_labels_encodings)
cb513_test_dataset = SS3Dataset(cb513_test_seqs_encodings, cb513_test_labels_encodings)
ts115_test_dataset = SS3Dataset(ts115_test_seqs_encodings, ts115_test_labels_encodings)
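A quick sanity check on the wrapped data (a sketch; the key names follow the BERT tokenizer output):

sample = train_dataset[0]
print(len(train_dataset))                        # number of training sequences
print({k: v.shape for k, v in sample.items()})   # input_ids, token_type_ids, attention_mask, labels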
Define the evaluation metrics
Two functions are defined here: one aligns the predictions with the labels, the other computes the metrics.
align_predictions does not compute a loss itself; it only reads CrossEntropyLoss().ignore_index (-100) so that masked and padded positions are skipped when collecting predictions and labels.
Cross-entropy measures how close the actual output is to the expected output; during training this difference is backpropagated to update the network parameters.
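For reference, the token-level cross-entropy minimized by the classification head can be written as follows, where $\mathcal{V}$ is the set of positions whose label is not the ignore index $-100$:

$$\mathcal{L} = -\frac{1}{|\mathcal{V}|} \sum_{i \in \mathcal{V}} \log p_\theta\bigl(y_i \mid x\bigr)$$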
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            # skip positions marked with the ignore index (-100)
            if label_ids[i, j] != torch.nn.CrossEntropyLoss().ignore_index:
                out_label_list[i].append(id2tag[label_ids[i][j]])
                preds_list[i].append(id2tag[preds[i][j]])

    return preds_list, out_label_list

def compute_metrics(p: EvalPrediction):
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "accuracy": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
Create the model
def model_init():
    return AutoModelForTokenClassification.from_pretrained(model_name,
                                                           num_labels=len(unique_tags),
                                                           id2label=id2tag,
                                                           label2id=tag2id,
                                                           gradient_checkpointing=False)
Define the training arguments and start training
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for the learning rate scheduler
    learning_rate=3e-05,             # learning rate
    weight_decay=0.0,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=200,               # how often to log
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=32,  # accumulate gradients over 32 steps before each update (effective batch size 32)
    fp16=True,                       # use mixed precision
    fp16_opt_level="O2",             # mixed precision mode (Apex opt level "O2", not "02")
    run_name="ProBert-BFD-SS3",      # experiment name
    seed=3,                          # seed for experiment reproducibility
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)

trainer = Trainer(
    model_init=model_init,            # the instantiated 🤗 Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # evaluation metrics
)

trainer.train()
Run prediction and evaluate
predictions, label_ids, metrics = trainer.predict(casp12_test_dataset)
metrics
Output:
{'eval_accuracy': 0.7556149732620321,
 'eval_f1': 0.5668971126474176,
 'eval_loss': 0.5859289169311523,
 'eval_precision': 0.584241408214585,
 'eval_recall': 0.5505529225908373,
 'eval_runtime': 2.3255,
 'eval_samples_per_second': 8.6}
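The same call can be repeated for the other held-out sets (a sketch; the exact metric key names depend on the transformers version):

for name, ds in [("CB513", cb513_test_dataset), ("TS115", ts115_test_dataset)]:
    _, _, test_metrics = trainer.predict(ds)
    print(name, test_metrics)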
Finally, save the model.
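A minimal sketch of that final step, using the standard Trainer and tokenizer APIs (the output directory name is an assumption):

save_dir = "./prot_bert_bfd_ss3"          # hypothetical output directory
trainer.save_model(save_dir)              # saves the fine-tuned weights and config
seq_tokenizer.save_pretrained(save_dir)   # keep the tokenizer next to the model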