Import third-party libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import transformers
import random
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

import datetime

def printbar():
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
Data preprocessing
def read_imdb_split(split_dir):
    # read a two-column CSV of (label, text) pairs
    split_data = pd.read_csv(split_dir, encoding='utf-8')
    split_data.columns = ['label', 'text']
    texts = list(split_data.text)
    labels = list(split_data.label)
    return texts, labels

all_texts, all_labels = read_imdb_split('expri.csv')
len(all_texts)
125
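A quick sanity check on what was loaded (assuming the two-column label/text layout read_imdb_split expects, with binary sentiment labels):

print(all_texts[0][:80])           # first 80 characters of the first text
print(set(all_labels))             # expected: {0, 1}
print(len(all_texts), len(all_labels))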
Data analysis
df = pd.read_csv('expri.csv', encoding='utf-8')
plt.hist(df['text'].apply(lambda x: min(len(x.split()), 1000)), bins=20)
plt.ylabel("Number of texts")
plt.xlabel("Word count")
print(f"average word count: {np.mean(df['text'].apply(lambda x: len(x.split())))}")
average word count: 214.544
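A percentile view of the word counts is a quick way to sanity-check the max_length used for tokenization later; this is an optional diagnostic, not part of the original pipeline:

lengths = df['text'].apply(lambda x: len(x.split()))
print(lengths.quantile([0.5, 0.9, 0.95, 0.99]))   # median and tail lengths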
Data processing
random_seed = 0
numpy_seed = 1
torch_seed = 2
cuda_seed = 3
random.seed(random_seed)
np.random.seed(numpy_seed)
torch.manual_seed(torch_seed)
torch.cuda.manual_seed_all(cuda_seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=.2, random_state=random_seed)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts, train_labels, test_size=.3, random_state=random_seed)
len(train_texts), len(test_texts), len(val_texts)
(70, 30, 25)
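With only 125 examples, an optional stratified variant keeps the class balance identical across splits; a sketch (the *_s names are illustrative, and the run above does not use it):

train_texts_s, val_texts_s, train_labels_s, val_labels_s = train_test_split(
    all_texts, all_labels, test_size=.2, random_state=random_seed,
    stratify=all_labels)   # preserve the label proportions in both splits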
Model building
model_path = 'D://Model//bert-base-cased'
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(model_path)
maxlen = 20
train_encodings = tokenizer(train_texts,
                            padding='max_length',
                            truncation=True,
                            max_length=maxlen,
                            return_tensors='pt')
val_encodings = tokenizer(val_texts,
                          padding='max_length',
                          truncation=True,
                          max_length=maxlen,
                          return_tensors='pt')
test_encodings = tokenizer(test_texts,
                           padding='max_length',
                           truncation=True,
                           max_length=maxlen,
                           return_tensors='pt')
all_encodings = tokenizer(all_texts,
                          padding='max_length',
                          truncation=True,
                          max_length=maxlen,
                          return_tensors='pt')
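Each call returns a BatchEncoding of stacked tensors; because of padding='max_length', every tensor is (num_examples, maxlen). A quick shape check:

print(train_encodings.keys())              # input_ids, token_type_ids, attention_mask
print(train_encodings['input_ids'].shape)  # expected: torch.Size([70, 20])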
import gc
gc.collect()
4078
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # the encodings were built with return_tensors='pt', so each value is
        # already a tensor; indexing directly avoids the torch.tensor() copy warning
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
all_dataset = IMDbDataset(all_encodings, all_labels)
len(train_labels), len(val_labels), len(test_labels)
(70, 25, 30)
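Each dataset item is a dict of tensors ready to be unpacked into the model's forward pass; for example:

sample = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample.items()})   # labels is a 0-dim tensor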
from torch.utils.data import DataLoader

train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True
)
val_data_loader = DataLoader(
    dataset=val_dataset,
    batch_size=4,
    shuffle=True
)
test_data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=4,
    shuffle=True
)
all_data_loader = DataLoader(
    dataset=all_dataset,
    batch_size=4,
    shuffle=True
)
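Pulling one batch confirms that the loader adds a leading batch dimension (sample_batch is an illustrative name):

sample_batch = next(iter(train_data_loader))
print({k: tuple(v.shape) for k, v in sample_batch.items()})
# expected: input_ids/token_type_ids/attention_mask (4, 20), labels (4,)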
Model training
from transformers import BertForSequenceClassification
model_path
'D://Model//bert-base-cased'
model = BertForSequenceClassification.from_pretrained(model_path)
Some weights of the model checkpoint at D://Model//bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at D://Model//bert-base-cased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.weight', 'classifier.bias', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
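The warning above is expected: the checkpoint carries a pre-training (MLM) head whose weights are discarded, while the pooler and classification head are freshly initialized, which is why the model is fine-tuned below. A variant that spells out the label count (num_labels=2 is already the default; model_explicit is an illustrative name):

model_explicit = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)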
import gc
gc.collect()
469
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
metric_func = accuracy_score
metric_name = "accuracy"
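Note that lr=0.01 is several orders of magnitude above the 2e-5 to 5e-5 range the BERT authors recommend for fine-tuning. A hedged alternative (optimizer_alt is an illustrative name, not used in the run below):

optimizer_alt = torch.optim.AdamW(model.parameters(), lr=2e-5)  # typical BERT fine-tuning rate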
def train_epoch(tmp_model, dataloader, tmp_op, epoch):
    tmp_model.train()
    tmp_model.zero_grad()
    total_loss = 0.0
    total_metric = 0.0
    preds = []
    labels = []
    for step, batch in enumerate(dataloader, 1):
        tmp_op.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label = batch['labels'].to(device)
        # the model computes CrossEntropyLoss internally when labels are passed
        outputs = tmp_model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs[0]
        logits = outputs[1]
        total_loss += float(loss.item())
        pred = torch.argmax(logits, dim=-1)
        preds.extend(pred.cpu().tolist())
        labels.extend(batch['labels'].cpu().tolist())
        metric = accuracy_score(labels, preds)
        total_metric += float(metric)
        loss.backward()   # backpropagate before the optimizer step
        tmp_op.step()
    return total_loss / step, total_metric / step
def evaluate(tmp_model, dataloader, split, post_evaluate_hook=None):
    preds = []
    labels = []
    tmp_model.eval()
    val_total_loss = 0.0
    val_total_metric = 0.0
    with torch.no_grad():
        for step, batch in enumerate(dataloader, 1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            label = batch['labels'].to(device)
            outputs = tmp_model(input_ids, attention_mask=attention_mask, labels=label)
            loss = outputs[0]
            logits = outputs[1]
            pred = torch.argmax(logits, dim=-1)
            preds.extend(pred.cpu().tolist())
            labels.extend(batch['labels'].cpu().tolist())
            val_total_loss += loss.item()
            metric = accuracy_score(labels, preds)
            val_total_metric += float(metric)
    tmp_model.train()
    return val_total_loss / step, val_total_metric / step
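The same routine applies to the untouched test split; a hypothetical convenience wrapper (call it after training, once the model is on device):

def evaluate_test(tmp_model):
    # reuse evaluate() on the test loader defined above
    return evaluate(tmp_model, test_data_loader, "Test")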
model.device
model = model.to(device)
model.device
model_path
epochs = 10
dfhistory = pd.DataFrame(columns=["epoch", "loss", metric_name, "val_loss", "val_" + metric_name])
print("Start Training...")
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("==========" * 8 + "%s" % nowtime)

for epoch in range(1, epochs + 1):
    train_loss, train_metric = train_epoch(model, train_data_loader, optimizer, epoch)
    val_loss, val_metric = evaluate(model, val_data_loader, "Valid")
    info = (epoch, train_loss, train_metric, val_loss, val_metric)
    dfhistory.loc[epoch - 1] = info
    print(("\nEPOCH = %d, loss = %.4f, " + metric_name +
           " = %.4f, val_loss = %.4f, " + "val_" + metric_name + " = %.4f")
          % info)
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)

print('Finished Training...')
dfhistory
Model evaluation
dfhistory
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
def plot_metric(dfhistory, metric):
    train_metrics = dfhistory[metric]
    val_metrics = dfhistory['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
    plt.show()

plot_metric(dfhistory, "loss")
plot_metric(dfhistory, "accuracy")
Save the model
torch.save(model.state_dict(), './save/sst2_teacher_model.pt')
model.config
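Reloading the saved teacher later takes two steps: rebuild the architecture, then restore the fine-tuned weights. A minimal sketch (reloaded is an illustrative name):

reloaded = BertForSequenceClassification.from_pretrained(model_path)
reloaded.load_state_dict(torch.load('./save/sst2_teacher_model.pt', map_location=device))
reloaded = reloaded.to(device).eval()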
Model distillation
import textbrewer
from textbrewer import GeneralDistiller
from textbrewer import TrainingConfig, DistillationConfig
from transformers import BertForSequenceClassification, BertConfig, AdamW, BertTokenizer
from transformers import get_linear_schedule_with_warmup

bert_config_T3 = BertConfig.from_json_file('./student_config/bert_base_cased_config/bert_config_L3.json')
bert_config_T3.output_hidden_states = True
student_model = BertForSequenceClassification(bert_config_T3)
student_model.to(device=device)

bert_config = BertConfig.from_json_file('./student_config/bert_base_cased_config/bert_config.json')
bert_config.output_hidden_states = True
teacher_model = BertForSequenceClassification(bert_config)
teacher_model.load_state_dict(torch.load('./save/sst2_teacher_model.pt'))
teacher_model.to(device=device)
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1)-(11): 11 more BertLayer blocks identical to (0) (output truncated)
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)
num_epochs = 3
num_training_steps = len(train_data_loader) * num_epochs
optimizer = AdamW(student_model.parameters(), lr=1e-4)
scheduler_class = get_linear_schedule_with_warmup
scheduler_args = {'num_warmup_steps': int(0.1 * num_training_steps), 'num_training_steps': num_training_steps}
def simple_adaptor(batch, model_outputs):
    return {'logits': model_outputs.logits, 'hidden': model_outputs.hidden_states}
distill_config = DistillationConfig(
    intermediate_matches=[
        {'layer_T': 0, 'layer_S': 0, 'feature': 'hidden', 'loss': 'hidden_mse', 'weight': 1},
        {'layer_T': 8, 'layer_S': 2, 'feature': 'hidden', 'loss': 'hidden_mse', 'weight': 1}])
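The intermediate_matches entries align the teacher's hidden states at layers 0 and 8 with the student's layers 0 and 2 using an MSE loss, in addition to the default logit distillation. DistillationConfig also exposes the logit-distillation knobs; a hedged variant with illustrative values not taken from the original run:

distill_config_alt = DistillationConfig(
    temperature=8,          # softens the teacher's logits (TextBrewer's default is 4)
    hard_label_weight=0,    # weight of the ground-truth CE loss
    kd_loss_type='ce',      # soft cross-entropy between teacher and student logits
    intermediate_matches=[
        {'layer_T': 0, 'layer_S': 0, 'feature': 'hidden', 'loss': 'hidden_mse', 'weight': 1},
        {'layer_T': 8, 'layer_S': 2, 'feature': 'hidden', 'loss': 'hidden_mse', 'weight': 1}])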
train_config = TrainingConfig()
distiller = GeneralDistiller(
    train_config=train_config, distill_config=distill_config,
    model_T=teacher_model, model_S=student_model,
    adaptor_T=simple_adaptor, adaptor_S=simple_adaptor)

with distiller:
    distiller.train(optimizer, train_data_loader, num_epochs, scheduler_class=scheduler_class, scheduler_args=scheduler_args, callback=None)
test_model = BertForSequenceClassification(bert_config_T3)
test_model.load_state_dict(torch.load('./saved_models/gs54.pkl'))
<All keys matched successfully>
test_model
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1)-(2): 2 more BertLayer blocks identical to (0) (output truncated)
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)
test_model.eval()
for batch in val_data_loader:
    # shallow copy; tensors stay on the CPU, matching test_model's device
    batch = {k: v for k, v in batch.items()}
    with torch.no_grad():
        outputs = test_model(**batch)
# outputs now holds only the loader's final batch (a single example here,
# since 25 validation samples with batch_size 4 leave a remainder of 1)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
predictions
tensor([1])
logits = outputs.logits
logits
tensor([[0.0050, 0.3522]])
torch.argmax(logits, dim=-1)
tensor([1])