1. Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
print(tokenizer)
DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
2. Batch encoding
tokenizer.batch_encode_plus([
    'hello, everyone, today is a good day',
    'how are you , fine thank you , and you?'])
{'input_ids': [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
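To see what those ids encode, a quick check (not part of the original run) is to decode the first sequence; the [CLS] and [SEP] special tokens are added by the tokenizer automatically:

ids = tokenizer.batch_encode_plus(['hello, everyone, today is a good day'])['input_ids'][0]
print(tokenizer.decode(ids))
# -> '[CLS] hello, everyone, today is a good day [SEP]'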
3. Loading the dataset
from datasets import load_dataset

dataset = load_dataset(path='glue', name='cola')
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})
dataset['train'][0]
{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}
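CoLA is a binary acceptability task: label 1 marks a grammatically acceptable sentence and 0 an unacceptable one. A quick look at the label feature and class balance (an illustrative check, not in the original notebook):

from collections import Counter
print(dataset['train'].features['label'])   # ClassLabel with names ['unacceptable', 'acceptable']
print(Counter(dataset['train']['label']))   # roughly 70% of training sentences are labeled 1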
4. Dataset preprocessing: sentence -> input_ids
def f(examples, tokenizer):
    return tokenizer.batch_encode_plus(examples['sentence'], truncation=True)

dataset = dataset.map(f,
                      batched=True,
                      batch_size=1000,
                      num_proc=1,
                      remove_columns=['sentence', 'idx'],
                      fn_kwargs={'tokenizer': tokenizer})
print(dataset['train'][0])
{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
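Note that truncation is applied here but padding is not, so the tokenized examples still have different lengths; padding is deferred to the collator in the next step so each batch is only padded as far as it needs to be. A rough way to see the spread of lengths (illustrative):

lengths = [len(x) for x in dataset['train']['input_ids']]
print(min(lengths), max(lengths))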
5. Data loader
import torch
from transformers.data.data_collator import DataCollatorWithPadding

loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=DataCollatorWithPadding(tokenizer),
    shuffle=True,
    drop_last=True)

for data in loader:
    break
data
{'input_ids': tensor([[101, 1996, 2062, 4180, 2098, 2057, 14688, 2000, 2022, 1010,
                       1996, 2062, 2057, 3473, 4854, 2012, 1996, 7435, 1012, 102],
                      [101, 2040, 17749, 2073, 2057, 4149, 2054, 1029, 102, 0,
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [101, 18500, 2245, 2008, 2002, 2001, 1996, 3159, 1997, 4768,
                       1012, 102, 0, 0, 0, 0, 0, 0, 0, 0],
                      [101, 1999, 1996, 4020, 2045, 28374, 1037, 2543, 1012, 102,
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [101, 2198, 14140, 2070, 2769, 1999, 1996, 2924, 2006, 5958,
                       1012, 102, 0, 0, 0, 0, 0, 0, 0, 0],
                      [101, 2320, 9965, 2187, 1010, 5965, 2150, 2035, 1996, 13675,
                       16103, 2121, 1012, 102, 0, 0, 0, 0, 0, 0],
                      [101, 2023, 2311, 2288, 12283, 1998, 12283, 1012, 102, 0,
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [101, 3389, 2097, 2196, 2681, 1012, 102, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1])}
len(loader)
1068
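With batch_size=8 and drop_last=True, the 8551 training rows give 8551 // 8 = 1068 batches, matching the length above. DataCollatorWithPadding pads each batch only to the longest sequence in that batch (hence the trailing zeros in input_ids and attention_mask) and renames the label column to labels.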
6. Loading the pretrained model parameters
from transformers import AutoModelForSequenceClassification, DistilBertModel

parameters = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
parameters
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (1-5): 5 x TransformerBlock(...)  identical to block (0); repeated output omitted
      )
    )
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
7. Defining the downstream-task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # pretrained DistilBERT backbone
        self.pretrained = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # two-layer classification head applied to the [CLS] representation
        self.fc = torch.nn.Sequential(torch.nn.Linear(768, 768),
                                      torch.nn.ReLU(),
                                      torch.nn.Dropout(p=0.2),
                                      torch.nn.Linear(768, 2))
        # initialize the head from the reference sequence-classification model
        parameters = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
        self.fc[0].load_state_dict(parameters.pre_classifier.state_dict())
        self.fc[3].load_state_dict(parameters.classifier.state_dict())
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state[:, 0]   # [CLS] token representation
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)
        return {'loss': loss, 'logits': logits}

model = Model()
print(sum(i.numel() for i in model.parameters()))
66955010
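A rough breakdown of that count (an illustrative check, not in the original notebook): the DistilBERT backbone accounts for about 66.4M parameters and the two-layer head for about 0.6M.

backbone = sum(p.numel() for p in model.pretrained.parameters())
head = sum(p.numel() for p in model.fc.parameters())
print(backbone, head, backbone + head)   # 66362880 + 592130 = 66955010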
out = model(**data)
out['loss'], out['logits'].shape
(tensor(0.6949, grad_fn=<NllLossBackward0>), torch.Size([8, 2]))
dataset['test'][0]
{'label': -1,
 'input_ids': [101, 3021, 26265, 2627, 1996, 2160, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
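The GLUE test labels are hidden, which is why every test example carries label -1; evaluation below therefore uses the validation split.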
dataset['validation'][0]
{'label': 1,
 'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
8. Test 1 (before fine-tuning)
def test(model):
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['validation'],
        batch_size=16,
        collate_fn=DataCollatorWithPadding(tokenizer),
        shuffle=True,
        drop_last=True)
    outs = []
    labels = []
    for i, data in enumerate(loader_test):
        with torch.no_grad():
            out = model(**data)
        outs.append(out['logits'].argmax(dim=1))
        labels.append(data['labels'])
        if i % 10 == 0:
            print(i)
        if i == 50:
            break
    outs = torch.cat(outs)
    labels = torch.cat(labels)
    accuracy = (outs == labels).sum().item() / len(labels)
    print('accuracy: ', accuracy)

test(model)
0
10
20
30
40
50
accuracy: 0.5502450980392157
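About 55% accuracy before fine-tuning is unsurprising: the sequence-classification checkpoint loaded in section 7 has no trained classification head, so pre_classifier and classifier start from random initialization.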
9. Training
from transformers import AdamW  # deprecated in recent transformers releases; torch.optim.AdamW is the drop-in replacement
from transformers.optimization import get_scheduler

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
def train():
    optimizer = AdamW(model.parameters(), betas=(0.9, 0.999), eps=1e-8, lr=2e-5)
    # linear decay from 2e-5 to 0 over one epoch (len(loader) steps), no warmup
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)
    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels)
        loss = out['loss']
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            out = out['logits'].argmax(dim=1)
            accuracy = (labels == out).sum().item() / 8
            print(i, loss.item(), lr, accuracy)
    print()

train()
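After training it can be useful to persist the fine-tuned weights; a minimal sketch (the file name is illustrative):

torch.save(model.state_dict(), 'distilbert_cola.pt')
# to reload later: model = Model(); model.load_state_dict(torch.load('distilbert_cola.pt'))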
10. Test 2 (after fine-tuning)
test(model.to('cpu'))
0
10
20
30
40
50
accuracy: 0.7781862745098039
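Finally, an end-to-end inference sketch with the fine-tuned model (the example sentence is illustrative; 0 = unacceptable, 1 = acceptable):

enc = tokenizer('the book was written by john.', return_tensors='pt')
with torch.no_grad():
    pred = model(input_ids=enc['input_ids'],
                 attention_mask=enc['attention_mask'])['logits'].argmax(dim=1)
print(pred.item())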