1. Load the tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
print(tokenizer)
RobertaTokenizerFast(name_or_path='distilroberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})
2. Try out batch encoding
tokenizer.batch_encode_plus([
    'hide new secretions from the parental units',
    'this moive is great'])
{'input_ids': [[0, 37265, 92, 3556, 2485, 31, 5, 20536, 2833, 2], [0, 9226, 7458, 2088, 16, 372, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
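The ids round-trip back to text, special tokens included; a quick check (the commented result is what we expect, not transcript output):

tokenizer.decode([0, 9226, 7458, 2088, 16, 372, 2])  # expected: '<s>this moive is great</s>'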
3. Load the dataset
from datasets import load_dataset, load_from_disk

dataset = load_from_disk('../data/glue_sst2/')
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
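If the local copy under ../data/glue_sst2/ is missing, the same splits can be fetched from the Hub (which is why load_dataset is imported above); a sketch, assuming network access:

dataset = load_dataset('glue', 'sst2')
dataset.save_to_disk('../data/glue_sst2/')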
4. Batch-encode the sentence field of each example
def f(data, tokenizer):
    return tokenizer.batch_encode_plus(data['sentence'])

# encode 1000 examples per batch across 12 worker processes,
# dropping the raw columns once they are encoded
dataset = dataset.map(f,
                      batched=True,
                      batch_size=1000,
                      num_proc=12,
                      remove_columns=['sentence', 'idx', 'label'],
                      fn_kwargs={'tokenizer': tokenizer})
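Only the encoder outputs survive the map; a quick check (the commented result is our expectation):

dataset['train'].column_names  # expected: ['input_ids', 'attention_mask']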
5. Filter out short sentences
def f(data):
    # keep only examples with at least 9 tokens, so a fixed 9-token window can be cut below
    return [len(i) >= 9 for i in data['input_ids']]

dataset = dataset.filter(f, batched=True, batch_size=1000, num_proc=12)
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 44279
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 861
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1776
    })
})
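Every surviving example now has at least 9 tokens; a quick sanity check (the commented value is our expectation):

min(len(x) for x in dataset['train']['input_ids'])  # expected: 9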
tokenizer.get_vocab()['<mask>']
50264
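The same id is also exposed as tokenizer.mask_token_id; this is the value written into position 4 in the next step.

tokenizer.mask_token_id  # 50264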
6. Truncate the sentences and arrange them into the format the model expects
def f(data):
    b = len(data['input_ids'])
    data['labels'] = data['attention_mask'].copy()
    for i in range(b):
        # cut a fixed window of 9 tokens
        data['input_ids'][i] = data['input_ids'][i][:9]
        data['attention_mask'][i] = [1] * 9
        data['labels'][i] = [-100] * 9
        # force the window to end with </s> (id 2)
        data['input_ids'][i][-1] = 2
        # remember the original token at position 4, then overwrite it with <mask> (id 50264)
        data['labels'][i][4] = data['input_ids'][i][4]
        data['input_ids'][i][4] = 50264
    return data

dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=12)
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44279
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 861
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1776
    })
})
dataset['train'][0]
{'input_ids': [0, 37265, 92, 3556, 50264, 31, 5, 20536, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, -100, -100, -100, 2485, -100, -100, -100, -100]}
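Decoding the first training example makes the construction easy to see; the decoded string below is our expectation, not transcript output:

tokenizer.decode(dataset['train'][0]['input_ids'])
# expected: roughly '<s>hide new secret<mask> from the parental</s>'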
7. Data loader
import torch
from transformers.data.data_collator import default_data_collator

loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=default_data_collator,
    shuffle=True,
    drop_last=True)

for data in loader:
    break

len(loader), data
(5534,
 {'input_ids': tensor([[0, 1264, 9, 475, 50264, 4, 1855, 873, 2],
          [0, 8155, 34, 1348, 50264, 888, 2609, 5, 2],
          ...
          [0, 10859, 2156, 1537, 50264, 16016, 66, 5, 2],
          [0, 19746, 47, 619, 50264, 47, 393, 236, 2]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1],
          ...
          [1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1]]),
  'labels': tensor([[-100, -100, -100, -100, 338, -100, -100, -100, -100],
          [-100, -100, -100, -100, 40350, -100, -100, -100, -100],
          ...
          [-100, -100, -100, -100, 1472, -100, -100, -100, -100],
          [-100, -100, -100, -100, 14, -100, -100, -100, -100]])})
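default_data_collator can stack the python lists into tensors only because every example was fixed to exactly 9 tokens above; with 44279 training examples, batch_size=8 and drop_last=True the loader holds 44279 // 8 = 5534 batches, so one pass over it is one epoch. A quick shape check:

data['input_ids'].shape  # torch.Size([8, 9])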
8. Define the downstream-task model
from transformers import RobertaModel, AutoModelForCausalLM

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = RobertaModel.from_pretrained('distilroberta-base')
        decoder = torch.nn.Linear(768, tokenizer.vocab_size)
        decoder.bias = torch.nn.Parameter(torch.zeros(tokenizer.vocab_size))
        # rebuild the RoBERTa LM head: dense -> GELU -> LayerNorm -> decoder
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.GELU(),
            torch.nn.LayerNorm(768, eps=1e-5),
            decoder)
        # copy the pretrained LM-head weights into the rebuilt head
        parameters = AutoModelForCausalLM.from_pretrained('distilroberta-base')
        self.fc[0].load_state_dict(parameters.lm_head.dense.state_dict())
        self.fc[2].load_state_dict(parameters.lm_head.layer_norm.state_dict())
        self.fc[3].load_state_dict(parameters.lm_head.decoder.state_dict())
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids, attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            # shift so the logits at position i are scored against the label at position i+1
            shifted_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)
            shifted_labels = labels[:, 1:].reshape(-1)
            loss = self.criterion(shifted_logits, shifted_labels)
        return {'loss': loss, 'logits': logits}

model = Model()
print(sum(i.numel() for i in model.parameters()))
121364313

out = model(**data)
out['loss'], out['logits'].shape
(tensor(19.5734, grad_fn=<NllLossBackward0>), torch.Size([8, 9, 50265]))
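The head rebuilt in __init__ mirrors RoBERTa's LM head (dense -> GELU -> LayerNorm -> decoder). An alternative starting point, not what this chapter does, would be to load the ready-made masked-LM model; a sketch:

from transformers import AutoModelForMaskedLM
mlm = AutoModelForMaskedLM.from_pretrained('distilroberta-base')
mlm(input_ids=data['input_ids'], attention_mask=data['attention_mask']).logits.shape
# expected: torch.Size([8, 9, 50265])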
9. Test 1
def test(model):
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=8,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True)
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # keep the true token at the masked position, then drop the labels
        label = data['labels'][:, 4].clone()
        data['labels'] = None
        with torch.no_grad():
            out = model(**data)
        # prediction = argmax over the vocabulary at the masked position
        out = out['logits'].argmax(dim=2)[:, 4]
        correct += (label == out).sum().item()
        total += 8
        if i % 10 == 0:
            print(i)
            print(label)
            print(out)
        if i == 50:
            break
    print('accuracy: ', correct / total)
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()

test(model)
0
tensor([822, 2789, 11783, 408, 9, 241, 1073, 12])
tensor([1816, 13536, 30609, 13670, 396, 241, 23250, 12])
...
50
tensor([5, 29, 480, 10238, 110, 9, 10, 2156])
tensor([664, 29, 4338, 21844, 239, 9, 98, 878])
accuracy: 0.35294117647058826
<s>the talents of<mask> actors helps ``</s>
the young
<s>when it '<mask> all wet,</s>
s s
<s>wait for video<mask> and then do</s>
-- footage
<s>a tender and <mask> drama, based</s>
touching heartfelt
<s>if you pitch<mask> expectations at an</s>
your high
<s>a gentle blend<mask> present day testim</s>
of of
<s>theirs is <mask> simple and heart</s>
a so
<s>morton is <mask> as usual,</s>
, running
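Even before fine-tuning, the pretrained weights recover about 35% of the masked tokens; the training below is what lifts this (to roughly 51% in test 2).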
10. Training
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)

from transformers import AdamW
from transformers.optimization import get_scheduler
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)
    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out['loss']
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            # accuracy at the masked position within the current batch
            label = data['labels'][:, 4].to(device)
            out = out['logits'].argmax(dim=2)[:, 4]
            correct = (label == out).sum().item()
            accuracy = correct / 8
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)

train()
0 18.58106231689453 0.25 1.9996385977593064e-05
50 6.724194049835205 0.125 1.9815684857246115e-05
...
5450 2.665070056915283 0.375 2.9996385977593064e-07
5500 1.7032554149627686 0.625 1.1926273942898448e-07
11. Save the model
torch.save(model, '../data/预测中间词.model')
12. Load the model
model2 = torch.load('../data/预测中间词.model', map_location='cpu')
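torch.save was given the whole module, so torch.load must be able to import the Model class (and recent PyTorch versions default to weights_only loading, which rejects pickled modules). A more portable sketch, using a hypothetical .params path:

torch.save(model.state_dict(), '../data/预测中间词.params')  # hypothetical path
model2 = Model()
model2.load_state_dict(torch.load('../data/预测中间词.params', map_location='cpu'))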
13. Test 2
test( model2)
0
tensor( [ 5712 , 2156 , 409 , 6670 , 576 , 189 , 23 , 295 ] )
tensor( [ 5712 , 2156 , 409 , 34 , 576 , 189 , 31 , 295 ] )
. .
50
tensor( [ 3486 , 143 , 19 , 10713 , 5 , 5 , 668 , 32894 ] )
tensor( [ 3486 , 41 , 19 , 10713 , 10 , 44009 , 668 , 32894 ] )
accuracy: 0.5098039215686274
< s> neither funny< mask> suspenseful nor< / s>
nor nor
< s> and forget about< mask> attempt at a< / s>
any an
< s> a film made< mask> as little wit< / s>
with with
< s> here, ad< mask> lyne comes< / s>
rian rian
< s> the fact that< mask> rookie is a< / s>
the a
< s> the scope of< mask> silberstein< / s>
the david
< s> reign of< mask> may be little< / s>
fire fire
< s> a solidly seaw< mask> chiller. < / s>
orthy orthy
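To query the trained model with new text, build the same 9-token window with <mask> at position 4; a minimal sketch, using a hypothetical example sentence:

# hypothetical sentence; truncate to 9 tokens and mask position 4, as in training
input_ids = tokenizer('the acting was good in this movie overall',
                      return_tensors='pt')['input_ids'][:, :9].clone()
input_ids[0, -1] = tokenizer.eos_token_id
original = input_ids[0, 4].item()
input_ids[0, 4] = tokenizer.mask_token_id
with torch.no_grad():
    out = model2(input_ids=input_ids, attention_mask=torch.ones_like(input_ids))
pred = out['logits'].argmax(dim=2)[0, 4].item()
print(tokenizer.decode([original]), '->', tokenizer.decode([pred]))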