import os

for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
/kaggle/lib/kaggle/gcp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/dataset_info.json
/kaggle/input/chnsenticorp/ChnSentiCorp/ChnSentiCorp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-train.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-test.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-validation.arrow
/kaggle/working/__notebook_source__.ipynb
'''
Binary text classification
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import torch
1. Define the dataset
class Dataset(Dataset):  # subclasses torch.utils.data.Dataset imported above
    def __init__(self, split):
        self.dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset[item]['text']
        label = self.dataset[item]['label']
        return text, label
dataset = Dataset('train')
len(dataset), dataset[0]
'''
(9600,
('选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。
酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。
包的早餐是西式的,还算丰富。 服务吗,一般',
1))
'''
2. Load the tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
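As a quick sanity check (a hedged sketch, not part of the original notebook), the tokenizer can be called on a single sentence; with the standard Hugging Face API it returns input_ids, token_type_ids, and attention_mask, and decode maps ids back to text:

# Hedged sketch: encode one sentence from the dataset sample above and decode it back.
encoded = tokenizer('酒店装修一般,但还算整洁。', max_length=20,
                    padding='max_length', truncation=True)
print(encoded['input_ids'])                    # token ids, starting with [CLS], ending with [SEP]/[PAD]
print(tokenizer.decode(encoded['input_ids']))  # round-trip back to text, special tokens included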
3. Define the collate function
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # Tokenize the whole batch, padding/truncating every sample to 500 tokens
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       max_length=500,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors='pt',
                                       return_length=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels
4. DataLoader
loader = DataLoader(dataset=dataset,
                    batch_size=16,
                    collate_fn=collate_fn,
                    shuffle=True,
                    drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
'''
600
(torch.Size([16, 500]),
torch.Size([16, 500]),
torch.Size([16, 500]),
tensor([1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1]))
'''
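Decoding one batch item (a hedged check, not in the original output) confirms that padding and truncation behave as configured:

# Hedged sketch: decode the first item of the batch to inspect [CLS]/[SEP]/[PAD] placement.
print(tokenizer.decode(input_ids[0]))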
5. Load the Chinese BERT model
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze the pretrained encoder; only the downstream head will be trained
for param in pretrained.parameters():
    param.requires_grad_(False)

out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
out.last_hidden_state.shape
'''
torch.Size([16, 500, 768])
'''
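The downstream classifier below uses only the hidden state of the first token ([CLS]) as the sentence representation; a hedged sketch of that slice:

# Hedged sketch: the [CLS] vector per sample, i.e. a [batch_size, 768] sentence embedding.
cls_embedding = out.last_hidden_state[:, 0]
print(cls_embedding.shape)  # expected: torch.Size([16, 768]) for this batch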
6. Define the downstream task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # The frozen BERT encoder acts as a fixed feature extractor
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # Classify from the [CLS] token representation
        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids)
'''
tensor([[0.4310, 0.5690],
[0.4490, 0.5510],
[0.3135, 0.6865],
[0.4604, 0.5396],
[0.5144, 0.4856],
[0.5671, 0.4329],
[0.4388, 0.5612],
[0.6065, 0.3935],
[0.5771, 0.4229],
[0.6195, 0.3805],
[0.4489, 0.5511],
[0.4380, 0.5620],
[0.4747, 0.5253],
[0.3910, 0.6090],
[0.4087, 0.5913],
[0.5216, 0.4784]], grad_fn=<SoftmaxBackward0>)
'''
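A hedged side note: torch.nn.CrossEntropyLoss applies log-softmax internally, so passing it the softmax probabilities above still trains (as the results below show) but compresses the gradients; a more conventional variant (a sketch only, LogitModel is a name introduced here) would return raw logits:

# Hedged alternative sketch: return logits and let CrossEntropyLoss handle the softmax.
class LogitModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
        return self.fc(out.last_hidden_state[:, 0])  # raw logits, no softmax here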
7. Training
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 10:
        break
'''
0 0.7164372801780701 0.4375
5 0.6556262969970703 0.8125
10 0.6778689622879028 0.5
'''
8. Testing
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    # Evaluate only the first 5 batches to keep the demo fast
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if i == 5:
            break
        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
'''
0
1
2
3
4
0.625
'''
9. Saving and loading the model
Save the model's parameters and buffers (state_dict)
path = '/kaggle/working/state_dict_model.pt'
torch.save(model.state_dict(), path)

n1_model = Model()
n1_model.load_state_dict(torch.load(path))
n1_model.eval()
'''
Model(
(fc): Linear(in_features=768, out_features=2, bias=True)
)
'''
Save the entire model
path = '/kaggle/working/entire_model.pt'
torch.save(model, path)

n2_model = torch.load(path)
n2_model.eval()
'''
Model(
(fc): Linear(in_features=768, out_features=2, bias=True)
)
'''
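Worth noting (a hedged addition): saving the entire model pickles the Model class by reference, so torch.load only works where that class definition is importable; the state_dict route above is more portable, and map_location can load weights onto a different device, for example:

# Hedged sketch: reload the saved state_dict onto CPU (standard PyTorch pattern).
cpu_model = Model()
cpu_model.load_state_dict(torch.load('/kaggle/working/state_dict_model.pt',
                                     map_location=torch.device('cpu')))
cpu_model.eval()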
10. Checkpoint saving and loading
epoch = 5
loss = 0.4
path = '/kaggle/working/5_0.4_checkpoint.pt'

torch.save({
    'epoch': epoch,
    'loss': loss,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, path)

n3_model = Model()
n3_optimizer = torch.optim.AdamW(n3_model.parameters(), lr=5e-4)

checkpoint = torch.load(path)
epoch = checkpoint['epoch']
loss = checkpoint['loss']
n3_model.load_state_dict(checkpoint['model_state_dict'])
n3_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

n3_model.eval()
# or, to resume training:
n3_model.train()
import os

for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
/kaggle/lib/kaggle/gcp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/dataset_info.json
/kaggle/input/chnsenticorp/ChnSentiCorp/ChnSentiCorp.py
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-train.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-test.arrow
/kaggle/input/chnsenticorp/ChnSentiCorp/chn_senti_corp-validation.arrow
/kaggle/working/state_dict_model.pt
/kaggle/working/__notebook_source__.ipynb
/kaggle/working/5_0.4_checkpoint.pt
/kaggle/working/entire_model.pt
'''
Text fill-in-the-blank (masked token prediction)
import torch
from datasets import load_dataset


class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split=split)

        # Keep only texts longer than 30 characters (position 15 is masked below)
        def f(data):
            return len(data['text']) > 30

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset[item]['text']
        return text


dataset = Dataset('train')
len(dataset), dataset[0]
'''
(9192,
'选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。
酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。
包的早餐是西式的,还算丰富。 服务吗,一般')
'''
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
def collate_fn(data):
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=data,
                                       max_length=30,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors='pt',
                                       return_length=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']

    # Use the token at position 15 as the label, then replace it with [MASK] in the input
    labels = input_ids[:, 15].reshape(-1).clone()
    input_ids[:, 15] = tokenizer.get_vocab()[tokenizer.mask_token]

    return input_ids, attention_mask, token_type_ids, labels
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print(len(loader))
print(tokenizer.decode(input_ids[0]))
print(tokenizer.decode(labels[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape
'''
574
[CLS] 看 完 之 后 兴 奋 了 很 久... 因 为 [MASK] 也 处 于 那 种 阶 段... 值 得 学 [SEP]
我
(torch.Size([16, 30]),
torch.Size([16, 30]),
torch.Size([16, 30]),
torch.Size([16]))
'''
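A small hedged check (not in the original) that the masking in collate_fn did what we expect:

# Hedged sketch: position 15 of every sample should now hold the [MASK] token id.
print(tokenizer.mask_token_id)                              # id of [MASK] in the vocabulary
print((input_ids[:, 15] == tokenizer.mask_token_id).all())  # expected: tensor(True)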
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze the pretrained encoder
for param in pretrained.parameters():
    param.requires_grad_(False)

out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
out.last_hidden_state.shape

# Project the hidden state at the masked position onto the whole vocabulary
linear = torch.nn.Linear(768, len(tokenizer.get_vocab()), bias=False)
res = linear(out.last_hidden_state[:, 15])
res
'''
tensor([[ 0.1829, -0.5311, -0.1989, ..., -0.1905, -0.4505, -0.1490],
[ 0.3157, -0.8063, 0.0054, ..., -0.1232, -0.1233, -0.0622],
[-0.4099, -0.4986, 0.1289, ..., -0.6015, -0.0355, -0.3704],
...,
[-0.5781, -0.4431, -0.1011, ..., -0.1154, -0.4494, -0.1650],
[ 0.2260, -0.3658, -0.2587, ..., -0.3548, -0.2574, 0.4675],
[-0.2136, -0.1950, 0.0240, ..., -0.3630, -0.3018, -0.5322]],
grad_fn=<MmBackward0>)
'''
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # Decoder maps the 768-d hidden state to a score for every vocabulary token
        self.decoder = torch.nn.Linear(768, len(tokenizer.get_vocab()), bias=False)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # Predict the token at the masked position (index 15)
        out = self.decoder(out.last_hidden_state[:, 15])
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
'''
torch.Size([16, 21128])
'''
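To see an actual prediction rather than just shapes, a hedged sketch that decodes the highest-scoring token for the first sample (the decoder is still untrained at this point, so the guess will usually be wrong):

# Hedged sketch: pick the most likely token for the masked position of sample 0.
with torch.no_grad():
    logits = model(input_ids=input_ids,
                   attention_mask=attention_mask,
                   token_type_ids=token_type_ids)
predicted_id = logits[0].argmax().item()
print(tokenizer.decode([predicted_id]), '| ground truth:', tokenizer.decode([labels[0].item()]))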
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(1):
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 50 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)
            print(epoch, i, loss.item(), accuracy)
'''
0 0 1.883865237236023 0.625
0 50 2.328688144683838 0.6875
0 100 1.7976388931274414 0.8125
0 150 1.6133203506469727 0.6875
0 200 0.9813346862792969 0.9375
0 250 2.2197675704956055 0.625
0 300 2.4435458183288574 0.6875
0 350 2.027521848678589 0.6875
0 400 1.2056934833526611 0.8125
0 450 2.092010021209717 0.75
0 500 0.865594744682312 0.875
0 550 1.6895039081573486 0.625
'''
def test():
    model.eval()
    correct = 0
    total = 0

    loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                         batch_size=32,
                                         collate_fn=collate_fn,
                                         shuffle=True,
                                         drop_last=True)

    # Evaluate only the first 15 batches
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        if i == 15:
            break
        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
'''
0
1
2
...
13
14
0.6229166666666667
'''
Sentence relation (sentence-pair classification)
import torch
import random
from datasets import load_dataset


class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='/kaggle/input/chnsenticorp/ChnSentiCorp', split=split)

        # Keep only texts longer than 40 characters so two 20-character sentences can be cut out
        def f(data):
            return len(data['text']) > 40

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset[item]['text']

        # Split one review into two consecutive 20-character "sentences" (label 0 = consecutive)
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0

        # With probability 0.5, replace the second sentence with one from a random review (label 1)
        if random.randint(0, 1) == 0:
            idx = random.randint(0, len(self.dataset) - 1)
            sentence2 = self.dataset[idx]['text'][20:40]
            label = 1

        return sentence1, sentence2, label


dataset = Dataset('train')
print(len(dataset))
print(dataset[0])
'''
8001
('选择珠江花园的原因就是方便,有电动扶梯直'
, '台,居然说我没有预定,折腾了半天,郁闷,'
, 1)
'''
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer
'''
PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
'''
print(dataset[0][:2])
print(dataset[0][2])
'''
('选择珠江花园的原因就是方便,有电动扶梯直'
, '接到达海边,周围餐馆、食廊、商场、超市、')
0
'''
def collate_fn(dataset):
    sents = [i[:2] for i in dataset]   # (sentence1, sentence2) pairs
    labels = [i[2] for i in dataset]

    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       max_length=45,
                                       padding='max_length',
                                       truncation=True,
                                       return_tensors='pt',
                                       return_length=True,
                                       add_special_tokens=True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    break

print('len(loader)', len(loader))
print(tokenizer.decode(input_ids[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape
'''
len(loader) 1000
[CLS] 这 套 书 也 是 因 为 高 居 榜 首 之 左 右 , 才 决 定 买 的 [SEP] 。 买 来 之 后 , 心 情 很 激 动 , 认 为 这 下 可 以 和 女 [SEP] [PAD] [PAD]
(torch.Size([8, 45]),
torch.Size([8, 45]),
torch.Size([8, 45]),
torch.Size([8]))
'''
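For sentence pairs, token_type_ids marks which segment each position belongs to (0 for sentence A including [CLS] and the first [SEP], 1 for sentence B); a hedged sketch to inspect it:

# Hedged sketch: segment ids for the first sample of the batch.
print(token_type_ids[0])
# Expected pattern: a run of 0s for "[CLS] sentence1 [SEP]", then 1s for "sentence2 [SEP]";
# padded positions keep segment id 0 but have attention_mask 0.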
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze the pretrained encoder
for param in pretrained.parameters():
    param.requires_grad_(False)

out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)
out.last_hidden_state.shape
'''
torch.Size([8, 45, 768])
'''
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = pretrained(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids)

        # Classify the pair relation from the [CLS] representation
        out = self.linear(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids)
'''
tensor([[0.6049, 0.3951],
[0.6584, 0.3416],
[0.6334, 0.3666],
[0.6860, 0.3140],
[0.8079, 0.1921],
[0.6332, 0.3668],
[0.6906, 0.3094],
[0.7661, 0.2339]], grad_fn=<SoftmaxBackward0>)
'''
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 25:
        break
'''
0 0.7084164619445801 0.5
5 0.6241328120231628 0.75
10 0.5478470921516418 0.75
15 0.5879313945770264 0.625
20 0.4713858366012573 1.0
25 0.5577196478843689 0.75
'''
def test():
    model.eval()
    correct = 0
    total = 0

    loader = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                         batch_size=32,
                                         collate_fn=collate_fn,
                                         shuffle=True,
                                         drop_last=True)

    # Evaluate only the first 5 batches
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        if i == 5:
            break
        print(i)

        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        out = out.argmax(dim=1)

        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
'''
0
1
2
3
4
0.81875
'''