# 昇思 (MindSpore) 25-Day Learning Camp, Day 18 | LLM Principles and Practice: Sentiment Classification with GPT Based on MindSpore
## Dataset Processing
```python
from mindnlp.dataset import load_dataset

# Load the IMDb dataset with explicit train/test splits.
imdb_ds = load_dataset('imdb', split=['train', 'test'])
imdb_train = imdb_ds['train']
imdb_test = imdb_ds['test']

# Loading without `split` returns every available split at once.
imdb_ds_my = load_dataset('imdb')

# Peek at the first few training samples.
for i, sample in enumerate(imdb_train.create_dict_iterator(output_numpy=True)):
    print(f"Sample {i + 1}: {sample['text']}")
    if i == 4:
        break

# Inspect the columns (fields) available in the training split.
train_columns = imdb_train.get_col_names()
print(f"Number of fields in the training set: {len(train_columns)}")
print("Field names in the training set:")
for field in train_columns:
    print(field)
```
```python
import json
import numpy as np
import mindspore
from mindspore.dataset import transforms


def process_dataset(dataset, tokenizer, max_seq_len=512, batch_size=4, shuffle=False):
    # Ascend needs static shapes, so every sample is padded to max_seq_len;
    # on other targets we pad dynamically per batch instead.
    is_ascend = mindspore.get_context('device_target') == 'Ascend'

    def tokenize(text):
        if is_ascend:
            tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
        else:
            tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
        return tokenized['input_ids'], tokenized['attention_mask']

    if shuffle:
        dataset = dataset.shuffle(batch_size)

    # Tokenize the raw text and cast the label column to int32 for the model.
    dataset = dataset.map(operations=[tokenize], input_columns="text",
                          output_columns=['input_ids', 'attention_mask'])
    dataset = dataset.map(operations=transforms.TypeCast(mindspore.int32),
                          input_columns="label", output_columns="labels")

    if is_ascend:
        dataset = dataset.batch(batch_size)
    else:
        dataset = dataset.padded_batch(batch_size,
                                       pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                 'attention_mask': (None, 0)})
    return dataset
```
- `tokenize`: tokenizes the raw text and converts it into the `input_ids` and `attention_mask` features the model consumes.
- `transforms.TypeCast`: casts the `label` column from int64 to int32 so it matches the model's expected input type (see the sketch below).
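As a quick illustration of the cast, a minimal sketch run eagerly outside the dataset pipeline (the sample label value is made up, and it assumes MindSpore's eager execution of dataset transforms):

```python
import numpy as np
import mindspore
from mindspore.dataset import transforms

# MindSpore dataset transforms can be invoked eagerly on NumPy data.
type_cast_op = transforms.TypeCast(mindspore.int32)
label = np.array(1, dtype=np.int64)   # labels come out of the raw dataset as int64
casted = type_cast_op(label)
print(casted.dtype)                   # expected: int32
```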
```python
from mindnlp.transformers import GPTTokenizer

cache_dir = './'
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt', cache_dir=cache_dir)

special_tokens_dict = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
}
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)
```
Using the same `gpt_tokenizer` for training and inference keeps data processing consistent; the added special tokens let the model correctly recognize the beginning, end, and padding portions of each input.
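The wiring that produces `dataset_train` and `dataset_test` for the trainer and evaluator below is not shown; a minimal sketch of the presumed step, reusing `process_dataset` and the splits loaded earlier:

```python
# Build the batched, tokenized datasets consumed by the Trainer/Evaluator.
dataset_train = process_dataset(imdb_train, gpt_tokenizer, shuffle=True)
dataset_test = process_dataset(imdb_test, gpt_tokenizer)
```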
## Building the Model
```python
from mindspore import nn
from mindnlp.transformers import GPTForSequenceClassification
from mindspore.experimental.optim import Adam
from mindnlp._legacy.engine import Trainer
from mindnlp._legacy.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp._legacy.metrics import Accuracy

# Load the pretrained GPT backbone with a 2-class classification head.
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
model.config.pad_token_id = gpt_tokenizer.pad_token_id
# Grow the embedding table to cover the three newly added special tokens.
model.resize_token_embeddings(model.config.vocab_size + 3)

optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)
metric = Accuracy()

# Save periodic checkpoints and keep the best model (loaded back automatically).
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='gpt_imdb_finetune',
                                epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', ckpt_name='gpt_imdb_finetune_best',
                                  auto_load=True)

trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_train, metrics=metric,
                  epochs=1, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=False)
```
- `GPTForSequenceClassification.from_pretrained`: loads the pretrained model and sets it up for classification with the given number of labels (`num_labels=2`).
- `config.pad_token_id`: the special tokens must match between the tokenizer and the model configuration. `model.config` holds the model's configuration (vocabulary size, embedding dimension, special-token indices, and so on), and `pad_token_id` is the vocabulary index of the `<pad>` token used to pad input sequences.
- `model.resize_token_embeddings()`: resizes the model's token-embedding matrix so it can hold the tokenizer's expanded vocabulary. Here `model.config.vocab_size` is the original vocabulary size, and `+3` accounts for the three special tokens we added: `<bos>`, `<eos>`, and `<pad>`.
- `vocab_size`: the vocabulary size of `input_ids` accepted by the model (the sketch below checks that the tokenizer and the model stay in sync).
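A quick sanity check of this bookkeeping, as a sketch; it assumes the mindnlp tokenizer reports its size via `len()` in the Hugging Face style:

```python
# After add_special_tokens and resize_token_embeddings, the tokenizer and the
# model should agree on the padding id, and the tokenizer should report three
# more entries than the original GPT vocabulary.
print(num_added_toks)                                           # expected: 3
print(len(gpt_tokenizer))                                       # original vocab size + 3
print(gpt_tokenizer.pad_token_id == model.config.pad_token_id)  # expected: True
```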
## Model Training
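The training call itself is not shown above; with the legacy `Trainer` configured in the previous section, fine-tuning is presumably launched like this (mirroring the `tgt_columns` argument used by the evaluator below):

```python
# Fine-tune the model; "labels" is the target column produced by process_dataset.
trainer.run(tgt_columns="labels")
```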
```python
from mindnlp._legacy.engine import Trainer, Evaluator

evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="labels")
```
`evaluator.run(tgt_columns="labels")`: the evaluator uses the `labels` column of `dataset_test` as the target when measuring the model's performance.
## Trying Out the Model
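A minimal inference sketch for trying the fine-tuned model on a single review, assuming the `model` and `gpt_tokenizer` defined above; the `predict_sentiment` helper, the label mapping, and the output-unpacking logic are illustrative assumptions rather than part of the original walkthrough:

```python
import mindspore


def predict_sentiment(text, model, tokenizer, max_seq_len=512):
    # Tokenize one review and add a batch dimension.
    tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
    input_ids = mindspore.Tensor([tokenized['input_ids']], mindspore.int32)
    attention_mask = mindspore.Tensor([tokenized['attention_mask']], mindspore.int32)

    model.set_train(False)
    outputs = model(input_ids, attention_mask=attention_mask)
    # mindnlp models typically return either a tuple or an object with .logits.
    logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
    label = int(logits.argmax(-1).asnumpy()[0])
    # IMDb convention: 1 = positive, 0 = negative.
    return 'positive' if label == 1 else 'negative'


print(predict_sentiment("This movie was a wonderful surprise.", model, gpt_tokenizer))
```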