Training a GPT Large Language Model from Scratch

Install the required packages

!pip install tokenizers==0.13.3 torch==2.0.1 transformers==4.30.2
!pip install accelerate -U

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from transformers import pipeline, set_seed
from transformers import GPT2TokenizerFast
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments     

0. Data Preparation

import requests
from bs4 import BeautifulSoup


# Fetch the table of contents page of Romance of the Three Kingdoms
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.37'
}
page_text_res = requests.get(url=url, headers=headers)
page_text_res.encoding = 'utf-8'
page_text = page_text_res.text
soup = BeautifulSoup(page_text, 'lxml')
# Each <a> under .book-mulu links to one chapter
select_list = soup.select('.book-mulu > ul > li > a')
url_assist = 'https://www.shicimingju.com'

# Download every chapter and append it to a single training file
fp = open('./sanguoyanyi.txt', 'w', encoding='utf-8')
for select in select_list:
    title = select.string
    chapter_url = url_assist + select['href']
    res = requests.get(url=chapter_url, headers=headers)
    res.encoding = 'utf-8'
    text = res.text
    chapter_soup = BeautifulSoup(text, 'lxml')
    chapter_content = chapter_soup.find('div', class_='chapter_content').text
    fp.write(title + ':' + chapter_content + '\n')
    # print(title, "scraped successfully")
    # print(select.string, 'link:', url_assist + select['href'], 'scraped successfully')
fp.close()
print('over')

over
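
As a quick sanity check of the scraped corpus, you can count the lines and peek at the first chapter; the expectation of 120 lines assumes the source page lists the novel's 120 chapters, one per line.

# One chapter was written per line, so the line count should match the chapter count
with open('./sanguoyanyi.txt', encoding='utf-8') as f:
    lines = f.readlines()
print(len(lines))
print(lines[0][:40])   # beginning of the first chapter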

1. Training the Tokenizer

Tokenization splits the input text into meaningful sub-units (tokens). With the following code, we train a new tokenizer on our own data:

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
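
The original snippet is missing the code that trains and saves the tokenizer. Judging from the imports and the file list printed below, it was trained on sanguoyanyi.txt and saved to ./sanguo through GPT2TokenizerFast; the sketch below fills that gap, with a vocabulary size of 50,000 as an assumption.

trainer = BpeTrainer(
    vocab_size=50000,                       # assumed vocabulary size
    show_progress=True,
    initial_alphabet=ByteLevel.alphabet(),  # seed the vocabulary with the byte-level alphabet
    special_tokens=special_tokens,
)
tokenizer.train(files=["./sanguoyanyi.txt"], trainer=trainer)

# Wrap the trained tokenizer for use with transformers and save it;
# save_pretrained returns the tuple of file paths shown below.
tokenizer_fast = GPT2TokenizerFast(tokenizer_object=tokenizer)
tokenizer_fast.save_pretrained('./sanguo')
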
('./sanguo/tokenizer_config.json',
 './sanguo/special_tokens_map.json',
 './sanguo/vocab.json',
 './sanguo/merges.txt',
 './sanguo/added_tokens.json',
 './sanguo/tokenizer.json')
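
Before training the model, it is worth a quick round trip through the saved tokenizer to confirm that Chinese text encodes and decodes cleanly; the test sentence is only an illustration.

# Reload the saved tokenizer and check that encode/decode round-trips
check_tokenizer = GPT2TokenizerFast.from_pretrained('./sanguo')
ids = check_tokenizer.encode("话说天下大势,分久必合,合久必分。")
print(ids)
print(check_tokenizer.decode(ids))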

2. Training the Model

tokenizer = GPT2Tokenizer.from_pretrained("./sanguo")
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

# Configure the GPT-2 model parameters
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)

# Create the model
model = GPT2LMHeadModel(config)
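
With the default GPT2Config this is a 12-layer, 768-dimensional model; the snippet below is just a rough size check, and the exact count depends on the vocabulary size.

# Count trainable parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")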

# The training data is split line by line
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./sanguoyanyi.txt",
    block_size=128,
)

# Collator for causal language modeling; with mlm=False the mlm_probability setting does not apply
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Configure the training arguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    save_steps=2000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()

# Save the model
model.save_pretrained('./sanguo')

3. Testing the Model

generator = pipeline('text-generation', model='./sanguo')
set_seed(13)
txt = generator("吕布", max_length=10)
print(txt)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting pad_token_id to eos_token_id:2 for open-end generation.
[{'generated_text': '吕布回·曹操怒�\xa0却说姜维'}]
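
With a single novel and 20 epochs the samples are rough, but longer and more varied output can be drawn by passing explicit sampling settings to the pipeline; the values below are illustrative rather than tuned.

out = generator(
    "曹操",
    max_length=50,    # longer continuation
    do_sample=True,   # sample instead of greedy decoding
    top_k=50,         # restrict sampling to the 50 most likely tokens
    temperature=0.9,  # slightly soften the distribution
)
print(out[0]['generated_text'])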
