Trying to run my own dataset with an open-source model, but getting a "has no attribute" error

1. My plan is to train a model on my own dataset using an open-source model.
2. I started from training code that already runs with bert-base-chinese and swapped the model for Chinese_Chat_T5_Base.
3. The error is as follows: has no attribute 'batch_encode_plus'

C:\Users\Admin\anaconda3\envs\hug_gpu2_fenlei\python.exe C:/Users/Admin/PycharmProjects/hug3/test2/hug/data/chat5_muban.py
=======================#1.配置本地模型===========================
=======================#配置本地模型完成===========================
=======================#2.测试/试编码句子===========================
=======================#测试/试编码句子===========================
=======================#3.第7章/试编码句子===========================
Traceback (most recent call last):
  File "C:\Users\Admin\PycharmProjects\hug3\test2\hug\data\chat5_muban.py", line 34, in <module>
    out = token.batch_encode_plus(
  File "C:\Users\Admin\anaconda3\envs\hug_gpu2_fenlei\lib\site-packages\torch\nn\modules\module.py", line 778, in __getattr__
    raise ModuleAttributeError("'{}' object has no attribute '{}'".format(
torch.nn.modules.module.ModuleAttributeError: 'T5ForConditionalGeneration' object has no attribute 'batch_encode_plus'

Process finished with exit code 1
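
For context: the traceback shows that `token` holds a `T5ForConditionalGeneration` model object rather than a tokenizer, which is why `batch_encode_plus` is not found. The tokenizer is normally loaded separately via `AutoTokenizer` — a minimal sketch, assuming the local folder C:/bert/Chinese_Chat_T5_Base/ also contains the tokenizer files:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer: turns text into ids (this is the object that provides batch_encode_plus)
token = AutoTokenizer.from_pretrained("C:/bert/Chinese_Chat_T5_Base/")
# model: the seq2seq network itself
model2 = AutoModelForSeq2SeqLM.from_pretrained("C:/bert/Chinese_Chat_T5_Base/")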

4. The source code is as follows:


import os
import numpy as np
import torch
from torch import cuda
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, BertForNextSentencePrediction
from transformers import BertModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

print("=======================#1.配置本地模型===========================")
model_name = 'Chinese_Chat_T5_Base'
MODEL_PATH = 'C:/bert/Chinese_Chat_T5_Base/'
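# NOTE: the next line assigns the seq2seq *model* (not a tokenizer) to the name `token`;
# this is the object that later raises "has no attribute 'batch_encode_plus'"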
token = AutoModelForSeq2SeqLM.from_pretrained("C:/bert/Chinese_Chat_T5_Base/")
model2 = AutoModelForSeq2SeqLM.from_pretrained("C:/bert/Chinese_Chat_T5_Base/")

print("=======================#配置本地模型完成===========================")

print("=======================#2.测试/试编码句子===========================")
# model2 = BertModel.from_pretrained("bert-base-uncased",output_hidden_states=True)
#
# out2=model2.batc
print("=======================#测试/试编码句子===========================")


print("=======================#3.第7章/试编码句子===========================")

out = token.batch_encode_plus(
    batch_text_or_text_pairs=['从明天起,做一个幸福的人。', '喂马,劈柴,周游世界。'],
    truncation=True,
    padding='max_length',
    max_length=17,
    return_tensors='pt',
    return_length=True
)
# print(out)
# print(out.last_hidden_state)
# inspect the encoded output
# for k, v in out.items():
#     print(k, v.shape)

# decode the ids back into a sentence
# print(token.decode(out['input_ids'][0]))
print("=======================#3.第7章/试编码句子完毕===========================")
print("=======================#4.第7章/定义数据集===========================")
import torch
from datasets import load_from_disk


class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
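        # load the ChnSentiCorp dataset previously saved to disk and select the requested split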
        self.dataset = load_from_disk('../data/ChnSentiCorp')[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']

        return text, label


dataset = Dataset('train')

len(dataset), dataset[20]
print("=======================#4.第7章/定义数据集完毕===========================")
print("=======================#5.第7章/定义计算设备===========================")
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
    print('we will use the GPU:', torch.cuda.get_device_name(0))
print("=======================#5.第7章/定义计算设备完毕===========================")
print("=======================#6.第7章/数据整理函数===========================")
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # encode the batch of sentences

    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 elsewhere
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
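    # NOTE: token_type_ids assumes a BERT-style tokenizer; T5 tokenizers typically do not return this key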
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    # move the tensors to the compute device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)

    return input_ids, attention_mask, token_type_ids, labels
print("=======================#6.第7章/数据整理函数完毕===========================")
print("=======================#7.第7章/数据整理函数试算===========================")
# simulate a batch of data
data = [
    ('你站在桥上看风景', 1),
    ('看风景的人在楼上看你', 0),
    ('明月装饰了你的窗子', 1),
    ('你装饰了别人的梦', 0),
]

# trial run
input_ids, attention_mask, token_type_ids, labels = collate_fn(data)

input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
print("=======================#7.第7章/数据整理函数试算完毕===========================")
print("=======================#8.第7章/数据加载器===========================")
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

len(loader)
print("=======================#8.第7章/数据加载器完毕===========================")
print("=======================#9.第7章/查看数据样例===========================")
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
print("=======================#9.第7章/查看数据样例完毕===========================")
print("=======================#10.第7章/加载预训练模型===========================")
from transformers import BertModel
print("===============如何设置本地模型?========#第7章/下载预训练模型===========================")
pretrained = BertModel.from_pretrained(model_name)
# model2: the pretrained seq2seq model
model2 = AutoModelForSeq2SeqLM.from_pretrained("C:/bert/Chinese_Chat_T5_Base/")
# count the parameters (in units of 10,000)
sum(i.numel() for i in pretrained.parameters()) / 10000
print("=======================#10.第7章/加载预训练模型完毕===========================")
print("=======================#11.第7章/不训练预训练模型,不需要计算梯度===========================")
for param in pretrained.parameters():
    param.requires_grad_(False)
print("=======================#11.第7章/不训练预训练模型,不需要计算梯度完毕===========================")
print("=======================#12.第7章/预训练模型试算===========================")
# move to the compute device
pretrained.to(device)
# move model2 to the device as well
model2.to(device)

# trial forward pass
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

out.last_hidden_state.shape
# out.last_hidden_state.shape
# print(out.last_hidden_state.shape)
# "Traceback (most recent call last):
#   File "C:/Users/Admin/PycharmProjects/hug3/test2/hug/data/fenlei.py", line 136, in <module>
#     out.last_hidden_state.shape
# AttributeError: 'tuple' object has no attribute 'last_hidden_state'"
print("=======================#12.第7章/预训练模型试算完毕===========================")
print("=======================#13.第7章/定义下游任务模型===========================")
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
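        # a single linear layer mapping the 768-dim feature of the first token to 2 classes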
        self.fc = torch.nn.Linear(in_features=768, out_features=2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # extract features with the pretrained model (no gradients)
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        # classify using only the feature of the first token
        out = self.fc(out.last_hidden_state[:, 0])

        out = out.softmax(dim=1)

        return out


model = Model()

# move to the compute device
model.to(device)

# trial run
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
print("=======================#13.第7章/定义下游任务模型完毕===========================")
print("=======================#14.第7章/训练===========================")
from transformers import AdamW
# from transformers.optimization import get_constant_schedule
from transformers.optimization import get_scheduler


def train():
    # define the optimizer
    optimizer = AdamW(model.parameters(), lr=5e-4)

    # define the loss function
    criterion = torch.nn.CrossEntropyLoss()

    # define the learning-rate scheduler
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    # switch the model to training mode
    model.train()

    # iterate over the training set in batches
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader):

        # forward pass
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        # compute the loss and update the parameters with gradient descent
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # periodically print metrics for monitoring
        if i % 10 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), lr, accuracy)


train()
print("=======================14.#第7章/训练完毕===========================")

print("=======================15.#第7章/测试===========================")
def test():
    # define the test-set data loader
    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    # switch the downstream model to evaluation mode
    model.eval()
    correct = 0
    total = 0

    # iterate over the test set in batches
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        # 5 batches are enough; no need to traverse the whole set
        if i == 5:
            break

        print(i)

        # forward pass
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        # accumulate accuracy statistics
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()
print("=======================#15.第7章/测试完毕===========================")
