huggingface

青椒鸡汤

已于 2023-02-20 13:53:48 修改

阅读量1k

点赞数

分类专栏： python学习文章标签： python 机器学习开发语言

于 2023-02-18 16:02:55 首次发布

好

本文链接：https://blog.csdn.net/dfhg54/article/details/129065878

版权

python学习专栏收录该内容

12 篇文章 1 订阅

订阅专栏

HuggingFace - 简明教程_huggingface使用教程_伊织code的博客-CSDN博客

BERT中的Tokenizer说明_xuanningmeng的博客-CSDN博客_berttokenizer

HuggingFace简明教程_weixin_44748589的博客-CSDN博客_huggingface

from transformers import BertTokenizer

# 加载预训练字典和分词方法
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷,真有点郁闷.',
    '机器背面似乎被撕了张什么标签，残胶还在。',
]

tokenizer, sents
'''
from transformers import BertTokenizer

# 加载预训练字典和分词方法
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷,真有点郁闷.',
    '机器背面似乎被撕了张什么标签，残胶还在。',
]

tokenizer, sents
'''


# 编码两个句子
out = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补pad到max_length长度
    padding='max_length', # 最大长度内补pad
    add_special_tokens=True, # pad 是添加特殊符号
    max_length=30, # 超过最大长度截断
    return_tensors=None, # 不指定类型，默认返回 list
)

print(out)
'''
[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
'''

tokenizer.decode(out)
'''
'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'
'''


# 增强的编码函数
out = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补零到max_length长度
    padding='max_length',
    max_length=30,
    add_special_tokens=True,

    # 可取值tf,pt,np,默认为返回list
    return_tensors=None,

    # 返回token_type_ids
    return_token_type_ids=True,

    # 返回attention_mask
    return_attention_mask=True,

    # 返回special_tokens_mask 特殊符号标识
    return_special_tokens_mask=True,

    # 返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用
    # return_offsets_mapping=True,

    # 返回length 标识长度
    return_length=True,
)

# input_ids 就是编码后的词
# token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
# special_tokens_mask 特殊符号的位置是1,其他位置是0
# attention_mask pad的位置是0,其他位置是1
# length 返回句子长度
for k, v in out.items():
    print(k, ':', v)

tokenizer.decode(out['input_ids'])
'''
input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
length : 30
'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'
'''


# 批量编码句子
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    add_special_tokens=True,

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补零到max_length长度
    padding='max_length',
    max_length=15,

    # 可取值tf,pt,np,默认为返回list
    return_tensors=None,

    # 返回token_type_ids
    return_token_type_ids=True,

    # 返回attention_mask
    return_attention_mask=True,

    # 返回special_tokens_mask 特殊符号标识
    return_special_tokens_mask=True,

    # 返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用
    # return_offsets_mapping=True,

    # 返回length 标识长度
    return_length=True,
)

# input_ids 就是编码后的词
# token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
# special_tokens_mask 特殊符号的位置是1,其他位置是0
# attention_mask pad的位置是0,其他位置是1
# length 返回句子长度
for k, v in out.items():
    print(k, ':', v)

tokenizer.decode(out['input_ids'][0]), tokenizer.decode(out['input_ids'][1])
'''
input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]
length : [15, 12]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
('[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 [SEP]',
 '[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]')
'''



# 批量编码成对的句子
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[(sents[0], sents[1]), (sents[2], sents[3])],
    add_special_tokens=True,

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补零到max_length长度
    padding='max_length',
    max_length=30,

    # 可取值tf,pt,np,默认为返回list
    return_tensors=None,

    # 返回token_type_ids
    return_token_type_ids=True,

    # 返回attention_mask
    return_attention_mask=True,

    # 返回special_tokens_mask 特殊符号标识
    return_special_tokens_mask=True,

    # 返回offset_mapping 标识每个词的起止位置,这个参数只能BertTokenizerFast使用
    # return_offsets_mapping=True,

    # 返回length 标识长度
    return_length=True,
)

# input_ids 就是编码后的词
# token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
# special_tokens_mask 特殊符号的位置是1,其他位置是0
# attention_mask pad的位置是0,其他位置是1
# length 返回句子长度
for k, v in out.items():
    print(k, ':', v)

tokenizer.decode(out['input_ids'][0]) #解码input_ids对应的token
print('sad    ',out['input_ids'][1]) #3,4句话的id
print(tokenizer.decode(out['input_ids'][1]))
'''
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0], [101, 2791, 7313, 1922, 2207, 511, 1071, 800, 4638, 6963, 671, 5663, 511, 102, 791, 1921, 2798, 4761, 6887, 6821, 741, 6820, 3300, 5018, 127, 1318, 117, 4696, 3300, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]
length : [27, 30]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'
'''


# 获取字典
zidian = tokenizer.get_vocab()

type(zidian), len(zidian), '月光' in zidian, # (dict, 21128, False)


# 添加新词
tokenizer.add_tokens(new_tokens=['月光', '希望'])

# 添加新符号
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

zidian = tokenizer.get_vocab()

type(zidian), len(zidian), zidian['月光'], zidian['[EOS]'] # (dict, 21131, 21128, 21130)


# 编码新添加的词
out = tokenizer.encode(
    text='月光的新希望[EOS]',
    text_pair=None,

    # 当句子长度大于max_length时,截断
    truncation=True,

    # 一律补pad到max_length长度
    padding='max_length',
    add_special_tokens=True,
    max_length=8,
    return_tensors=None,
)

print(out) # [101, 21128, 4638, 3173, 21129, 21130, 102, 0]


tokenizer.decode(out) # '[CLS] 月光 的 新 希望 [EOS] [SEP] [PAD]'
print(tokenizer.decode(out))

数据集


from datasets import load_dataset

# 加载数据
dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')
dataset
'''
Using custom data configuration default
Reusing dataset chn_senti_corp (/Users/lee/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)
Dataset({
    features: ['text', 'label'],
    num_rows: 9600
})
'''


# 查看一个数据
dataset[0]
print(dataset[0])
'''
{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
 'label': 1}
'''


# sort

# 未排序的label是乱序的
print('未排序的label是乱序的  ',dataset['label'][:10])

# 排序之后label有序了
sorted_dataset = dataset.sort('label')
print(sorted_dataset['label'][:10])
print(sorted_dataset['label'][-10:])


# shuffle

# 打乱顺序
shuffled_dataset = sorted_dataset.shuffle(seed=42)
shuffled_dataset['label'][:10]
print('打乱顺序  ',shuffled_dataset['label'][:10])


# select
dataset.select([0, 10, 20, 30, 40, 50])
print('选择 ',dataset.select([0, 10, 20, 30, 40, 50]) )

# filter
def f(data):  #函数将选定的数据集中有’选择‘这两个字的文本挑出来
    return data['text'].startswith('选择')

start_with_ar = dataset.filter(f) #数据进行过滤后，得出数据长度并重命名为text

len(start_with_ar), start_with_ar['text']
print('数据长度',len(start_with_ar)) #当前数据集只有两句话符合要求
print(start_with_ar[1])  #可查看filter后的文本


# train_test_split, 切分训练集和测试集
dataset.train_test_split(test_size=0.1)

# shard 把数据切分到4个桶中,均匀分配
dataset.shard(num_shards=4, index=0)

# 列操作和类型转换
# rename_column 列重命名
dataset.rename_column('text', 'textA')

# remove_columns 列移除
dataset.remove_columns(['text'])

# map 把每一个数据输入到 lamda 函数，操作返回
def f(data):
    data['text'] = 'My sentence: ' + data['text']
    return data

datatset_map = dataset.map(f)   # map()方法将转换函数依次应用于输入Dataset中的每一个元素（element），返回结果是转换后的新Dataset
datatset_map['text'][:5]

# set_format
dataset.set_format(type='torch', columns=['label'])
dataset[0  ]  # {'label': tensor(1)}

# 保存和加载
from datasets import load_from_disk

dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')

# 保存和加载
dataset.save_to_disk("./")
dataset = load_from_disk("./")
dataset

# 导出为其他格式
# dataset.to_csv('./datasets.csv')
# dataset.to_json('./datasets.json')

from datasets import list_metrics

# 列出评价指标
metrics_list = list_metrics()
len(metrics_list), metrics_list
'''
(34,
 ['accuracy',
  'bertscore',
  'bleu',
  'bleurt',
  'cer',
  'chrf',
  'code_eval',
  'comet',
  'competition_math',
  'coval',
  'cuad',
  'f1',
  'gleu',
  'glue',
  'google_bleu',
'''
from datasets import load_metric

# 加载一个评价指标
metric = load_metric('glue', 'mrpc')

print(metric.inputs_description)  # 打印使用文档

# 计算一个评价指标
predictions = [0, 1, 0]
references = [0, 1, 1]

final_score = metric.compute(predictions=predictions, references=references)
final_score  # {'accuracy': 0.6666666666666666, 'f1': 0.6666666666666666}
print(final_score)

【PyTorch】torch.utils.data.Dataset 介绍与实战_想变厉害的大白菜的博客-CSDN博客

什么是张量（Tensor）_张量是什么意思_醉Dowry的博客-CSDN博客

【PyTorch】torch.mean(), dim=0, dim=1 详解_行路南的博客-CSDN博客_mean(dim=0)

nn.Softmax(dim) 的理解_Enterings的博客-CSDN博客_nn.softmax

这几个博客中的行，列可根据自己的排列形式自由理解

情感分析（注意看model处的注释）

import torch
from datasets import load_dataset
import time #时间


start =time.time()
# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split): #初始化读取数据，也可用pd.read_csv来读取csv文件
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split) #这里是把整个数据集传入

    def __len__(self):
        return len(self.dataset)  #返回的是数据集的大小

    def __getitem__(self, i):    #实现索引数据集中的某一个数据
        text = self.dataset[i]['text']   #根据i的数值，从数据集中将每个索引对应的数据取出
        label = self.dataset[i]['label']

        return text, label


dataset = Dataset('train')

len(dataset), dataset[0]
print(len(dataset)) #数据集长度
print(dataset[0])  #第一句话和标签

from transformers import BertTokenizer # 加载分词工具（需要和与预训练模型相匹配）

# 加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')
# PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

#批处理函数
def collate_fn(data):
    sents = [i[0] for i in data]  #i由batch_size决定
    labels = [i[1] for i in data]

    # 编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',  #pytorch
                                   return_length=True)

    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn, #批处理函数
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break     #每个批次取出这些参数

print(len(loader)) # 600  一共9600句，每次16，所以最后为600
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels
'''
(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]))
'''


from transformers import AlbertModel  #这里也可以用ALBERT，但效果只有0.63

# 加载预训练模型
pretrained = AlbertModel.from_pretrained('bert-base-chinese')

# 不训练,不需要计算梯度
for param in pretrained.parameters():   #这里预训练模型的参数不做训练
    param.requires_grad_(False)

# 模型试算
out = pretrained(input_ids=input_ids,
           attention_mask=attention_mask,
           token_type_ids=token_type_ids)

out.last_hidden_state.shape # torch.Size([16, 500, 768])  16是一个 batch-size，对应数据中的16句话；指定每一句话编码成 500个词的长度， 768 是词编码的维度；


# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2) # 全连接层，此处做的是2分类，想做n分类，改成n即可。

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0]) # 取第0个特征（这个和bert 的设计思路有关）
        #print('前',out.shape)
        out = out.softmax(dim=1)  # torch.Size([16, 2])，16是batch_size，所以dim=1
        #print('softmax后', out.shape)
        return out

model = Model()

model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape
print(model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape) #shape是查看矩阵或者数组的维数，当前为torch.Size([16, 2])，经过全连接层后
                                            #维度变为2

from transformers import AdamW

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()  #启用 batch normalization 和 dropout
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):   #批次
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)   #训练

    loss = criterion(out, labels)  #损失
    loss.backward()
    optimizer.step()     #优化
    optimizer.zero_grad()

    if i % 5 == 0: #每做5个批次，就显示一次准确值
        out = out.argmax(dim=1) #筛选出最大的那个概率值
        accuracy = (out == labels).sum().item() / len(labels)

        print(i, loss.item(), accuracy)

    if i == 300:   #只运行300批次
        break


# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()


end=time.time()
print('Running time: %s Seconds'%(end-start))

中文填空

torch.zeros() 函数详解_Vertira的博客-CSDN博客_torch.zeros

import torch
from datasets import load_dataset

# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

        def f(data):
            return len(data['text']) > 30    #过滤掉文本长度小于30的

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']       #不需要数据集中的label

        return text


dataset = Dataset('train')

len(dataset), dataset[0]


from transformers import BertTokenizer

# 加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')


def collate_fn(data):
    # 编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=data,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=30,
                                   return_tensors='pt',
                                   return_length=True)

    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']

    # 把第15个词固定替换为mask
    labels = input_ids[:, 15].reshape(-1).clone()    #第15个字都变为label，作对比
    input_ids[:, 15] = token.get_vocab()[token.mask_token] #mask第15个字

    # print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(token.decode(input_ids[0]))
print(token.decode(labels[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape


from transformers import BertModel

# 加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

# 不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

# 模型试算
out = pretrained(input_ids=input_ids,
           attention_mask=attention_mask,
           token_type_ids=token_type_ids)

out.last_hidden_state.shape


# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = torch.nn.Linear(768, token.vocab_size, bias=False)  #token.vocab_size是因为候选的字就是字库里字的数量
        self.bias = torch.nn.Parameter(torch.zeros(token.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        out = self.decoder(out.last_hidden_state[:, 15]) #只取第15

        return out


model = Model()

model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape


from transformers import AdamW

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(5): #所有批次循环5次
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader):
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 50 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)

            print(epoch, i, loss.item(), accuracy)


# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 15:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

        print(token.decode(input_ids[0]))
        print(token.decode(labels[0]), token.decode(labels[0]))

    print(correct / total)


test()

关系推断

import torch
from datasets import load_dataset
import random


# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

        def f(data):
            return len(data['text']) > 40

        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']

        # 切分一句话为前半句和后半句
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0     #句子相连标为0

        # 有一半的概率把后半句替换为一句无关的话
        if random.randint(0, 1) == 0:
            j = random.randint(0, len(self.dataset) - 1)  #从整个数据集中随机算出一句话
            sentence2 = self.dataset[j]['text'][20:40]    #用随机选出句子的第20~40字来替换下半句话
            label = 1

        return sentence1, sentence2, label


dataset = Dataset('train')

sentence1, sentence2, label = dataset[0]

len(dataset), sentence1, sentence2, label

from transformers import BertTokenizer

# 加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')


def collate_fn(data):
    sents = [i[:2] for i in data]  #取前两行的句子
    labels = [i[2] for i in data]  #前两个标签

    # 编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=45,
                                   return_tensors='pt',
                                   return_length=True,
                                   add_special_tokens=True)

    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    # token_type_ids:第一个句子和特殊符号的位置是0,第二个句子的位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    # print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(token.decode(input_ids[0]))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

from transformers import AlbertModel

# 加载预训练模型
pretrained = AlbertModel.from_pretrained('bert-base-chinese')

# 不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

# 模型试算
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

print(out.last_hidden_state.shape)


# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2) #二分类

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape

from transformers import AdamW

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 300:
        break


# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        if i == 5:
            break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        pred = out.argmax(dim=1)  #预测值

        correct += (pred == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()

https://shijianfeng.blog.csdn.net/article/details/128331520?spm=1001.2101.3001.6650.1&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1-128331520-blog-110819317.pc_relevant_3mothn_strategy_recovery&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-1-128331520-blog-110819317.pc_relevant_3mothn_strategy_recovery&utm_relevant_index=2