目录
引入包
import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
加载预训练模型
# Load the pretrained Chinese BERT checkpoint and wire it up for a
# 10-way sequence-classification head.
from transformers import BertTokenizer,BertForSequenceClassification,BertConfig
config=BertConfig.from_pretrained("D:\\jpdir\\bert\\bertchinese",num_labels=10)  # 10 target classes
tokenizer = BertTokenizer.from_pretrained("D:\\jpdir\\bert\\bertchinese")
model = BertForSequenceClassification.from_pretrained("D:\\jpdir\\bert\\bertchinese",config=config)
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint8 = np.dtype([("qint8", np.int8, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint16 = np.dtype([("qint16", np.int16, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint32 = np.dtype([("qint32", np.int32, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
np_resource = np.dtype([("resource", np.ubyte, 1)])
加载数据文件
# Parse the training file: each line is "<content>\t<label>".
# NOTE(review): the names are misleading — x_train holds the *texts* and
# x_test holds the *labels* of train.txt (this is not a train/test split);
# kept as-is in case later cells rely on these names.
x_train=[]
x_test=[]
with open("D:\\jpdir\\bert\\bertdata\\Multi-classification\\train.txt","r",encoding="utf-8") as f:
    for line in f:
        fields = line.split("\t")
        x_train.append(fields[0])
        x_test.append(fields[1].replace("\n",""))
# Same layout for the test file: y_train = texts, y_test = labels of test.txt.
y_train=[]
y_test=[]
with open("D:\\jpdir\\bert\\bertdata\\Multi-classification\\test.txt","r",encoding="utf-8") as f:
    for line in f:
        fields = line.split("\t")
        y_train.append(fields[0])
        y_test.append(fields[1].replace("\n",""))
定义数据
class CustomDataset(Dataset):
    """Dataset over a tab-separated text file: one "<content>\t<label>" per line."""

    def __init__(self,data_path):
        # Keep the raw lines in memory; parsing is deferred to __getitem__.
        with open(data_path,"r",encoding="utf-8") as f:
            self.data = f.readlines()

    def __len__(self):
        # Number of lines == number of samples.
        return len(self.data)

    def __getitem__(self, index):
        # Split once: first field is the text, second field is the label
        # (trailing newline and stray double quotes stripped).
        fields = self.data[index].split("\t")
        content = fields[0]
        label = fields[1].replace("\n","").replace("\"","")
        return content,label
实例化数据集
# Instantiate the datasets over the tab-separated train/test files.
train_data= CustomDataset("D:\\jpdir\\bert\\bertdata\\Multi-classification\\train.txt")
test_data= CustomDataset("D:\\jpdir\\bert\\bertdata\\Multi-classification\\test.txt")
len(train_data),len(test_data)  # -> (4610, 4768) in the recorded run
(4610, 4768)
使用loader加载数据
设定最大句子长度
# Maximum sentence length (in wordpiece tokens), shared by encoding and
# padding. (Name keeps the original's spelling: other cells reference it.)
maxlenhth=32

def add_padding(data):
    """Right-pad *data* (a list of token ids) with 0 up to maxlenhth, in place.

    0 is the [PAD] token index here. Sequences already at or beyond
    maxlenhth are returned unchanged. Returns the same (mutated) list.
    """
    if len(data)<maxlenhth:
        # list.extend is the idiomatic way to append n zeros; the original
        # looped over torch.arange just to count iterations.
        data.extend([0]*(maxlenhth-len(data)))
    return data
定义加collate_fn函数
这里处理tokenizer和padding
def collate_fn(batchData,tokenizer):
    """Collate (content, label) pairs into tensors for DataLoader.

    batchData: list of (sentence, label-string) pairs from CustomDataset.
    Returns (LongTensor[batch, maxlenhth] of token ids, LongTensor[batch]).
    """
    sentences = [pair[0] for pair in batchData]
    labels = [int(pair[1]) for pair in batchData]
    # encode() adds [CLS]/[SEP] and truncates to maxlenhth (was a hard-coded
    # 32; now kept consistent with the module-level setting), then
    # add_padding right-pads with 0 to a uniform length.
    encoded = [
        add_padding(tokenizer.encode(one, max_length=maxlenhth, add_special_tokens=True))
        for one in sentences
    ]
    return torch.tensor(encoded), torch.tensor(labels)
使用DataLoader加载数据
# Batch size 5; the lambda binds the module-level tokenizer into collate_fn.
loader = DataLoader(train_data, 5, shuffle=True,collate_fn=lambda x:collate_fn(x,tokenizer))
data_iter = iter(loader)
print(len(data_iter))  # number of batches: ceil(4610 / 5) = 922
# Peek at one batch to sanity-check shapes: ids are (5, 32), labels (5,).
data = next(data_iter)
"长度:",len(data[0]),"data[0]:",data[0],"data[1]:",data[1],"data:",data,data[0].size(),data[1].unsqueeze(1).size()
922
('长度:',
5,
'data[0]:',
tensor([[ 101, 517, 682, 1957, 3187, 3127, 518, 3119, 6228, 1086, 1932, 1094,
3209, 3241, 677, 4028, 1920, 5310, 2229, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2349, 7561, 680, 2357, 3306, 2199, 6158, 5739, 1744, 1957, 4374,
2970, 6224, 1217, 2135, 2196, 4265, 2900, 3189, 1377, 2521, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 4242, 6946, 3215, 3777, 9560, 7555, 4680, 8183, 2398, 6629, 122,
118, 124, 2233, 1762, 1545, 1059, 3621, 8380, 2835, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 3791, 1744, 8226, 674, 782, 7770, 5440, 868, 3152, 1091, 100,
3152, 1265, 3221, 1415, 886, 782, 2814, 3289, 100, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 517, 7987, 722, 6484, 518, 100, 100, 2845, 1399, 2661, 5683,
2458, 1423, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]),
'data[1]:',
tensor([9, 7, 1, 3, 8]),
'data:',
(tensor([[ 101, 517, 682, 1957, 3187, 3127, 518, 3119, 6228, 1086, 1932, 1094,
3209, 3241, 677, 4028, 1920, 5310, 2229, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2349, 7561, 680, 2357, 3306, 2199, 6158, 5739, 1744, 1957, 4374,
2970, 6224, 1217, 2135, 2196, 4265, 2900, 3189, 1377, 2521, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 4242, 6946, 3215, 3777, 9560, 7555, 4680, 8183, 2398, 6629, 122,
118, 124, 2233, 1762, 1545, 1059, 3621, 8380, 2835, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 3791, 1744, 8226, 674, 782, 7770, 5440, 868, 3152, 1091, 100,
3152, 1265, 3221, 1415, 886, 782, 2814, 3289, 100, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 517, 7987, 722, 6484, 518, 100, 100, 2845, 1399, 2661, 5683,
2458, 1423, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]),
tensor([9, 7, 1, 3, 8])),
torch.Size([5, 32]),
torch.Size([5, 1]))
定义模型
测试预训练模型输出
BertForSequenceClassification的输入input_ids size是[batch_size,maxlength],labels的size是[batch_size,1]
input_ids 是中文转成设定的数字
labels是数据的分类标签
测试预训练模型输出
loss 损失值
logits 概率分布
# Smoke-test the pretrained model on one sentence.
# encode() adds [CLS]/[SEP]; unsqueeze(0) makes a batch of size 1.
input_ids = torch.tensor(tokenizer.encode("词汇阅读是关键 08年考研暑期英语复习全指南",max_length=32,add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
# With labels supplied, this transformers version returns (loss, logits).
outputs = model(input_ids, labels=labels)
print(outputs)
loss, logits = outputs
loss, logits
(tensor(2.2565, grad_fn=<NllLossBackward0>), tensor([[ 0.5478, -0.0462, -0.2125, -0.8165, 0.1208, -0.4684, -0.9593, 0.4391,
0.1320, -1.0400]], grad_fn=<AddmmBackward0>))
(tensor(2.2565, grad_fn=<NllLossBackward0>),
tensor([[ 0.5478, -0.0462, -0.2125, -0.8165, 0.1208, -0.4684, -0.9593, 0.4391,
0.1320, -1.0400]], grad_fn=<AddmmBackward0>))
定义自己的模型
# Define model (NOTE(review): this MLP is never used by the BERT fine-tuning
# below; it appears to be kept from the PyTorch quickstart as a reference).
class NeuralNetwork(nn.Module):
    """A small MLP over flattened 28x28 inputs: 784 -> 512 -> 512 -> 10."""

    def __init__(self):
        super().__init__()
        # Attribute names kept so the printed module repr is unchanged.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        flat = self.flatten(x)
        return self.linear_relu_stack(flat)
model1 = NeuralNetwork()  # NOTE(review): model1 is never trained or used below
print(model1)
NeuralNetwork(
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear_relu_stack): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=10, bias=True)
)
)
# Fine-tune the BERT classifier for one pass over the loader.
optimizer = torch.optim.AdamW(model.parameters(),lr=0.001)
# NOTE(review): criterion is never used — BertForSequenceClassification
# computes its own CrossEntropyLoss when labels are passed in.
criterion = torch.nn.CrossEntropyLoss()
model.train()
for step, (sentences, labels) in enumerate(loader):
    optimizer.zero_grad()
    # NOTE(review): unsqueeze(1) gives labels shape (batch, 1); this worked
    # in the recorded run — confirm against the installed transformers version.
    loss, logits = model(sentences, labels=labels.unsqueeze(1))
    loss.backward()
    optimizer.step()
    print(step, loss.item())
参考
https://blog.51cto.com/u_15127680/3841198