Since BERT took off, many BERT variants have appeared. Here we use the Huggingface library to implement a simple text classifier, and in the process get a closer look at how BERT is used in practice through Huggingface.
1、load data
```python
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from transformers import BertTokenizer, BertModel

# train.tsv is tab-separated; note the delimiter is '\t', not 't'
train_df = pd.read_csv('../data/train.tsv', delimiter='\t', names=['text', 'label'])
print(train_df.shape)
train_df.head()

sentences = list(train_df['text'])
targets = train_df['label'].values
```
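Before tokenizing, it is worth eyeballing the class balance and text lengths; a minimal sanity-check sketch using the columns loaded above:

```python
# quick sanity check on the loaded data
print(train_df['label'].value_counts())       # class balance
print(train_df['text'].str.len().describe())  # rough length statistics (in characters)
```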
2、token encoding
```python
# if the encodings are wrapped into a custom Dataset/model class, max_length must be fixed
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 32
sentences_tokened = tokenizer(sentences, padding=True, truncation=True,
                              max_length=max_length, return_tensors='pt')
targets = torch.tensor(targets)
```
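A quick look at what the tokenizer returned helps confirm the padding/truncation settings; a minimal sketch using the objects defined above:

```python
# the tokenizer returns a dict of tensors with one row per sentence
print(sentences_tokened['input_ids'].shape)       # (num_sentences, seq_len), seq_len <= 32
print(sentences_tokened['attention_mask'].shape)  # same shape; 1 = real token, 0 = padding
print(tokenizer.decode(sentences_tokened['input_ids'][0]))  # e.g. "[CLS] ... [SEP] [PAD] ..."
```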
3、build dataset and dataloader
```python
from torch.utils.data import Dataset, DataLoader, random_split

class DataToDataset(Dataset):
    def __init__(self, encoding, labels):
        self.encoding = encoding
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return (self.encoding['input_ids'][index],
                self.encoding['attention_mask'][index],
                self.labels[index])

# wrap the encodings and labels, then do an 80/20 train/validation split
datasets = DataToDataset(sentences_tokened, targets)
train_size = int(len(datasets) * 0.8)
test_size = len(datasets) - train_size
print([train_size, test_size])
train_dataset, val_dataset = random_split(dataset=datasets, lengths=[train_size, test_size])

BATCH_SIZE = 64
# num_workers > 0 loads batches in worker processes
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=5)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=5)
```
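To verify the wiring before building the model, one batch can be pulled and inspected; a minimal sketch (the shapes shown assume BATCH_SIZE=64 and max_length=32):

```python
# pull one batch and check that ids, mask and labels line up
ids, mask, labels = next(iter(train_loader))
print(ids.shape, mask.shape, labels.shape)  # e.g. [64, 32], [64, 32], [64]
```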
4、create model
```python
class BertTextClassificationModel(nn.Module):
    def __init__(self):
        super(BertTextClassificationModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dense = nn.Linear(768, 2)  # 768 hidden size in, 2 classes out

    def forward(self, ids, mask):
        # recent transformers versions return a ModelOutput object, not a tuple
        out = self.bert(input_ids=ids, attention_mask=mask).last_hidden_state
        out = self.dense(out[:, 0, :])  # classify on the [CLS] token representation
        return out

mymodel = BertTextClassificationModel()

# pick the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device=", device)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    mymodel = nn.DataParallel(mymodel)
mymodel.to(device)
```
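Before training, a dry run on one batch catches shape mismatches early; a minimal sketch reusing train_loader from step 3:

```python
# dry run: the model should return one row of 2 logits per example
ids, mask, labels = next(iter(train_loader))
with torch.no_grad():
    logits = mymodel(ids.to(device), mask.to(device))
print(logits.shape)  # torch.Size([64, 2])
```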
5、train model
```python
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(mymodel.parameters(), lr=0.0001)

from sklearn.metrics import accuracy_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

epochs = 3
for epoch in range(epochs):
    mymodel.train()  # enable training mode (dropout etc.)
    train_loss = 0.0
    train_acc = 0.0
    for i, data in enumerate(train_loader):
        input_ids, attention_mask, labels = [elem.to(device) for elem in data]
        # reset the gradients
        optimizer.zero_grad()
        # forward pass
        out = mymodel(input_ids, attention_mask)
        # compute the loss
        loss = loss_func(out, labels)
        train_loss += loss.item()
        # backpropagate
        loss.backward()
        # update the parameters
        optimizer.step()
        # compute accuracy; move tensors back to the CPU before calling numpy()
        out = out.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        train_acc += flat_accuracy(out, labels)
    print("train %d/%d epochs Loss:%f, Acc:%f" % (epoch + 1, epochs, train_loss / (i + 1), train_acc / (i + 1)))
```
6、evaluate
- print("evaluate...")
- val_loss=0
- val_acc=0
- mymodel.eval()
- for j,batch in enumerate(val_loader):
- val_input_ids,val_attention_mask,val_labels=[elem.to(device) for elem in batch]
- with torch.no_grad():
- pred=mymodel(val_input_ids,val_attention_mask)
- val_loss+=loss_func(pred,val_labels)
- pred=pred.detach().cpu().numpy()
- val_labels=val_labels.detach().cpu().numpy()
- val_acc+=flat_accuracy(pred,val_labels)
- print("evaluate loss:%d, Acc:%d" %(val_loss/len(val_loader),val_acc/len(val_loader)))