ERNIE for text classification (with a TextCNN baseline)
# config.py -- shared hyperparameters, pulled in by the training script via `from config import *`
import torch

batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 5
learning_rate = 5e-6
# dataset.py
import numpy as np
from torch.utils.data import Dataset
from transformers import BertTokenizer

class textDataset(Dataset):
    def __init__(self, df):
        # ERNIE ships with a BERT-style vocab, so BertTokenizer loads it directly
        self.tokenizer = BertTokenizer.from_pretrained('./ernie-base')
        self.input_ids = []
        self.token_type_ids = []
        self.attention_mask = []
        self.label_id = []
        self.load_data(df)

    def load_data(self, df):
        for index, line in df.iterrows():
            text = line['text']
            label = line['flag']
            token = self.tokenizer(text, add_special_tokens=True, padding='max_length',
                                   truncation=True, max_length=128)
            self.input_ids.append(np.array(token['input_ids']))
            self.token_type_ids.append(np.array(token['token_type_ids']))
            self.attention_mask.append(np.array(token['attention_mask']))
            self.label_id.append(label)

    def __getitem__(self, index):
        return (self.input_ids[index], self.token_type_ids[index],
                self.attention_mask[index], self.label_id[index])

    def __len__(self):
        return len(self.input_ids)
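# Usage sketch (not in the original): the dataset expects a DataFrame with
# 'text' and 'flag' columns, matching how the training script below reads its
# TSV files; the default collate turns the stored numpy arrays into tensors.
#
#     import pandas as pd
#     from torch.utils.data import DataLoader
#
#     df = pd.read_csv('./data/train_data', delimiter='\t').dropna()
#     loader = DataLoader(textDataset(df), batch_size=64, shuffle=True)
#     input_ids, token_type_ids, attention_mask, label_id = next(iter(loader))
#     print(input_ids.shape)  # (64, 128) after max_length padding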
# model.py
import torch
import torch.nn as nn
from transformers import BertModel

class ErnieClassifier(nn.Module):
    def __init__(self, bert_config, num_labels):
        super().__init__()
        # Load the pretrained ERNIE weights; BertModel(config=bert_config) alone
        # would build a randomly initialized encoder
        self.bert = BertModel.from_pretrained('./ernie-base', config=bert_config)
        self.classifier = nn.Linear(bert_config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                                token_type_ids=token_type_ids)
        pooled = bert_output[1]  # pooler output for the [CLS] token
        logits = self.classifier(pooled)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so feeding it softmax probabilities would distort the loss
        return logits

class ErineLstmClassifier(nn.Module):  # class name kept as-is; it is imported under this spelling
    def __init__(self, bert_config, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('./ernie-base', config=bert_config)
        self.lstm = nn.LSTM(input_size=bert_config.hidden_size, hidden_size=bert_config.hidden_size,
                            num_layers=2, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(bert_config.hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        last_hidden_state = outputs.last_hidden_state
        out, _ = self.lstm(last_hidden_state)
        # Classify from the last timestep of the BiLSTM output (forward and
        # backward states concatenated); inputs are padded to a fixed length of 128
        logits = self.classifier(out[:, -1, :])
        return logits
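# Minimal shape check for the classifiers above (a sketch, assuming
# './ernie-base' holds a HuggingFace-format ERNIE checkpoint):
#
#     from transformers import BertConfig
#
#     cfg = BertConfig.from_pretrained('./ernie-base')
#     model = ErnieClassifier(cfg, num_labels=2)
#     ids = torch.ones(2, 128, dtype=torch.long)     # dummy token ids
#     mask = torch.ones(2, 128, dtype=torch.long)    # attend to every position
#     types = torch.zeros(2, 128, dtype=torch.long)  # single-segment input
#     logits = model(ids, attention_mask=mask, token_type_ids=types)
#     print(logits.shape)  # torch.Size([2, 2]) -- raw logits, one row per example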
# TextCNN baseline (non-pretrained), after
# 'Convolutional Neural Networks for Sentence Classification' (Kim, 2014)
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Config(object):
    """Configuration parameters"""
    def __init__(self, embedding):
        self.model_name = 'TextCNN'
        self.train_path = './data/train_data'
        self.test_path = './data/test_data'
        self.vocab_path = './data/vocab.pkl'
        self.log_path = './log/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load('./data/' + embedding)["embeddings"].astype('float32')) \
            if embedding != 'random' else None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.dropout = 0.5
        self.require_improvement = 1000   # stop early if no gain after this many batches
        self.num_classes = 2
        self.n_vocab = 0                  # filled in after the vocab is built
        self.num_epochs = 10
        self.batch_size = 128
        self.pad_size = 32                # pad/truncate every sequence to this length
        self.learning_rate = 1e-3
        self.embed = self.embedding_pretrained.size(1) \
            if self.embedding_pretrained is not None else 300
        self.filter_sizes = (2, 3, 4)     # convolution kernel heights
        self.num_filters = 256            # channels per kernel size

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)             # (batch, num_filters, seq_len - k + 1)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # max over time -> (batch, num_filters)
        return x

    def forward(self, x):
        out = self.embedding(x[0])  # x is (token_ids, seq_len); embed -> (batch, seq, embed)
        out = out.unsqueeze(1)      # add a channel dim for Conv2d
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out
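# Smoke test for the TextCNN above (shapes only; the 'random' embedding path
# and the vocab size are arbitrary choices for illustration):
#
#     config = Config('random')
#     config.n_vocab = 5000
#     model = Model(config)
#     ids = torch.randint(0, config.n_vocab - 1, (4, config.pad_size))  # (batch, seq_len)
#     seq_len = torch.full((4,), config.pad_size, dtype=torch.long)     # unused by forward
#     logits = model((ids, seq_len))
#     print(logits.shape)  # torch.Size([4, 2]) -- one score per class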
# Training and evaluation utilities for the TextCNN baseline
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time

def init_network(model, method='xavier', exclude='embedding', seed=123):
    # Initialize all weights except the (possibly pretrained) embedding layer
    for name, w in model.named_parameters():
        if exclude not in name:
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)

def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(config.num_epochs):
        losses = 0
        accuracy = 0
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            model.zero_grad()
            outputs = model(trains)
            loss = criterion(outputs, labels)
            losses += loss.item()
            predic = torch.max(outputs.data, 1)[1]
            train_acc = metrics.accuracy_score(labels.data.cpu().numpy(), predic.cpu().numpy())
            accuracy += train_acc
            loss.backward()
            optimizer.step()
        average_loss = losses / len(train_iter)
        average_acc = accuracy / len(train_iter)
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)
    test(config, model, test_iter)

def test(config, model, test_iter):
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)

def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss.item()  # accumulate a Python float, not a graph tensor
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)
# train.py -- fine-tune ERNIE on the classification data
import os
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer, AdamW, BertConfig
from torch.utils.data import DataLoader
from model import ErineLstmClassifier, ErnieClassifier
from dataset import textDataset
from tqdm import tqdm
from sklearn import metrics
from config import *

def main():
    train_df = pd.read_csv('./data/train_data', delimiter='\t').dropna()
    test_df = pd.read_csv('./data/test_data', delimiter='\t').dropna()
    print(train_df.head())
    train_dataset = textDataset(train_df)
    valid_dataset = textDataset(test_df)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    bert_config = BertConfig.from_pretrained('./ernie-base')
    num_labels = len(set(train_dataset.label_id))
    print(num_labels)
    model = ErineLstmClassifier(bert_config, num_labels).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Up-weight class 0 to counter class imbalance; without the weight argument
    # this tensor would be defined but never used
    class_weight = torch.tensor([10.0, 1.0]).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weight)
    best_f1 = 0
    for epoch in range(1, epochs + 1):
        losses = 0
        accuracy = 0
        model.train()
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        train_bar = tqdm(train_dataloader, ncols=100)
        for input_ids, token_type_ids, attention_mask, label_id in train_bar:
            model.zero_grad()
            train_bar.set_description('Epoch %i train' % epoch)
            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )
            loss = criterion(output, label_id.to(device))
            losses += loss.item()
            pred_labels = torch.argmax(output, dim=1)
            acc = torch.sum(pred_labels == label_id.to(device)).item() / len(pred_labels)
            accuracy += acc
            loss.backward()
            optimizer.step()
            train_bar.set_postfix(loss=loss.item(), acc=acc)
        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)

        model.eval()
        losses = 0
        pred_labels = []
        true_labels = []
        valid_bar = tqdm(valid_dataloader, ncols=100)
        with torch.no_grad():  # no gradients needed during validation
            for input_ids, token_type_ids, attention_mask, label_id in valid_bar:
                valid_bar.set_description('Epoch %i valid' % epoch)
                output = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    token_type_ids=token_type_ids.to(device),
                )
                loss = criterion(output, label_id.to(device))
                losses += loss.item()
                pred_label = torch.argmax(output, dim=1)
                acc = torch.sum(pred_label == label_id.to(device)).item() / len(pred_label)
                valid_bar.set_postfix(loss=loss.item(), acc=acc)
                pred_labels.extend(pred_label.cpu().numpy().tolist())
                true_labels.extend(label_id.numpy().tolist())
        average_loss = losses / len(valid_dataloader)
        print('\tLoss:', average_loss)
        report = metrics.classification_report(true_labels, pred_labels)
        print('* Classification Report:')
        print(report)
        f1 = metrics.f1_score(true_labels, pred_labels, average='micro')
        if not os.path.exists('models'):
            os.makedirs('models')
        if f1 > best_f1:
            best_f1 = f1
            # Save into the 'models' directory created above
            torch.save(model.state_dict(), 'models/best_model.pkl')

if __name__ == '__main__':
    main()
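# Hedged inference sketch (not part of the original script): reload the saved
# weights and classify a single sentence. The save path matches the training
# loop above; num_labels=2 and the argmax-as-label mapping are assumptions.
#
#     def predict(text):
#         tokenizer = BertTokenizer.from_pretrained('./ernie-base')
#         bert_config = BertConfig.from_pretrained('./ernie-base')
#         model = ErineLstmClassifier(bert_config, num_labels=2).to(device)
#         model.load_state_dict(torch.load('models/best_model.pkl', map_location=device))
#         model.eval()
#         token = tokenizer(text, padding='max_length', truncation=True,
#                           max_length=128, return_tensors='pt')
#         with torch.no_grad():
#             logits = model(token['input_ids'].to(device),
#                            attention_mask=token['attention_mask'].to(device),
#                            token_type_ids=token['token_type_ids'].to(device))
#         return torch.argmax(logits, dim=1).item()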
# utils.py -- vocab building, dataset loading, and pretrained-embedding
# extraction for the TextCNN baseline
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta

MAX_VOCAB_SIZE = 10000
UNK, PAD = '<UNK>', '<PAD>'

def build_vocab(file_path, tokenizer, max_size, min_freq):
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
    # Keep tokens seen at least min_freq times, most frequent first, capped at max_size
    vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq],
                        key=lambda x: x[1], reverse=True)[:max_size]
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
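# Tiny worked example of the vocab format this produces (character-level
# tokenizer; ids ordered by frequency, with <UNK> and <PAD> appended last).
# The temporary file stands in for a real train file:
#
#     import tempfile
#
#     with tempfile.NamedTemporaryFile('w', delete=False, encoding='UTF-8') as tmp:
#         tmp.write('abca\t1\n')
#     vocab = build_vocab(tmp.name, tokenizer=lambda x: [ch for ch in x],
#                         max_size=10, min_freq=1)
#     print(vocab)  # {'a': 0, 'b': 1, 'c': 2, '<UNK>': 3, '<PAD>': 4}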
def build_dataset(config, use_word):
    if use_word:
        tokenizer = lambda x: x.split(' ')    # word level: tokens are space-separated
    else:
        tokenizer = lambda x: [y for y in x]  # char level: one token per character
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer,
                            max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for index, line in enumerate(tqdm(f)):
                lin = line.strip()
                if not lin:
                    continue
                # Skip the header row and any malformed line
                if index == 0 or len(lin.split('\t')) != 2:
                    continue
                content, label = lin.split('\t')
                words_line = []
                token = tokenizer(content)
                seq_len = len(token)
                if pad_size:
                    if len(token) < pad_size:
                        token.extend([PAD] * (pad_size - len(token)))
                    else:
                        token = token[:pad_size]
                        seq_len = pad_size
                for word in token:
                    words_line.append(vocab.get(word, vocab.get(UNK)))
                contents.append((words_line, int(label), seq_len))
        return contents

    train = load_dataset(config.train_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return vocab, train, test
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        # residue marks a final, smaller batch when len(batches) is not a multiple
        # of batch_size (the original code's modulo by n_batches was a bug)
        self.residue = len(batches) % batch_size != 0
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            return self._to_tensor(batches)
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            return self._to_tensor(batches)

    def __iter__(self):
        return self

    def __len__(self):
        return self.n_batches + 1 if self.residue else self.n_batches

def build_iterator(dataset, config):
    return DatasetIterater(dataset, config.batch_size, config.device)
def get_time_dif(start_time):
    """Return elapsed wall-clock time as a timedelta."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
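# Run-script sketch wiring these utilities to the TextCNN Config/Model and the
# train() routine defined earlier. Assumptions: the embedding archive sits under
# './data/' (the extraction block below writes to './THUCNews/data/', so the
# paths differ), and the test set doubles as the dev split since the data has
# only train/test files.
#
#     config = Config('embedding_SougouNews.npz')
#     vocab, train_data, test_data = build_dataset(config, use_word=False)
#     config.n_vocab = len(vocab)
#     train_iter = build_iterator(train_data, config)
#     test_iter = build_iterator(test_data, config)
#     model = Model(config).to(config.device)
#     init_network(model)
#     train(config, model, train_iter, test_iter, test_iter)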
if __name__ == "__main__":
    '''Extract pretrained word vectors'''
    train_dir = "./THUCNews/data/train.txt"
    vocab_dir = "./THUCNews/data/vocab.pkl"
    pretrain_dir = "./THUCNews/data/sgns.sogou.char"
    emb_dim = 300
    filename_trimmed_dir = "./THUCNews/data/embedding_SougouNews"
    if os.path.exists(vocab_dir):
        word_to_id = pkl.load(open(vocab_dir, 'rb'))
    else:
        tokenizer = lambda x: [y for y in x]
        word_to_id = build_vocab(train_dir, tokenizer=tokenizer,
                                 max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(word_to_id, open(vocab_dir, 'wb'))
    # Start from random vectors, then overwrite every word found in the
    # pretrained Sogou character embeddings
    embeddings = np.random.rand(len(word_to_id), emb_dim)
    with open(pretrain_dir, "r", encoding='UTF-8') as f:
        for i, line in enumerate(f):
            lin = line.strip().split(" ")
            if lin[0] in word_to_id:
                idx = word_to_id[lin[0]]
                emb = [float(x) for x in lin[1:301]]
                embeddings[idx] = np.asarray(emb, dtype='float32')
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
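# Sanity check that the saved archive has the layout Config expects
# (np.savez_compressed appends the .npz extension to filename_trimmed_dir):
#
#     emb = np.load('./THUCNews/data/embedding_SougouNews.npz')["embeddings"]
#     print(emb.shape)  # (len(word_to_id), 300)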