Straight to the code:
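The script below covers the whole pipeline for the sentiment-classification homework: load the labeled and unlabeled Twitter data, train a word2vec embedding, train an LSTM classifier, then do one round of self-training by pseudo-labeling the unlabeled set with the first model and retraining with the confident predictions added.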
import os
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch.utils import data
from gensim.models import word2vec
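# ----------------------------------------------------------------------
# Data loading. In training_label.txt each line looks like
# "label <sep> word1 word2 ...": after splitting on spaces the label is
# token 0, a separator token sits at index 1 (in the original course
# dataset this is '+++$+++'), and the sentence starts at index 2.
# ----------------------------------------------------------------------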
def load_training_data(path='training_label.txt'):
if 'training_label' in path:
with open(path, 'r',encoding='UTF-8') as f:
lines = f.readlines()
lines = [line.strip('\n').split(' ') for line in lines]
x = [line[2:] for line in lines]
y = [line[0] for line in lines]
return x, y
else:
with open(path, 'r',encoding='UTF-8') as f:
lines = f.readlines()
x = [line.strip('\n').split(' ') for line in lines]
return x
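# testing_data.txt is a csv with a header row and "id,text" lines; drop the
# header, drop the id column, and split each sentence on spaces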
def load_testing_data(path='testing_data.txt'):
with open(path, 'r',encoding='UTF-8') as f:
lines = f.readlines()
X = ["".join(line.strip('\n').split(",")[1:]).strip() for line in lines[1:]]
X = [sen.split(' ') for sen in X]
return X
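# read back the pseudo-labels written by testing_nolable() below and keep
# only the confident ones (values clamped to exactly 0 or 1)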
def load_no_label(path_x,path_y):
y=[]
x=[]
Index=[]
with open(path_y, 'r',encoding='UTF-8') as f:
lines = f.readlines()
        for line in lines[1:]:
            sen = line.split(",")
            # testing_nolable() clamps confident outputs to exactly 0 or 1,
            # so "0.0..." is a confident negative and a leading '1' a
            # confident positive; everything in between is discarded
            if sen[1][0] == '0' and sen[1][2] == '0':
                y.append('0')
                Index.append(sen[0])
            elif sen[1][0] == '1':
                y.append('1')
                Index.append(sen[0])
with open(path_x, 'r',encoding='UTF-8') as f:
lines = f.readlines()
lines = [line.strip('\n').split(' ') for line in lines]
        # look up each kept sentence by its integer id
        for index in Index:
            x.append(lines[int(index)])
    return x, y
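# threshold sigmoid outputs at 0.5 and count correct predictions
# (note: this modifies `outputs` in place)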
def evaluation(outputs, labels):
outputs[outputs>=0.5] = 1
outputs[outputs<0.5] = 0
correct = torch.sum(torch.eq(outputs, labels)).item()
return correct
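# skip-gram (sg=1) word2vec with 256-dim vectors; these are the gensim 3.x
# keyword names -- in gensim 4.x `size` and `iter` became `vector_size` and `epochs`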
def train_word2vec(x):
model = word2vec.Word2Vec(x, size=256, window=15, min_count=5, workers=12, iter=10, sg=1)
return model
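# Preprocess turns tokenized sentences into fixed-length index tensors and
# builds the embedding matrix from the trained word2vec model, appending
# randomly initialized vectors for the special <PAD> and <UNK> tokens.
# (embedding.wv.vocab is also gensim 3.x API; gensim 4.x uses wv.key_to_index.)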
class Preprocess():
def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
self.w2v_path = w2v_path
self.sentences = sentences
self.sen_len = sen_len
self.idx2word = []
self.word2idx = {}
self.embedding_matrix = []
def get_w2v_model(self):
self.embedding = word2vec.Word2Vec.load(self.w2v_path)
self.embedding_dim = self.embedding.vector_size
print(f"embedding_dim is {self.embedding_dim}")
def add_embedding(self, word):
vector = torch.empty(1, self.embedding_dim)
torch.nn.init.uniform_(vector)
self.word2idx[word] = len(self.word2idx)
self.idx2word.append(word)
self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
def make_embedding(self, load=True):
print("Get embedding ...")
if load:
print("loading word to vec model ...")
self.get_w2v_model()
else:
raise NotImplementedError
for i, word in enumerate(self.embedding.wv.vocab):
print('get words #{}'.format(i+1), end='\r')
self.word2idx[word] = len(self.word2idx)
self.idx2word.append(word)
self.embedding_matrix.append(self.embedding.wv[word])
print(' ')
        # stack into one numpy array first; torch.tensor on a list of
        # separate arrays is extremely slow
        self.embedding_matrix = torch.tensor(np.array(self.embedding_matrix))
self.add_embedding("<PAD>")
self.add_embedding("<UNK>")
print("total words: {}".format(len(self.embedding_matrix)))
return self.embedding_matrix
def pad_sequence(self, sentence):
if len(sentence) > self.sen_len:
sentence = sentence[:self.sen_len]
else:
pad_len = self.sen_len - len(sentence)
for _ in range(pad_len):
sentence.append(self.word2idx["<PAD>"])
assert len(sentence) == self.sen_len
return sentence
def sentence_word2idx(self):
sentence_list = []
for i, sen in enumerate(self.sentences):
print('sentence count #{}'.format(i+1), end='\r')
sentence_idx = []
for word in sen:
                if word in self.word2idx:
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
sentence_idx = self.pad_sequence(sentence_idx)
sentence_list.append(sentence_idx)
return torch.LongTensor(sentence_list)
def labels_to_tensor(self, y):
y = [int(label) for label in y]
return torch.LongTensor(y)
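# thin Dataset wrapper; pass y=None for the unlabeled/test case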
class TwitterDataset(data.Dataset):
"""
Expected data shape like:(data_num, data_len)
Data can be a list of numpy array or a list of lists
input data shape : (data_num, seq_len, feature_dim)
__len__ will return the number of data
"""
def __init__(self, X, y):
self.data = X
self.label = y
def __getitem__(self, idx):
if self.label is None: return self.data[idx]
return self.data[idx], self.label[idx]
def __len__(self):
return len(self.data)
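# build train/validation DataLoaders; with if_add_nolable=True the
# confidently pseudo-labeled sentences are appended to the training split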
def load_train_val(train_x, y, sen_len, batch_size, path_prefix, w2v_path, if_add_nolable=False):
    if if_add_nolable:
        train_x_no_label, train_y_no_label = load_no_label(
            os.path.join(path_prefix, 'training_nolabel.txt'),
            os.path.join(path_prefix, 'predict_nolable.txt'))
        preprocess = Preprocess(train_x + train_x_no_label, sen_len, w2v_path=w2v_path)
        embedding = preprocess.make_embedding(load=True)
        train_x = preprocess.sentence_word2idx()
        z = y + train_y_no_label
        z = preprocess.labels_to_tensor(z)
        # the labeled set has 200,000 sentences: 180,000 for training,
        # the next 20,000 for validation; all pseudo-labeled data go to training
        X_train = torch.cat((train_x[:180000], train_x[200000:]), dim=0)
        X_val = train_x[180000:200000]
        y_train = torch.cat((z[:180000], z[200000:]), dim=0)
        y_val = z[180000:200000]
else:
preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)
X_train = train_x[:180000]
X_val = train_x[180000:]
y_train = y[:180000]
y_val = y[180000:]
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
batch_size = batch_size,
shuffle = True,
num_workers = 0)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
batch_size = batch_size,
shuffle = False,
num_workers = 0)
return train_loader,val_loader,embedding
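# LSTM classifier on top of (optionally frozen) word2vec embeddings; only
# the hidden state of the last time step feeds the classifier head, and
# dropout is applied only there, not between LSTM layers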
class LSTM_Net(nn.Module):
def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
super(LSTM_Net, self).__init__()
self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
self.embedding.weight = torch.nn.Parameter(embedding)
self.embedding.weight.requires_grad = False if fix_embedding else True
self.embedding_dim = embedding.size(1)
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.dropout = dropout
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=False)
for name, param in self.lstm.named_parameters():
if name.startswith("weight"):
nn.init.orthogonal_(param, gain=1)
else:
nn.init.zeros_(param)
self.classifier = nn.Sequential( nn.Dropout(dropout),
nn.Linear(hidden_dim, 1),
nn.Sigmoid() )
def forward(self, inputs):
inputs = self.embedding(inputs)
x, _ = self.lstm(inputs, None)
x = x[:, -1, :]
x = self.classifier(x)
return x
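# standard training loop with BCE loss (the model already ends in a sigmoid);
# saves the full model whenever validation accuracy improves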
def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
criterion = nn.BCELoss()
t_batch = len(train)
v_batch = len(valid)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
total_loss, total_acc, best_acc = 0, 0, 0
for epoch in range(n_epoch):
total_loss, total_acc = 0, 0
model.train()
for i, (inputs, labels) in enumerate(train):
inputs = inputs.to(device, dtype=torch.long)
labels = labels.to(device, dtype=torch.float)
optimizer.zero_grad()
outputs = model(inputs)
            # squeeze only dim 1 so a final batch of size 1 keeps its batch dim
            outputs = outputs.squeeze(1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
            correct = evaluation(outputs, labels)
            # use the actual batch size so the smaller final batch is counted correctly
            total_acc += (correct / labels.size(0))
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
                epoch+1, i+1, t_batch, loss.item(), correct*100/labels.size(0)), end='\r')
print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))
model.eval()
with torch.no_grad():
total_loss, total_acc = 0, 0
for i, (inputs, labels) in enumerate(valid):
inputs = inputs.to(device, dtype=torch.long)
labels = labels.to(device, dtype=torch.float)
outputs = model(inputs)
                outputs = outputs.squeeze(1)
loss = criterion(outputs, labels)
                correct = evaluation(outputs, labels)
                total_acc += (correct / labels.size(0))
total_loss += loss.item()
print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
if total_acc > best_acc:
best_acc = total_acc
torch.save(model, "{}/ckpt.model".format(model_dir))
print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
print('-----------------------------------------------')
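# hard 0/1 predictions for the test set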
def testing(batch_size, test_loader, model, device):
model.eval()
ret_output = []
with torch.no_grad():
for i, inputs in enumerate(test_loader):
inputs = inputs.to(device, dtype=torch.long)
outputs = model(inputs)
            outputs = outputs.squeeze(1)
            # hard decision at 0.5
            outputs[outputs >= 0.5] = 1
            outputs[outputs < 0.5] = 0
ret_output += outputs.int().tolist()
return ret_output
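# wrap a tokenized, unlabeled corpus in a DataLoader for prediction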
def prepare(test_x,sen_len,batch_size,w2v_path):
print("loading data ...")
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
batch_size = batch_size,
shuffle = False,
num_workers = 0)
return test_loader
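# pseudo-labeling for self-training: clamp confident outputs (>0.9 -> 1,
# <0.1 -> 0) and keep the raw probability otherwise, so load_no_label()
# can filter out the uncertain ones later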
def testing_nolable(batch_size, test_loader, model, device):
model.eval()
ret_output = []
with torch.no_grad():
for i, inputs in enumerate(test_loader):
inputs = inputs.to(device, dtype=torch.long)
outputs = model(inputs)
            outputs = outputs.squeeze(1)
            outputs[outputs > 0.9] = 1
            outputs[outputs < 0.1] = 0
ret_output += outputs.tolist()
return ret_output
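# ----------------------------------------------------------------------
# Main script: train word2vec, train the first LSTM, pseudo-label the
# unlabeled data, then retrain with the confident pseudo-labels added.
# ----------------------------------------------------------------------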
# change this to wherever the dataset lives on your machine
path_prefix = 'C:\\Users\\13554\\jupyter practice\\lihongyi\\hw4'
print("loading training data ...")
train_x, y = load_training_data(os.path.join(path_prefix,'training_label.txt'))
train_x_no_label = load_training_data(os.path.join(path_prefix,'training_nolabel.txt'))
test_x = load_testing_data(os.path.join(path_prefix, 'testing_data.txt'))
model = train_word2vec(train_x + test_x + train_x_no_label)
print("saving model ...")
model.save(os.path.join(path_prefix, 'w2v_all.model'))
w2v_path = os.path.join(path_prefix, 'w2v_all.model')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device: {device}")
model_dir = path_prefix
sen_len = 20
fix_embedding = True
batch_size = 256
epoch = 15
lr = 0.001
train_loader, val_loader, embedding = load_train_val(train_x, y, sen_len, batch_size, path_prefix, w2v_path, if_add_nolable=False)
model2 = LSTM_Net(embedding, embedding_dim=256, hidden_dim=128, num_layers=4, dropout=0.5, fix_embedding=True)
model2 = model2.to(device)
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model2, device)
test_loader = prepare(train_x_no_label, sen_len, batch_size, w2v_path)
epoch = 5  # fewer epochs for the retraining round
# load the best checkpoint; on PyTorch >= 2.6 loading a full pickled model
# may additionally require weights_only=False
model3 = torch.load(os.path.join(path_prefix, 'ckpt.model'))
outputs = testing_nolable(batch_size, test_loader, model3, device)
tmp = pd.DataFrame({"id":[str(i) for i in range(len(train_x_no_label))],"label":outputs})
print("save txt ...")
tmp.to_csv(os.path.join(path_prefix, 'predict_nolable.txt'), index=False)
print("Finish Predicting")
train_loader, val_loader, embedding = load_train_val(train_x, y, sen_len, batch_size, path_prefix, w2v_path, if_add_nolable=True)
model2 = LSTM_Net(embedding, embedding_dim=256, hidden_dim=128, num_layers=1, dropout=0.5, fix_embedding=True)
model2 = model2.to(device)
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model2, device)