Hung-Yi Lee Homework 4: RNN

(1) Assignment Description

  Input an English sentence and output 0 or 1 (label 1 if the sentence is positive, 0 if it is negative). An RNN is required.
  The downloaded data contains three files: training_label.txt, training_nolabel.txt, and testing_data.txt.
  training_label.txt looks like the following: the 1/0 at the start of each line is the sentiment label; +++$+++ is just a separator and can be ignored; after the separator comes the sentence whose sentiment is to be judged.
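  For illustration, lines in this format look like the following (the sentences are made up):

1 +++$+++ thanks ! i love it , you are the best
0 +++$+++ ugh my head hurts and i still have homework to do
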
  training_nolabel.txt contains raw, unlabeled sentences, one per line. Its data is not used when training the classifier, but it can be used when training the word vectors (the word2vec script below loads it, though it only trains on the labeled and test sentences).
  testing_data.txt looks like the following:
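  Illustrative lines (made-up sentences, shown only to indicate the layout; note the header row, which load_testing_data skips):

id,text
0,what a beautiful day , i feel great
1,this is the worst movie i have ever seen
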
  testing_data.txt has two columns separated by ',': the first column is the sentence id, and the second is the sentence.

(2) Implementation

  Code structure: utils.py, word2vec.py, data_preprocess.py, dataset.py, model.py, train.py, train_main.py, test.py, predict.py.

utils.py

  Defines assorted basic helper functions.

import warnings
warnings.filterwarnings('ignore')
import torch

def load_training_data(path='training_label.txt'):
    # read in the data needed for training
    # 'training_label.txt' comes with labels; 'training_nolabel.txt' does not
    if 'training_label' in path:
        with open(path, 'rb') as f:
            lines = f.readlines()
            lines = [line.decode().strip('\n').split(' ') for line in lines]
        x = [line[2:] for line in lines]
        y = [line[0] for line in lines]
        return x, y
    else:
        with open(path, 'rb') as f:
            lines = f.readlines()
            x = [line.decode().strip('\n').split(' ') for line in lines]
        return x

def load_testing_data(path='testing_data.txt'):
    # read in the data needed for testing
    with open(path, 'rb') as f:
        lines = f.readlines()
        X = ["".join(line.decode().strip('\n').split(",")[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X]
    return X

def evaluation(outputs, labels):
    # outputs => predicted probabilities (float); labels => ground-truth 0/1
    # note: the thresholding below modifies `outputs` in place
    outputs[outputs >= 0.5] = 1  # >= 0.5 counts as positive
    outputs[outputs < 0.5] = 0  # < 0.5 counts as negative
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct
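
  A quick sanity check of evaluation() (the tensors below are made-up values):

# illustrative usage of evaluation()
probs = torch.tensor([0.9, 0.2, 0.7, 0.4])
labels = torch.tensor([1., 0., 0., 0.])
print(evaluation(probs, labels))  # prints 3: three of the four thresholded predictions match the labels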

word2vec.py

  Trains the word vectors.

from gensim.models import word2vec
from utils import *


def train_word2vec(x):
    # train a skip-gram word2vec model (gensim < 4.0 parameter names: size, iter)
    model = word2vec.Word2Vec(x, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)
    return model


if __name__ == "__main__":
    print("loading training data ...")
    train_x, y = load_training_data('training_label.txt')
    train_x_no_label = load_training_data('training_nolabel.txt')

    print("loading testing data ...")
    test_x = load_testing_data('testing_data.txt')

    model = train_word2vec(train_x + test_x)  # train_x_no_label could be included here as well

    print("saving model ...")
    model.save('w2v_all.model')
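
  The call above uses the gensim < 4.0 parameter names. In gensim >= 4.0, size was renamed to vector_size and iter to epochs; a minimal sketch of the equivalent call (note that data_preprocess.py below also relies on the 3.x wv.vocab attribute, so the gensim version must be consistent across the project):

# gensim >= 4.0 equivalent of train_word2vec (only the keyword names change)
def train_word2vec_v4(x):
    return word2vec.Word2Vec(x, vector_size=250, window=5, min_count=5, workers=12, epochs=10, sg=1)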

data_preprocess.py

  Data preprocessing.

from torch import nn
from gensim.models import Word2Vec
import torch

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
    def get_w2v_model(self):
        # load the previously trained word2vec model
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
    def add_embedding(self, word):
        # add `word` to the embedding and give it a randomly initialized representation vector
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
    def make_embedding(self, load=True):
        print("Get embedding ...")
        # fetch the trained word2vec embedding
        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        # build a word2idx dictionary
        # build an idx2word list
        # build a word -> vector list (the embedding matrix)
        for i, word in enumerate(self.embedding.wv.vocab):  # gensim < 4.0 API
            print('get words #{}'.format(i+1), end='\r')
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding.wv[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        # 将 "<PAD>""<UNK>" 加进 embedding 里面
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    def pad_sequence(self, sentence):
        # pad/truncate every sentence to the same length
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence
    def sentence_word2idx(self):
        # convert the words in each sentence to their corresponding indices
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if word in self.word2idx:
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # make every sentence the same length
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # convert the labels to a tensor
        y = [int(label) for label in y]
        return torch.LongTensor(y)
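
  A minimal usage sketch of Preprocess (assumes w2v_all.model was produced by word2vec.py; the sentences are made up):

# illustrative usage of Preprocess
sentences = [['today', 'is', 'a', 'good', 'day'], ['hello', 'world']]
preprocess = Preprocess(sentences, sen_len=4, w2v_path='w2v_all.model')
embedding = preprocess.make_embedding(load=True)  # (vocab_size + 2, 250) matrix; +2 for <PAD> and <UNK>
idx_tensor = preprocess.sentence_word2idx()       # shape (2, 4): first sentence truncated, second padded with <PAD>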

dataset.py

  Defines the dataset structure.

import torch
from torch.utils import data


class TwitterDataset(data.Dataset):
    """
    Expected data shape like: (data_num, data_len)
    Data can be a list of numpy array or a list of lists
    input data shape : (data_num, seq_len, feature_dim)

    __len__ will return the number of data
    """

    def __init__(self, X, y):
        self.data = X
        self.label = y

    def __getitem__(self, idx):
        if self.label is None:
            return self.data[idx]
        return self.data[idx], self.label[idx]

    def __len__(self):
        return len(self.data)

model.py

  Defines the model.

import torch
from torch import nn
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # build the embedding layer from the pretrained matrix
        self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # whether to freeze the embedding; if fix_embedding is False, the embedding is fine-tuned during training
        self.embedding.weight.requires_grad = not fix_embedding
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x dimension (batch, seq_len, hidden_size)
        # take the LSTM hidden state at the last time step
        x = x[:, -1, :]
        x = self.classifier(x)
        return x
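
  A quick shape check with a random embedding matrix (illustrative values, not the real vocabulary):

# smoke test with a made-up embedding matrix
vocab_size, emb_dim = 1000, 250
dummy_embedding = torch.rand(vocab_size, emb_dim)
net = LSTM_Net(dummy_embedding, embedding_dim=emb_dim, hidden_dim=150, num_layers=1)
batch = torch.randint(0, vocab_size, (32, 20))  # (batch, seq_len) word indices
print(net(batch).shape)                         # torch.Size([32, 1]) -- one probability per sentence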

train.py

  Defines how the training loop updates the parameters and saves the model.

import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from utils import *

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train()  # put the model in train mode so the optimizer can update its parameters
    criterion = nn.BCELoss()  # binary cross-entropy loss
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr)  # hand the model's parameters to the optimizer with the chosen learning rate
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        # training
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long)  # with device "cuda", inputs becomes a torch.cuda.LongTensor
            labels = labels.to(device, dtype=torch.float)  # labels must be float because they are fed to criterion
            optimizer.zero_grad()  # loss.backward() accumulates gradients, so reset them after every batch
            outputs = model(inputs)  # feed the inputs to the model
            outputs = outputs.squeeze()  # drop the extra dimension so outputs fit criterion()
            loss = criterion(outputs, labels)  # training loss for this batch
            loss.backward()  # compute the gradient of the loss
            optimizer.step()  # update the parameters
            correct = evaluation(outputs, labels)  # training accuracy for this batch (thresholds outputs in place, after the loss has been computed)
            total_acc += (correct / batch_size)
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))

        # validation
        model.eval()  # put the model in eval mode so its parameters are fixed
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long)  # with device "cuda", inputs becomes a torch.cuda.LongTensor
                labels = labels.to(device, dtype=torch.float)  # labels must be float because they are fed to criterion
                outputs = model(inputs)  # feed the inputs to the model
                outputs = outputs.squeeze()  # drop the extra dimension so outputs fit criterion()
                loss = criterion(outputs, labels)  # validation loss for this batch
                correct = evaluation(outputs, labels)  # validation accuracy for this batch
                total_acc += (correct / batch_size)
                total_loss += loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            if total_acc > best_acc:
                # if this validation result beats every previous one, save the current model for later prediction
                best_acc = total_acc
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
        print('-----------------------------------------------')
        model.train()

train_main.py

  The full training pipeline, from data preprocessing to actually training the model.

import os
import torch
from utils import *
from data_preprocess import *
from model import *
from dataset import *
from train import *

if __name__ == "__main__":
    # use the GPU if one is available: device is "cuda" if so, otherwise "cpu"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # paths to the data files
    train_with_label = 'training_label.txt'
    train_no_label = 'training_nolabel.txt'
    testing_data = 'testing_data.txt'

    w2v_path = 'w2v_all.model'

    # set the sentence length, whether to fix the embedding, the batch size, the number of epochs, the learning rate, and the model directory
    sen_len = 20
    fix_embedding = True
    batch_size = 32
    epoch = 5
    lr = 0.001
    model_dir = 'model_dir'  # directory for the checkpoint model
    os.makedirs(model_dir, exist_ok=True)  # training() saves into this directory, so make sure it exists

    print("loading data ...") # 把 'training_label.txt''training_nolabel.txt' 讀進來
    train_x, y = load_training_data(train_with_label)
    train_x_no_label = load_training_data(train_no_label)

    # preprocess the inputs and labels
    preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
    embedding = preprocess.make_embedding(load=True)
    train_x = preprocess.sentence_word2idx()
    y = preprocess.labels_to_tensor(y)

    # instantiate the model
    model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
    model = model.to(device)

    # split the data into training data and validation data
    X_train, X_val, y_train, y_val = train_x[:180000], train_x[180000:], y[:180000], y[180000:]

    # wrap the data in Dataset objects
    train_dataset = TwitterDataset(X=X_train, y=y_train)
    val_dataset = TwitterDataset(X=X_val, y=y_val)

    # turn the data into batches of tensors
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                num_workers=2)

    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                batch_size=batch_size,
                                                shuffle=False,
                                                num_workers=2)

    # start training
    training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

test.py

  Defines the testing procedure.

import torch


def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs >= 0.5] = 1  # >= 0.5 counts as positive
            outputs[outputs < 0.5] = 0  # < 0.5 counts as negative
            ret_output += outputs.int().tolist()

    return ret_output

predict.py

  Runs prediction on the test set and saves the results to predict.csv.

import os
import pandas as pd
from data_preprocess import *
from dataset import *
from utils import *
from test import *

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    testing_data = 'testing_data.txt'

    w2v_path = 'w2v_all.model'
    sen_len = 20
    batch_size = 32
    model_dir = 'model_dir'

    print("loading testing data ...")
    test_x = load_testing_data(testing_data)
    preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
    embedding = preprocess.make_embedding(load=True)
    test_x = preprocess.sentence_word2idx()
    test_dataset = TwitterDataset(X=test_x, y=None)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                batch_size=batch_size,
                                                shuffle=False,
                                                num_workers=3)
    print('\nload model ...')
    model = torch.load(os.path.join(model_dir, 'ckpt.model'), map_location=device)  # map_location lets a GPU-trained checkpoint load on CPU
    outputs = testing(batch_size, test_loader, model, device)

    tmp = pd.DataFrame({"id": [str(i) for i in range(len(test_x))], "label":outputs})
    print("save csv ...")
    tmp.to_csv('predict.csv', index=False)
    print("Finish Predicting")

  predict.csv looks like the following:
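  (illustrative rows; the actual label values depend on the trained model)

id,label
0,1
1,0
2,1
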
  predict.csv has two columns: the first is the id of the sentence being classified, and the second is the predicted sentiment.
