Deep Learning Framework Notes [PyTorch (4)]: The PyTorch Text Data Modeling Workflow

Step 1. Data Preparation

This step uses the torchtext package. Its most common APIs are listed below (a minimal usage sketch follows the list):

  • torchtext.data.Example: represents a single sample, holding its data and label.
  • torchtext.vocab.Vocab: the vocabulary; pretrained word vectors can be loaded into it.
  • torchtext.data.Dataset: the dataset class; its __getitem__ returns an Example instance. torchtext.data.TabularDataset is a subclass.
  • torchtext.data.Field: defines how a field (text field, label field) is processed: the preprocessing applied when creating an Example, plus some batching-time operations.
  • torchtext.data.Iterator: the iterator used to generate batches.
  • torchtext.datasets: contains common benchmark datasets.
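For orientation, here is a minimal sketch (assuming the legacy torchtext API, version 0.8 and earlier) of how these classes fit together on tsv files like the ones used below; the field setup is illustrative, and the rest of this post builds the same pipeline by hand with plain PyTorch instead:

import torchtext

# text field: sequential, lowercased, padded/truncated to a fixed length
TEXT = torchtext.data.Field(sequential=True, lower=True, fix_length=200)
# label field: a single numeric value, no vocabulary needed
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

# TabularDataset parses each tsv row into an Example according to `fields`
ds_train, ds_test = torchtext.data.TabularDataset.splits(
    path='../data/imdb', train='train.tsv', test='test.tsv',
    format='tsv', fields=[('label', LABEL), ('text', TEXT)])

# build the Vocab from the training split; pretrained vectors could be passed here
TEXT.build_vocab(ds_train, max_size=10000)

# Iterators generate batches from the datasets
train_iter, test_iter = torchtext.data.Iterator.splits(
    (ds_train, ds_test), batch_sizes=(20, 20))

The hand-rolled version below starts from the raw tsv files: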
import numpy as np
import pandas as pd
import re, string

MAX_WORDS = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 200      # every sample is padded/truncated to 200 tokens
BATCH_SIZE = 20 
train_data_path = '../data/imdb/train.tsv'
test_data_path = '../data/imdb/test.tsv'
train_token_path = '../data/imdb/train_token.tsv'
test_token_path =  '../data/imdb/test_token.tsv'
train_samples_path = '../data/imdb/train_samples/'
test_samples_path =  '../data/imdb/test_samples/'

## Building the vocabulary

word_count_dict = {}

# Clean the raw text: lowercase, drop <br /> tags, strip punctuation
def clean_text(text):
    lowercase = text.lower().replace("\n", " ")
    stripped_html = re.sub('<br />', ' ', lowercase)
    cleaned_punctuation = re.sub('[%s]' % re.escape(string.punctuation), '', stripped_html)
    return cleaned_punctuation
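A quick check of what the cleaning does:

# clean_text("This movie was GREAT!<br />Loved it.")
# -> 'this movie was great loved it'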

with open(train_data_path, "r", encoding='utf-8') as f:
    for line in f:
        label, text = line.split("\t")
        cleaned_text = clean_text(text)
        for word in cleaned_text.split():  # split() rather than split(" "), so runs of spaces do not yield empty tokens
            word_count_dict[word] = word_count_dict.get(word, 0) + 1

df_word_dict = pd.DataFrame(pd.Series(word_count_dict, name="count"))
df_word_dict = df_word_dict.sort_values(by="count", ascending=False)
# ids 0 and 1 are reserved for the unknown token and the padding token,
# so only MAX_WORDS-2 words get ids, starting from 2
df_word_dict = df_word_dict[0:MAX_WORDS-2]
df_word_dict["word_id"] = range(2, MAX_WORDS)
word_id_dict = df_word_dict["word_id"].to_dict()
df_word_dict.head(10)


## Converting text to token ids with the vocabulary

def pad(data_list, pad_length):
    """Truncate to the last pad_length tokens, or left-pad with the padding id 1."""
    padded_list = data_list.copy()
    if len(data_list) > pad_length:
        padded_list = data_list[-pad_length:]
    if len(data_list) < pad_length:
        padded_list = [1] * (pad_length - len(data_list)) + data_list
    return padded_list
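Its behavior in both directions, with a small pad_length:

# pad([3, 4, 5], 5)          -> [1, 1, 3, 4, 5]  (left-padded with the padding id 1)
# pad([3, 4, 5, 6, 7, 8], 5) -> [4, 5, 6, 7, 8]  (only the last 5 tokens are kept)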

def text_to_token(text_file, token_file):
    with open(text_file, "r", encoding='utf-8') as fin, \
      open(token_file, "w", encoding='utf-8') as fout:
        for line in fin:
            label, text = line.split("\t")
            cleaned_text = clean_text(text)
            # unknown words map to id 0
            word_token_list = [word_id_dict.get(word, 0) for word in cleaned_text.split()]
            pad_list = pad(word_token_list, MAX_LEN)
            out_line = label + "\t" + " ".join([str(x) for x in pad_list])
            fout.write(out_line + "\n")

text_to_token(train_data_path,train_token_path)
text_to_token(test_data_path,test_token_path)

# Next, split the token file so that each file holds exactly one sample
import os

if not os.path.exists(train_samples_path):
    os.mkdir(train_samples_path)
    
if not os.path.exists(test_samples_path):
    os.mkdir(test_samples_path)
    
def split_samples(token_path, samples_dir):
    with open(token_path, "r", encoding='utf-8') as fin:
        for i, line in enumerate(fin):
            with open(samples_dir + "%d.txt" % i, "w", encoding="utf-8") as fout:
                fout.write(line)

split_samples(train_token_path,train_samples_path)
split_samples(test_token_path,test_samples_path)

## Creating the Dataset

import os
import torch  # needed here: __getitem__ builds tensors
from torch.utils.data import Dataset, DataLoader
class imdbDataset(Dataset):
    def __init__(self,samples_dir):
        self.samples_dir = samples_dir
        self.samples_paths = os.listdir(samples_dir)
        
    def __len__(self):
        return len(self.samples_paths)
    
    def __getitem__(self,index):
        path = self.samples_dir + self.samples_paths[index]
        with open(path,"r",encoding="utf-8") as f:
            line = f.readline()
            label,tokens = line.split("\t")
            label = torch.tensor([float(label)],dtype=torch.float)
            feature = torch.tensor([int(x) for x in tokens.split(" ")],dtype=torch.long)
            return (feature,label)

ds_train = imdbDataset(train_samples_path)
ds_test = imdbDataset(test_samples_path)

dl_train = DataLoader(ds_train,batch_size=BATCH_SIZE,shuffle=True,num_workers=4)
dl_test = DataLoader(ds_test,batch_size=BATCH_SIZE,num_workers=4)

for features,labels in dl_train:
    print(features)
    print(labels)
    break
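The batch above should have features of shape (BATCH_SIZE, MAX_LEN) and labels of shape (BATCH_SIZE, 1); a quick check:

assert features.shape == (BATCH_SIZE, MAX_LEN)  # torch.Size([20, 200]), dtype torch.int64
assert labels.shape == (BATCH_SIZE, 1)          # torch.Size([20, 1]), dtype torch.float32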


Step 2. Define the Model
import torch
from torch import nn
import torchkeras

torch.manual_seed(1)  # fix the random seed for reproducibility

class Net(torchkeras.Model):
    def __init__(self):
        super(Net,self).__init__()
        
        # with padding_idx=1 set, the padding token's embedding is a zero vector and stays zero during training
        self.embedding = nn.Embedding(num_embeddings=MAX_WORDS,embedding_dim=3,padding_idx=1)
        self.conv = nn.Sequential()
        self.conv.add_module("conv_1",nn.Conv1d(in_channels=3,out_channels=16,kernel_size=5))
        self.conv.add_module("pool_1",nn.MaxPool1d(kernel_size=2))
        self.conv.add_module("relu_1",nn.ReLU())
        self.conv.add_module("conv_2",nn.Conv1d(in_channels=16,out_channels=128,kernel_size=2))
        self.conv.add_module("pool_2",nn.MaxPool1d(kernel_size=2))
        self.conv.add_module("relu_2",nn.ReLU())
        
        self.dense = nn.Sequential()
        self.dense.add_module("flatten",nn.Flatten())
        self.dense.add_module("linear",nn.Linear(6144,1))
        self.dense.add_module("sigmoid",nn.Sigmoid())
        
    
    def forward(self,x):
        x = self.embedding(x).transpose(1,2)
        x = self.conv(x)
        y = self.dense(x)
        return y


model = Net()
print(model)
model.summary(input_shape=(200,),input_dtype=torch.LongTensor)
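The in_features=6144 of the final linear layer (also visible in the summary) follows from tracing a (batch, 200) token batch through the network:

# embedding + transpose: (batch, 200)     -> (batch, 3, 200)
# conv_1 (kernel 5):     (batch, 3, 200)  -> (batch, 16, 196)
# pool_1 (kernel 2):     (batch, 16, 196) -> (batch, 16, 98)
# conv_2 (kernel 2):     (batch, 16, 98)  -> (batch, 128, 97)
# pool_2 (kernel 2):     (batch, 128, 97) -> (batch, 128, 48)
# flatten:               128 * 48 = 6144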


Step 3. Train the Model
# accuracy metric: fraction of predictions on the correct side of 0.5
def accuracy(y_pred,y_true):
    y_pred = torch.where(y_pred>0.5,torch.ones_like(y_pred,dtype = torch.float32),
                      torch.zeros_like(y_pred,dtype = torch.float32))
    acc = torch.mean(1-torch.abs(y_true-y_pred))
    return acc
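Since y_true holds 0/1 labels, thresholding and then taking mean(1 - |y_true - y_pred|) is just the fraction of correct predictions; an equivalent formulation would be:

acc = ((y_pred > 0.5).float() == y_true).float().mean()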

model.compile(loss_func=nn.BCELoss(),optimizer=torch.optim.Adagrad(model.parameters(),lr=0.02),
             metrics_dict={"accuracy":accuracy})

dfhistory = model.fit(20,dl_train,dl_val=dl_test,log_step_freq= 200)


Step 4. Evaluate the Model
import matplotlib.pyplot as plt

def plot_metric(dfhistory, metric):
    train_metrics = dfhistory[metric]
    val_metrics = dfhistory['val_'+metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation '+ metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_"+metric, 'val_'+metric])
    plt.show()

plot_metric(dfhistory,"loss")
plot_metric(dfhistory,"accuracy")


Step 5. Save/Load the Model
torch.save(model.state_dict(), "../data/model_parameter.pkl")
model_clone = Net()
model_clone.load_state_dict(torch.load("../data/model_parameter.pkl"))

model_clone.compile(loss_func=nn.BCELoss(), optimizer=torch.optim.Adagrad(model_clone.parameters(), lr=0.02),
             metrics_dict={"accuracy": accuracy})
model_clone.evaluate(dl_test)
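Besides saving the state_dict, the whole model object can also be pickled; this ties the saved file to the Net class definition, which must be importable at load time (the file name here is illustrative):

torch.save(model, '../data/model_full.pkl')  # pickles the entire nn.Module
model_loaded = torch.load('../data/model_full.pkl')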

