2023-03-20 work log

Transformer reimplementation

from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
import torch.nn as nn
import os
import time
import math
from tqdm import tqdm

def get_data(path,num=None):
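    # Each line is expected to be "<text><two spaces><integer label>" (inferred from
    # the split("  ") below, an assumption about the dataset); malformed lines are skipped.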
    all_text = []
    all_label = []
    with open(path,"r",encoding="utf8") as f:
        all_data = f.read().split("\n")
    for data in all_data:
        try:
            if len(data) == 0:
                continue
            data_s = data.split("  ")
            if len(data_s) != 2:
                continue
            text,label = data_s
            label = int(label)

        except Exception as e:
            print(e)
        else:
            all_text.append(text)
            all_label.append(label)
    if num is None:
        return all_text,all_label
    else:
        return all_text[:num], all_label[:num]

def build_word2index(train_text):
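    # Build a character-level vocabulary; index 0 is reserved for PAD and 1 for UNK.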
    word_2_index =  {"PAD":0,"UNK":1}
    for text in train_text:
        for word in text:
            if word not in word_2_index:
                word_2_index[word] = len(word_2_index)

    return word_2_index


class TextDataset(Dataset):
    def __init__(self,all_text,all_label):
        self.all_text = all_text
        self.all_label = all_label

    def __getitem__(self, index):
        global word_2_index
        text = self.all_text[index]
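        # map every character to its index, falling back to UNK (index 1) for unseen ones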
        text_index = [word_2_index.get(i,1) for i in text]
        label = self.all_label[index]
        text_len = len(text)
        return text_index,label,text_len


    def process_batch_batch(self, data):
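        # Used as the DataLoader collate_fn: truncate each sample to max_len, pad
        # shorter ones with the PAD index 0, and return tensors plus the raw lengths.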
        global max_len
        batch_text = []
        batch_label = []
        batch_len = []

        for d in data:
            batch_text.append(d[0])
            batch_label.append(d[1])
            batch_len.append(d[2])

        batch_text = [i[:max_len] for i in batch_text]


        batch_text = [i + [0]*(max_len-len(i)) for i in batch_text]
        return torch.tensor(batch_text),torch.tensor(batch_label),batch_len


    def __len__(self):
        return len(self.all_text)

class Positional(nn.Module):
    def __init__(self,embedding_num,max_len = 3000):
        super().__init__()
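        # Sinusoidal positional encoding: PE[pos, 2i] = sin(pos / 10000**(2i/d)),
        # PE[pos, 2i+1] = cos(pos / 10000**(2i/d)). Positions are counted from 1
        # here instead of 0, which only shifts the pattern.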
        position = torch.zeros(size=(max_len, embedding_num))  # max_len * embedding

        t = torch.arange(1, max_len + 1, dtype=torch.float).unsqueeze(1)
        w_i = 1 / (10000 ** (torch.arange(0, embedding_num, 2) / embedding_num))

        w_i_t = w_i * t

        position[:, 0::2] = torch.sin(w_i_t)
        position[:, 1::2] = torch.cos(w_i_t)
        # register as a buffer so it moves with .to(device) and is never trained
        self.register_buffer("position", position)
        self.normal = nn.LayerNorm(normalized_shape=embedding_num)
        self.dropout = nn.Dropout(0.2)


    def forward(self,batch_x): # batch * len * 200
        pos = self.position[:batch_x.shape[1],:]
        pos = pos.unsqueeze(dim=0)
        pos = pos.to(batch_x.device)
        result = batch_x + pos
        result = self.normal(result)
        result = self.dropout(result)

        return result

class M_Self_Attention(nn.Module):
    def __init__(self,embedding_num,n_heads):
        super(M_Self_Attention, self).__init__()
        self.W_Q = nn.Linear(embedding_num,embedding_num,bias=False)
        self.W_K = nn.Linear(embedding_num,embedding_num,bias=False)
        self.W_V = nn.Linear(embedding_num,embedding_num,bias=False)
        self.softmax = nn.Softmax(dim=-1)
        self.n_heads = n_heads

    def forward(self, x):
        b, l, n = x.shape
        head_dim = n // self.n_heads  # the embedding dim must be divisible by n_heads
        # project, then split the embedding dim across the heads: B * heads * L * head_dim
        Q = self.W_Q(x).reshape(b, l, self.n_heads, head_dim).transpose(1, 2)  # queries
        K = self.W_K(x).reshape(b, l, self.n_heads, head_dim).transpose(1, 2)  # keys
        V = self.W_V(x).reshape(b, l, self.n_heads, head_dim).transpose(1, 2)  # values

        s = Q @ K.transpose(-1, -2) / math.sqrt(head_dim)  # scaled dot-product scores
        score = self.softmax(s)
        r = score @ V                           # B * heads * L * head_dim
        r = r.transpose(1, 2).reshape(b, l, n)  # merge the heads back together
        return r
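    # Note: no attention mask is applied in this module; padding is handled upstream
    # in TransformerEncoder.forward by zeroing the embeddings of padded positions.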

class Add_Norm(nn.Module):
    def __init__(self,embedding_num):
        super().__init__()
        self.Add = nn.Linear(embedding_num,embedding_num)
        self.Norm = nn.LayerNorm(embedding_num)

    def forward(self,x): # B * L * emb
        add_x = self.Add(x)
        norm_x = self.Norm(add_x)
        return norm_x


class Feed_Forward(nn.Module):
    def __init__(self,embedding_num,feed_num):
        super(Feed_Forward, self).__init__()
        self.l1 = nn.Linear(embedding_num,feed_num)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(feed_num,embedding_num)

    def forward(self,x):
        l1_x = self.l1(x)
        r_x = self.relu(l1_x)
        l2_x = self.l2(r_x)
        return l2_x

class Block(nn.Module):
    def __init__(self,embeding_dim,n_heads,feed_num):
        super(Block, self).__init__()
        self.att_layer = M_Self_Attention(embeding_dim, n_heads)
        self.add_norm1 = Add_Norm(embeding_dim)

        self.feed_forward = Feed_Forward(embeding_dim, feed_num)
        self.add_norm2 = Add_Norm(embeding_dim)

    def forward(self,x):
        att_x = self.att_layer(x)
        adn_x1 = self.add_norm1(att_x)

        adn_x1 = x + adn_x1  # residual connection

        ff_x = self.feed_forward(adn_x1)
        adn_x2 = self.add_norm2(ff_x)



        adn_x2 = adn_x1 + adn_x2  # residual connection

        return adn_x2

class TransformerEncoder(nn.Module):
    def __init__(self,word_size,embeding_dim,class_num,n_heads,feed_num,N):
        super().__init__()
        """
        1. 随机数表示字向量
        2. 预训练字向量 :  使用bert 字向量替换, 使用sougou字向量
        3. 自己基于train_text 训练字向量 
        """
        self.embedding = torch.nn.Embedding(word_size,embeding_dim)
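        # A minimal sketch of option 2 above (pretrained vectors), assuming a float
        # matrix `pretrained_vectors` of shape (word_size, embeding_dim) aligned with
        # word_2_index has already been built (hypothetical name, not part of this script):
        # self.embedding = nn.Embedding.from_pretrained(
        #     torch.tensor(pretrained_vectors, dtype=torch.float), freeze=False)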
        self.positional = Positional(embeding_dim)
        # 50k-180k short-text samples

        # nn.ModuleList([Block(...)] * N) would reuse one Block instance N times (shared
        # weights), so the blocks are built with a list comprehension instead.
        self.blocks = nn.Sequential(*[Block(embeding_dim, n_heads, feed_num) for _ in range(N)])


        self.linear1 = nn.Linear(embeding_dim,class_num)
        self.loss_fun = nn.CrossEntropyLoss()


    def forward(self,x,batch_len,label=None):
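        # x: B * L token ids; batch_len: real (unpadded) length of each sample;
        # label: class ids (training) or None (inference). The per-position logits
        # are averaged over the sequence to score each sentence.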
        x = self.embedding(x)
        x = self.positional(x)

        mask_x = torch.ones_like(x)

        for i in range(len(batch_len)):
            mask_x[i][batch_len[i]:] = 0

        x = mask_x * x

        x = self.blocks(x)


        pre = self.linear1(x)
        pre = torch.mean(pre,dim=1)
        if label is not None:
            loss = self.loss_fun(pre,label)
            return loss
        else:
            return torch.argmax(pre,dim=-1)

def same_seeds(seed):
    torch.manual_seed(seed)  # fix the random seed on CPU
    if torch.cuda.is_available():  # fix the random seeds on GPU
        torch.cuda.manual_seed(seed)  # for the current GPU
        torch.cuda.manual_seed_all(seed)  # for all GPUs
    np.random.seed(seed)  # make subsequent numpy random calls reproducible
    torch.backends.cudnn.benchmark = False  # may be set to True once the GPU and network structure are fixed
    torch.backends.cudnn.deterministic = True  # force deterministic cuDNN algorithms

# word2vec reimplementation

if __name__ == "__main__":
    same_seeds(1007)
    train_text, train_label = get_data(os.path.join("..","data","文本分类","train.txt"), 70000)
    dev_text, dev_label = get_data(os.path.join("..","data","文本分类","dev.txt"), 10000)
    assert len(train_label) == len(train_text), "train texts and labels have different lengths"
    assert len(dev_text) == len(dev_label), "dev texts and labels have different lengths"

    embedding_num = 200
    word_2_index = build_word2index(train_text)

    train_batch_size = 50
    max_len = 30
    epoch = 10
    lr = 0.001
    n_heads = 2
    N = 2
    feed_num = int(embedding_num*1.2)
    class_num = len(set(train_label))

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # device = "cpu"

    train_dataset = TextDataset(train_text, train_label)
    train_dataloader = DataLoader(train_dataset,batch_size=train_batch_size,shuffle=True,collate_fn=train_dataset.process_batch_batch)

    dev_dataset = TextDataset(dev_text, dev_label)
    dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False,collate_fn=dev_dataset.process_batch_batch)


    model = TransformerEncoder(len(word_2_index),embedding_num,class_num,n_heads,feed_num,N).to(device)
    opt = torch.optim.Adam(model.parameters(),lr)
    s_time = time.time()
    for e in range(epoch):
        print("*" * 100)

        for bi, (batch_text, batch_label, batch_len) in enumerate(tqdm(train_dataloader), start=1):
            batch_text = batch_text.to(device)
            batch_label = batch_label.to(device)
            loss = model(batch_text, batch_len, batch_label)
            loss.backward()
            opt.step()
            opt.zero_grad()


        print(f"loss:{loss:.2f}")
        e_time = time.time()
        # print(f"cost time :{e_time - s_time:.2f}s")
        s_time = time.time()

        model.eval()  # disable dropout while evaluating
        right_num = 0
        with torch.no_grad():
            for bi, (batch_text, batch_label, batch_len) in enumerate(dev_dataloader):
                batch_text = batch_text.to(device)
                batch_label = batch_label.to(device)
                pre = model(batch_text, batch_len)
                right_num += int(torch.sum(pre == batch_label))
        print(f"acc:{right_num/len(dev_dataset) * 100:.2f}%")
        model.train()