Implementing Chinese Word Segmentation with RNN/LSTM/GRU

I. Requirements

1.1 Environment

        Python, PyTorch, NumPy

1.2 Goal

        Implement automatic Chinese word segmentation with RNN, LSTM, and GRU, respectively.

II. Writing the Code

2.1 Building the Dictionaries

        Use jieba to segment the dataset. Once the segmented data is ready, we need to build the corresponding dictionaries: every Chinese character maps to an integer in one dictionary, and every part-of-speech tag maps to an integer in another. Words have different lengths: 笔 ("pen") and 笔尖 ("pen tip") are both nouns but differ in length, which could affect later steps. So the tags are recorded per character: 笔 becomes "n", and 笔尖 becomes "n $", where "$" means the character at that position forms the noun together with the one before it. The code is as follows:

# Split the tagged corpus into two txt files: one with character ids, one with
# POS-tag ids. The dictionaries are mutable default arguments on purpose: they
# persist across the helper calls and are returned to the caller at the end.
def Seperate_ChineseCharacter_And_PartOfSpeech(txtpath, ChineseCharacterPath, PartOfSpeechPath,
                                               char_dict={'。': 0, '!': 1, '?': 2, '”': 3}, char_order=[],
                                               char_dict2={'$': 0}, char_order2=[]):
    with open(txtpath, 'r') as f:
        lines = f.readlines()
    with open(ChineseCharacterPath, 'w') as f1, open(PartOfSpeechPath, 'w') as f2:
        for line in lines:
            parts = line.strip().split('  /')
            if len(parts) == 2:
                # encode every character of the word
                lis = ChineseCharacter2num(parts[0], char_dict, char_order)
                for item in lis:
                    f1.write(str(item))
                    f1.write(',')
                length = len(parts[0])
                # the first character carries the real POS id ...
                f2.write(str(PartOfSpeech2num(parts[1], char_dict2, char_order2)))
                f2.write(',')
                # ... and every following character gets the continuation tag '$' (id 0)
                if length > 1:
                    for item in range(length - 1):
                        f2.write('0')
                        f2.write(',')
    to_delete(ChineseCharacterPath)
    to_delete(PartOfSpeechPath)
    return char_dict, char_dict2


def ChineseCharacter2num(text, char_dict={'。': 0, '!': 1, '?': 2}, char_order=[]):  # map characters to integers
    result = []
    for char in text:
        if char not in char_dict:
            char_dict[char] = len(char_dict)  # assign the next free id to an unseen character
            char_order.append(char)
        result.append(char_dict[char])
    return result


def PartOfSpeech2num(text, char_dict={'$': 0}, char_order=[]):  # map a POS tag to an integer
    if text not in char_dict:
        char_dict[text] = len(char_dict)  # assign the next free id to an unseen tag
        char_order.append(text)
    result = char_dict[text]
    return result


def to_delete(path):  # strip the trailing comma so np.loadtxt can parse the file
    with open(path, 'r') as f:
        content = f.read()

    if content[-1] == ',':
        content = content[:-1]

    with open(path, 'w') as f:
        f.write(content)

        This code lives in init_operations.pyw.
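
        As a quick sanity check, the scheme behaves like this (a minimal sketch, assuming the functions above are in scope; the ids follow from the rule that every unseen entry gets len(dict) as its id):

# Sanity check of the encoding scheme.
char_dict = {'。': 0, '!': 1, '?': 2, '”': 3}
char_order = []
print(ChineseCharacter2num('笔尖。', char_dict, char_order))
# -> [4, 5, 0]: 笔 and 尖 are new (ids 4 and 5), 。 was preset to 0

pos_dict = {'$': 0}
pos_order = []
print(PartOfSpeech2num('n', pos_dict, pos_order))  # -> 1, the first new tag
# So the word 笔尖/n is written to POS.txt as "1,0": the real tag id for the
# first character, then the continuation id 0 ('$') for the second.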

2.2 Building the Networks

        Three network classes are defined here, one per architecture. The LSTM class takes no initial hidden state h0 (PyTorch then defaults the hidden and cell states to zeros), while the other two take h0 as an input. An embedding layer is used because plain integer codes 1, 2, 3, 4, ... cannot express how similar or dissimilar characters are to each other; with the embedding, learning works much better.

        Besides the three networks, I also define a small network called coding. This network encodes the part-of-speech tags (while the RNN's embedding encodes the Chinese characters). I added it out of the subjective belief that learned POS codes train better than fixed ones. Note that you must never put a linear layer after this network's embedding; see my earlier blog post for the reason.
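
        To make the point about embeddings concrete, here is a minimal sketch (the sizes are illustrative only, not the ones used later):

import torch
from torch import nn

# Integer ids carry no notion of similarity: id 5 is not "closer" to id 4
# than to id 42. An embedding maps each id to a learnable dense vector, so
# related characters or tags can end up with nearby vectors during training.
emb = nn.Embedding(num_embeddings=100, embedding_dim=16)  # illustrative sizes
ids = torch.LongTensor([4, 5, 42])        # three character ids
vectors = emb(ids)                        # shape: (3, 16), one vector per id
print(vectors.shape)                      # torch.Size([3, 16])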

        The code is as follows (functions is my own module):

#coding:gb2312
import torch
from torch import nn


class RnnNet(nn.Module):
    def __init__(self, vector_lenth, hidden_size, output_size, numlayers, text):
        super(RnnNet, self).__init__()
        # vocabulary size = largest character id in the data plus some slack
        self.embedding = nn.Embedding(int(text.max()) + 10, vector_lenth).float()
        self.rnn = nn.RNN(
            input_size=vector_lenth,
            hidden_size=hidden_size,
            num_layers=numlayers,
            batch_first=True,
            bidirectional=False
        )
        for p in self.rnn.parameters():
            nn.init.normal_(p, mean=0.0, std=0.001)
        self.hidden_size = hidden_size
        self.linear1 = nn.Linear(hidden_size, 8)
        self.linear2 = nn.Linear(8, output_size)

    def forward(self, x, hidden_init):
        x = self.embedding(x)                    # (seq,) -> (seq, vector_lenth)
        out, hidden_prev = self.rnn(x, hidden_init)
        out = out.view(-1, self.hidden_size)     # (seq, hidden_size)
        out = self.linear1(out)
        out = self.linear2(out)                  # (seq, output_size)
        return out.float(), hidden_prev


class coding(nn.Module):
    # Small network that embeds the POS-tag ids (52 is an upper bound on the tag count).
    def __init__(self, vector_lenth):
        super(coding, self).__init__()
        self.embedding = nn.Embedding(52, vector_lenth).float()

    def forward(self, y):
        y = self.embedding(y)
        return y.float()


class LSTMNet(nn.Module):
    def __init__(self, vector_lenth, hidden_size, output_size, numlayers, text):
        super(LSTMNet, self).__init__()
        self.embedding = nn.Embedding(int(text.max()) + 10, vector_lenth).float()
        self.rnn = nn.LSTM(
            input_size=vector_lenth,
            hidden_size=hidden_size,
            num_layers=numlayers,
            batch_first=True,
            bidirectional=False
        )
        for p in self.rnn.parameters():
            nn.init.normal_(p, mean=0.0, std=0.001)
        self.hidden_size = hidden_size
        self.linear1 = nn.Linear(hidden_size, 8)
        self.linear2 = nn.Linear(8, output_size)

    def forward(self, x):
        x = self.embedding(x)
        # no h0/c0 passed in: PyTorch initializes both to zeros
        out, (h, c) = self.rnn(x)
        out = out.view(-1, self.hidden_size)
        out = self.linear1(out)
        out = self.linear2(out)
        return out.float()


class GRUNet(nn.Module):
    def __init__(self, vector_lenth, hidden_size, output_size, numlayers, text):
        super(GRUNet, self).__init__()
        self.embedding = nn.Embedding(int(text.max()) + 10, vector_lenth).float()
        self.rnn = nn.GRU(
            input_size=vector_lenth,
            hidden_size=hidden_size,
            num_layers=numlayers,
            batch_first=True,
            bidirectional=False
        )
        for p in self.rnn.parameters():
            nn.init.normal_(p, mean=0.0, std=0.001)
        self.hidden_size = hidden_size
        self.linear1 = nn.Linear(hidden_size, 8)
        self.linear2 = nn.Linear(8, output_size)

    def forward(self, x, hidden_init):
        x = self.embedding(x)
        out, hidden_prev = self.rnn(x, hidden_init)   # out: (seq, hidden_size)
        out = out.view(-1, self.hidden_size)
        out = self.linear1(out)
        out = self.linear2(out)
        return out.float(), hidden_prev

        This code lives in MyNet.pyw.
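
        As a quick shape check, here is a minimal sketch of a forward pass (the sizes mirror the main script below; it assumes a PyTorch version that accepts unbatched (seq, feature) RNN inputs):

import torch
import numpy as np
import MyNet

text = np.arange(20)                      # dummy "corpus"; text.max() sizes the embedding table
net = MyNet.GRUNet(vector_lenth=16, hidden_size=32, output_size=8, numlayers=1, text=text)

seq = torch.LongTensor([3, 7, 11, 0])     # one short "sentence" of character ids
h0 = torch.zeros(1, 32)                   # (num_layers, hidden_size)
out, h = net(seq, h0)
print(out.shape)                          # torch.Size([4, 8]): one tag code per character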

2.3 Training Module

        During training, each sentence is sliced out of the sequence and sent to the GPU/CPU.

        One spot in my code was not written idiomatically: the end-of-data check originally hardcoded the dataset's length (133759) instead of using len(); the listing below uses len(seqs) instead.

        The code is as follows:

#coding:gb2312
import torch
from torch import nn
import time
import MyNet


def train_model(seqs, y_true, model, vector_lenth, hidden_size, output_size, CC_dict, POS_dict,
                numlayers=1, epochs=1, learning_rate=0.001):
    # drop the .cuda() calls throughout to run on the CPU instead
    coder = MyNet.coding(output_size).cuda()
    net = model(vector_lenth, hidden_size, output_size, numlayers, seqs).cuda()
    LossFunction = nn.MSELoss()  # loss function
    # optimize net and coder jointly, so the POS codes are learned rather than
    # staying at their random initialization
    optimizer = torch.optim.Adam(list(net.parameters()) + list(coder.parameters()), learning_rate)
    hidden_init = torch.zeros(numlayers, hidden_size)  # (num_layers, hidden_size)
    LossRecoder = []
    for epoch in range(epochs):
        start = 0
        end = 0
        L = 0
        n = 0
        while end < len(seqs) - 1:
            # advance end to the next sentence-final mark (。/!/? -> ids 0/1/2)
            end = start + 1
            while end < len(seqs) and seqs[end] != 0 and seqs[end] != 1 and seqs[end] != 2:
                end += 1
            # absorb a closing quote ” (id 3) right after the punctuation
            if end < len(seqs) - 1:
                if seqs[end + 1] == 3:
                    end += 1
            if model != MyNet.LSTMNet:
                output, hidden_prev = net(torch.LongTensor(seqs[start:end + 1]).cuda(), hidden_init.cuda())
            else:
                output = net(torch.LongTensor(seqs[start:end + 1]).cuda())
            # target: the coder's code for each character's POS id
            loss = LossFunction(output, coder(torch.LongTensor(y_true[start:end + 1]).reshape(-1, 1).cuda()).reshape(-1, output_size))
            print('training......,epoch[', epoch + 1, '/', epochs, ']', 'sequence:', n, 'loss=', loss.item())
            L += loss.item()
            n += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            start = end + 1

        # optional qualitative check after each epoch, e.g.:
        # test('骆驼祥子拿着一台苹果手机。', net, hidden_init.cuda(), CC_dict, coder, POS_dict)
        # test('骆驼祥子是故事中的角色。', net, hidden_init.cuda(), CC_dict, coder, POS_dict, lstm=1)
        time.sleep(5)
        LossRecoder.append(L / n)
        torch.save(net, 'C:\\Users\\15353\\Desktop\\net.pkl')
        torch.save(coder, 'C:\\Users\\15353\\Desktop\\coder.pkl')
    return net, coder, hidden_init
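
        To see what the start/end bookkeeping does, here is a minimal sketch of the same splitting logic on a toy id sequence (recall the preset ids: 。=0, !=1, ?=2, ”=3):

# Toy sequence: three "sentences" encoded with the preset punctuation ids.
seqs = [7, 8, 0, 9, 10, 11, 1, 3, 12, 13, 2]

start = 0
sentences = []
while start < len(seqs) - 1:
    end = start + 1
    while end < len(seqs) and seqs[end] not in (0, 1, 2):
        end += 1
    if end < len(seqs) - 1 and seqs[end + 1] == 3:   # attach a trailing closing quote
        end += 1
    sentences.append(seqs[start:end + 1])
    start = end + 1

print(sentences)  # [[7, 8, 0], [9, 10, 11, 1, 3], [12, 13, 2]]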

2.4 Testing Module

        For testing, the Chinese characters must first be converted to numbers, which needs the following function:


def char2num(char_dict, inputs):  # map each character to its dictionary id
    # characters missing from the dictionary come back as None
    num_list = [char_dict.get(c) for c in inputs]
    return num_list

        Because I trained with MSE instead of cross-entropy, the network output has to be compared with every encoded POS tag by Euclidean distance; the tag whose code is nearest is taken as the prediction.

        The test function is as follows:

def test(word, net, hidden_init, CCdict, coder, POSdict, lstm=False):
    if not lstm:
        word_code, h = net(torch.LongTensor(char2num(CCdict, word)).cuda(), hidden_init)  # (seq, output_size)
    else:
        word_code = net(torch.LongTensor(char2num(CCdict, word)).cuda())
    # codes of all POS tags (26 = number of tags seen in this corpus)
    pos_codes = coder(torch.LongTensor(range(0, 26)).cuda())

    lis = []
    for i in range(word_code.shape[0]):
        minnum = float('inf')   # squared distance to the nearest code so far
        minindex = -1
        for j in range(pos_codes.shape[0]):
            dist = ((word_code[i, :] - pos_codes[j, :]) * (word_code[i, :] - pos_codes[j, :])).sum()
            if dist < minnum:
                minindex = j
                minnum = dist
        # reverse lookup: tag id -> tag string
        for key, value in POSdict.items():
            if value == minindex:
                lis.append(key)
                break
    print(lis)
    return lis
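
        As an aside, the nested distance loop can also be vectorized; here is a minimal sketch using torch.cdist (the shapes are illustrative):

import torch

# word_code: (seq, d) network outputs; pos_codes: (num_tags, d) learned tag codes.
# torch.cdist computes all pairwise Euclidean distances in one call; argmin over
# the tag axis then picks the nearest code for every character.
word_code = torch.randn(5, 8)               # illustrative shapes only
pos_codes = torch.randn(26, 8)
dists = torch.cdist(word_code, pos_codes)   # (5, 26)
nearest = dists.argmin(dim=1)               # (5,) index of the closest tag code
print(nearest)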

        Note: the code in 2.3 and 2.4 lives in functions.pyw.

2.5 Main Script

import init_operations
import MyNet
import functions
import numpy as np


# build suitable data structures to hold the data
CC_dict,POS_dict=init_operations.Seperate_ChineseCharacter_And_PartOfSpeech('train.txt','CC.txt','POS.txt')
y_true=np.loadtxt('POS.txt',delimiter=',')
x=np.loadtxt('CC.txt',delimiter=',')
print(CC_dict)
# train and save

net,coder,hidden_init=functions.train_model(x,y_true,MyNet.GRUNet,16, 32, 8,CC_dict,POS_dict,numlayers=1,epochs=50,learning_rate=0.004)
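
        After training, inference could look like the following sketch (appended to the same script, so CC_dict and POS_dict are still in scope; the paths are the ones hardcoded in train_model):

import torch
import functions

# Reload the trained modules (full-module pickles saved by train_model).
# On newer PyTorch you may need torch.load(path, weights_only=False).
net = torch.load('C:\\Users\\15353\\Desktop\\net.pkl')
coder = torch.load('C:\\Users\\15353\\Desktop\\coder.pkl')

hidden_init = torch.zeros(1, 32).cuda()   # (num_layers, hidden_size), as in training
# Tag a sentence; every character must already be in CC_dict. Pass lstm=1 for LSTMNet.
functions.test('骆驼祥子拿着一台苹果手机。', net, hidden_init, CC_dict, coder, POS_dict)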

III. Training Results

        The training results are shown in the figure below. I used Lao She's Camel Xiangzi (《骆驼祥子》) as the dataset; about two or three epochs are already enough for fairly good results.

[Figure: training results]
