一. 要求
1.1 环境
python pytorch numpy
1.2 目的
分别使用 RNN/LSTM/GRU实现中文自动分词
二. 代码编写
2.1设置字典
利用jieba对数据集进行分词。得到相关的数据集后,需要生成相应的字典。每个汉字对应字典中的一个数,每个词性也对应另一个字典中的某个数。考虑到每个词长短不同,如“笔”,“笔尖”二者都是名词,但长度不同,这可能对后续操作有一定影响。因此,分别这样记录它们的词性:“n”,"n $"。其中"$"代表该位置的汉字与前一个共同组成名词。代码如下:
import numpy as np
def Seperate_ChineseCharacter_And_PartOfSpeech(txtpath, ChineseCharacterPath, PartOfSpeechPath,
                                               char_dict=None, char_order=None,
                                               char_dict2=None, char_order2=None):  # split into two txt files
    """Split an annotated corpus into two aligned, comma-separated index files.

    Each input line is expected to look like "<word> /<pos-tag>".  The word's
    characters are written (as dictionary indices) to ChineseCharacterPath and
    the tag index to PartOfSpeechPath; for a word of k characters, k-1 extra
    '0' entries (index of the continuation marker '$') pad the tag stream so
    both files stay aligned character-for-character.

    Returns the (character dict, part-of-speech dict) accumulated while
    converting.  Callers may pass their own dict/list accumulators to extend
    an existing vocabulary.
    """
    # None sentinels instead of mutable defaults: the original shared one
    # dict/list object across every call, leaking vocabulary between calls.
    if char_dict is None:
        char_dict = {'。' : 0,'!':1,'?':2,'”':3}
    if char_order is None:
        char_order = []
    if char_dict2 is None:
        char_dict2 = {'$' : 0}
    if char_order2 is None:
        char_order2 = []
    with open(txtpath, 'r') as f:
        lines = f.readlines()
    with open(ChineseCharacterPath, 'w') as f1, open(PartOfSpeechPath, 'w') as f2:
        for line in lines:
            parts = line.strip().split(' /')
            if len(parts) == 2:  # skip malformed lines
                for item in ChineseCharacter2num(parts[0], char_dict, char_order):
                    f1.write(str(item))
                    f1.write(',')
                f2.write(str(PartOfSpeech2num(parts[1], char_dict2, char_order2)))
                f2.write(',')
                # continuation markers keep tags aligned 1:1 with characters;
                # range() is already empty for single-character words
                for _ in range(len(parts[0]) - 1):
                    f2.write('0')
                    f2.write(',')
    # strip the trailing comma each writer leaves behind
    to_delete(ChineseCharacterPath)
    to_delete(PartOfSpeechPath)
    return char_dict, char_dict2
def ChineseCharacter2num(text, char_dict=None, char_order=None):  # convert characters to numbers
    """Map each character of *text* to an integer index, extending the dict.

    Unknown characters are assigned the next free index and appended to
    char_order (position = index), so the mapping stays invertible.

    Returns the list of indices, one per character of *text*.
    """
    # None sentinels instead of mutable defaults: the original shared a single
    # dict/list across every call, so unrelated calls polluted each other.
    if char_dict is None:
        char_dict = {'。' : 0,'!':1,'?':2}
    if char_order is None:
        char_order = []
    result = []
    for char in text:
        if char not in char_dict:
            char_dict[char] = len(char_dict)
            char_order.append(char)
        result.append(char_dict[char])
    return result
def PartOfSpeech2num(text, char_dict=None, char_order=None):  # convert a POS tag to a number
    """Map a whole part-of-speech tag string to an integer index.

    Unknown tags get the next free index and are appended to char_order.
    Index 0 is reserved for '$', the "continuation of previous tag" marker.
    """
    # None sentinels instead of mutable defaults (shared-state bug in the
    # original: one dict/list object served every call).
    if char_dict is None:
        char_dict = {'$' : 0}
    if char_order is None:
        char_order = []
    if text not in char_dict:
        char_dict[text] = len(char_dict)
        char_order.append(text)
    return char_dict[text]
def to_delete(path):
    """Remove a single trailing comma from the file at *path*, if present.

    The writers emit "value," for every entry, leaving one comma too many at
    the end of the file; trimming it lets numpy.loadtxt parse the file.
    """
    with open(path, 'r') as f:
        content = f.read()
    # endswith() also covers the empty-file case, where the original
    # content[-1] raised IndexError; only rewrite when there is a change.
    if content.endswith(','):
        with open(path, 'w') as f:
            f.write(content[:-1])
这部分代码被写在init_operations.pyw里。
2.2 设置网络
这里分别设置了三种网络对应的类。LSTM不需要输入H0,另外两种网络需要输入H0。引入embedding层是因为1,2,3,4……这种编码不能体现词性或者汉字间的相近或相斥的关系。引入后,学习效果会好很多。
除了三种网络,我还定义了一个小网络,叫做coding。这个网络对词性进行了编码(而RNN网络的embedding是对汉字编码)。加入这个网络,是因为我主观地认为,对词性编码能更好地学习。当然,这个网络embedding后千万不能加linear,详见我之前的博客。
代码如下(functions 是我自己定义的模块):
#coding:gb2312
import torch
from functions import*
from torch import nn
class RnnNet(nn.Module):
    """Character tagger: embedding -> unidirectional RNN -> two linear layers.

    *text* is used only for its maximum index, which (plus a headroom of 10)
    sizes the embedding vocabulary.  forward() takes an explicit initial
    hidden state and returns (scores, final hidden state).
    """

    def __init__(self, vector_lenth, hidden_size, output_size, numlayers, text):
        super(RnnNet, self).__init__()
        vocab_size = int(text.max()) + 10  # headroom for indices unseen in `text`
        self.embedding = nn.Embedding(vocab_size, vector_lenth).float()
        self.rnn = nn.RNN(input_size=vector_lenth,
                          hidden_size=hidden_size,
                          num_layers=numlayers,
                          batch_first=True,
                          bidirectional=False)
        # small-variance normal init keeps early recurrent activations stable
        for weight in self.rnn.parameters():
            nn.init.normal_(weight, mean=0.0, std=0.001)
        self.hidden_size = hidden_size
        self.linear1 = nn.Linear(hidden_size, 8)
        self.linear2 = nn.Linear(8, output_size)

    def forward(self, x, hidden_init):
        embedded = self.embedding(x)
        rnn_out, hidden_prev = self.rnn(embedded, hidden_init)
        flat = rnn_out.view(-1, self.hidden_size)  # one row per timestep
        scores = self.linear2(self.linear1(flat))
        return scores.float(), hidden_prev
class coding(nn.Module):
    """Learnable embedding for part-of-speech tags (vocabulary fixed at 52).

    Deliberately has no linear layer on top: the raw embedding vectors serve
    as regression targets for the taggers trained with MSE loss.
    """

    def __init__(self, vector_lenth):
        super(coding, self).__init__()
        self.embedding = nn.Embedding(52, vector_lenth).float()

    def forward(self, y):
        return self.embedding(y).float()
class LSTMNet(nn.Module):
    """Character tagger: embedding -> unidirectional LSTM -> two linear layers.

    Unlike RnnNet/GRUNet, forward() takes no initial hidden state: the LSTM
    starts from its internal zero state.  Returns the per-timestep scores.
    """

    def __init__(self, vector_lenth, hidden_size, output_size, numlayers, text):
        super(LSTMNet, self).__init__()
        vocab_size = int(text.max()) + 10  # headroom for indices unseen in `text`
        self.embedding = nn.Embedding(vocab_size, vector_lenth).float()
        self.rnn = nn.LSTM(input_size=vector_lenth,
                           hidden_size=hidden_size,
                           num_layers=numlayers,
                           batch_first=True,
                           bidirectional=False)
        # small-variance normal init keeps early recurrent activations stable
        for weight in self.rnn.parameters():
            nn.init.normal_(weight, mean=0.0, std=0.001)
        self.hidden_size = hidden_size
        self.linear1 = nn.Linear(hidden_size, 8)
        self.relu = nn.ELU()     # unused in forward; kept to match original object layout
        self.linear2 = nn.Linear(8, output_size)
        self.sgm = nn.Sigmoid()  # unused in forward; kept to match original object layout

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _state = self.rnn(embedded)
        flat = lstm_out.view(-1, self.hidden_size)  # one row per timestep
        return self.linear2(self.linear1(flat)).float()
class GRUNet(nn.Module):
    """Character tagger: embedding -> unidirectional GRU -> two linear layers.

    forward() takes an explicit initial hidden state and returns
    (scores, final hidden state), mirroring RnnNet.
    """

    def __init__(self, vector_lenth, hidden_size, output_size, numlayers, text):
        super(GRUNet, self).__init__()
        vocab_size = int(text.max()) + 10  # headroom for indices unseen in `text`
        self.embedding = nn.Embedding(vocab_size, vector_lenth).float()
        self.rnn = nn.GRU(input_size=vector_lenth,
                          hidden_size=hidden_size,
                          num_layers=numlayers,
                          batch_first=True,
                          bidirectional=False)
        # small-variance normal init keeps early recurrent activations stable
        for weight in self.rnn.parameters():
            nn.init.normal_(weight, mean=0.0, std=0.001)
        self.hidden_size = hidden_size
        self.linear1 = nn.Linear(hidden_size, 8)
        self.relu = nn.ELU()     # unused in forward; kept to match original object layout
        self.linear2 = nn.Linear(8, output_size)
        self.sgm = nn.Sigmoid()  # unused in forward; kept to match original object layout

    def forward(self, x, hidden_init):
        embedded = self.embedding(x)
        gru_out, hidden_prev = self.rnn(embedded, hidden_init)  # [b, seq, h]
        flat = gru_out.view(-1, self.hidden_size)
        return self.linear2(self.linear1(flat)).float(), hidden_prev
这部分代码被写在MyNet.pyw里
2.3 训练模块
训练时,截取每句话,扔到GPU/CPU上训练。
我在写代码时有一处写的不规范:判断是否结束时没有用len(数据集)而是直接将数据集的长度填了进去。
代码如下:
#coding:gb2312
from platform import java_ver
import numpy as np
import torch
from functions import*
from torch import nn
import matplotlib.pyplot as plt
import init_operations
import random
import time
import MyNet
def train_model(seqs, y_true, model, vector_lenth, hidden_size, output_size,
                CC_dict, POS_dict, numlayers=1, epochs=1, learning_rate=0.001):
    """Train a tagging net (RnnNet / LSTMNet / GRUNet) sentence by sentence.

    seqs / y_true are parallel 1-D sequences of character indices and tag
    indices; character indices 0/1/2 mark sentence terminators (。!?) and 3
    a closing quote that is pulled into the preceding sentence.  The tag
    targets are embedded by a trainable `coding` net and fitted with MSE.
    Requires CUDA; net and coder are also torch.save()d to hard-coded paths.

    Returns (net, coder, hidden_init).
    """
    coder = MyNet.coding(output_size).cuda()  # learns the tag embeddings
    net = model(vector_lenth, hidden_size, output_size, numlayers, seqs).cuda()
    LossFunction = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), learning_rate)  # Adam recommended
    hidden_init = torch.zeros(numlayers, hidden_size)  # (num_layers, hidden)
    total = len(seqs)  # was a hard-coded corpus length (133759)
    LossRecoder = []
    for epoch in range(epochs):
        start = 0
        end = 0
        L = 0  # summed loss this epoch
        n = 0  # sentences trained this epoch
        while end < total:
            # advance `end` to the next sentence terminator (bounds-checked;
            # the original could index past the end of seqs)
            end = start + 1
            while end < total and seqs[end] != 0 and seqs[end] != 1 and seqs[end] != 2:
                end += 1
            if end < total:
                # a closing quote right after the terminator ends the same sentence
                if end + 1 < total and seqs[end + 1] == 3:
                    end += 1
                if model != MyNet.LSTMNet:
                    # RNN/GRU take an explicit initial hidden state
                    output, _hidden = net(torch.LongTensor(seqs[start:end + 1]).cuda(),
                                          hidden_init.cuda())
                else:
                    output = net(torch.LongTensor(seqs[start:end + 1]).cuda())
                # target: gold tags embedded by the (jointly trained) coder
                target = coder(torch.LongTensor(y_true[start:end + 1]).reshape(-1, 1).cuda())
                loss = LossFunction(output, target.reshape(-1, output_size))
                print('training......,epoch[', epoch + 1, '/', epochs, ']',
                      'sequence:', n, 'loss=', loss.item())
                L += loss.item()
                n += 1
                net.zero_grad()
                loss.backward()
                optimizer.step()
                start = end + 1
        time.sleep(5)
        LossRecoder.append(L / n)
    torch.save(net, 'C:\\Users\\15353\\Desktop\\net.pkl')
    torch.save(coder, 'C:\\Users\\15353\\Desktop\\coder.pkl')
    return net, coder, hidden_init
2.4 测试模块
测试时,需要把汉字转换成数字,因此需要如下函数:
def char2num(dict, inputs):
    # NOTE: the parameter shadows the builtin `dict`; the name is part of the
    # public signature, so it is kept as-is.
    """Look up every character of *inputs*; unknown characters map to None."""
    num_list = []
    for ch in inputs:
        num_list.append(dict.get(ch))
    return num_list
由于我训练时采用了MSE而不是交叉熵,因此需要比较输出和每个编码后的词性的“欧氏距离”,认为距离哪个最近就是哪个词性。
测试函数如下:
def test(word, net, hidden_init, CCdict, coder, POSdict, lstm=False, num_pos=26):
    """Tag each character of *word* and print/return the predicted POS tags.

    The nets were trained with MSE against the coder's tag embeddings, so
    decoding is a nearest-neighbour search: for every character output, pick
    the tag whose embedding has the smallest squared Euclidean distance.

    num_pos: how many tag indices to embed and compare against (default 26,
    the value the original hard-coded).
    """
    if not lstm:
        word_code, h = net(torch.LongTensor(char2num(CCdict, word)).cuda(), hidden_init)  # [seq, out]
    else:
        word_code = net(torch.LongTensor(char2num(CCdict, word)).cuda())
    pos_codes = coder(torch.LongTensor(range(num_pos)).cuda())
    lis = []
    for i in range(word_code.shape[0]):
        # float('inf') instead of the original 10: a finite threshold silently
        # dropped any character whose nearest embedding was farther than 10,
        # returning fewer tags than characters.
        minnum = float('inf')
        minindex = -1
        for j in range(pos_codes.shape[0]):
            diff = word_code[i, :] - pos_codes[j, :]
            dist = (diff * diff).sum()
            if dist < minnum:
                minindex = j
                minnum = dist
        # map the winning index back to its tag string
        for key, value in POSdict.items():
            if value == minindex:
                lis.append(key)
                break
    print(lis)
    return lis
注:2.3 2.4中的代码都在functions.pyw内
2.5 主函数
import init_operations
import MyNet
import functions
import numpy as np
# Build the aligned index files and vocabularies from the annotated corpus.
CC_dict,POS_dict=init_operations.Seperate_ChineseCharacter_And_PartOfSpeech('train.txt','CC.txt','POS.txt')
# Per-character part-of-speech indices (0 = continuation marker '$').
y_true=np.loadtxt('POS.txt',delimiter=',')
# Character indices, aligned one-to-one with y_true.
x=np.loadtxt('CC.txt',delimiter=',')
print(CC_dict)
# Train a GRU tagger and save it (16 = char-embedding size, 32 = hidden size,
# 8 = tag-embedding size); train_model writes the nets to disk itself.
net,coder,hidden_init=functions.train_model(x,y_true,MyNet.GRUNet,16, 32, 8,CC_dict,POS_dict,numlayers=1,epochs=50,learning_rate=0.004)
三. 训练效果
训练效果如下图所示。我使用老舍的《骆驼祥子》作为数据集,大概两三个epoch就能有不错的效果。