辛苦手码不易, 如若有帮助烦请不吝收藏点赞 😃
本文主要在学校实验的基础上,按照实现顺序对实验的实现进行了讲解
在这里主要分享的是生成宋词的部分
模型方法
在本次分享中,我选择基于上一篇博客得到的单双字宋词词频,自动生成宋词。
上一篇博客地址:
分词&统计词频
我选取了两种宋词生成方式。一是基于已有词频,选取轮盘赌的方式,基于词频概率进行随机生成;二是参考Bi-gram的思想,基于上文文本选取出现概率最高的进行生成。
定义:
所谓bi-gram,即假设下一个词的出现只依赖于上一个词,并采用极大似然估计的思想,用语料中的出现次数来估计条件概率:
$$P(B \mid A) = \frac{\mathrm{count}(AB)}{\mathrm{count}(A)} \tag{1}$$
在本实验中,我假设下一个字的出现仅和上一个字有关,即公式(1)中,B的出现仅和A有关。参照此思想,我设计了相关选词逻辑,在下一节进行说明。
系统设计
众所周知,宋词的创作顺序为,选好相应词牌名后,根据词牌名的字数、平仄、韵律等的要求,向规定的空位中填词,在这一点上,本实验的宋词生成器与其如出一辙,并使用Tkinter设计了图形界面。
我将系统主要设计为以下三步:
- 设计界面交互逻辑。本次实验我设计的交互逻辑为:由用户在输入框中输入词牌名,并选择文本生成算法(随机生成或bi-gram),选择完毕后点击“开始”按钮,即在下方文本框中显示宋词内容。交互逻辑确定后,设计好图形界面并保留对应接口,以便呈现结果。
- 对源文本进行进一步处理,将词牌名与词内容的句数、字数对应。同样对文本文件按行读取,对每行解码后,为规避实验一中出现的问题,我选择以有无标点符号(代码中以是否含有句号“。”判断)作为区分词牌名与词内容的关键:若无符号,则直接压入词牌列表;若有符号,则将全部标点符号换为空格,并读取划分出的各单句字数,一首词的各句字数组成一个列表,该字数列表再作为新的元素压入总列表。由于理论上字数列表与词牌名列表应是一一对应的,故直接将其用字典结构拼接起来,由此,一个词牌对应一个字数列表,例如:‘浣溪沙’= [3,3,4,5,7,4,5,3,3,4,5,7,4,5,0](末尾的零是由于读取字数的逻辑造成的:句末标点被替换为空格后,切分会多出一个空串;这里刚好可以让其作为一首词结尾的标识位)。
GUI代码实现:
class GUI:
    """Tkinter window that asks for a tune name and shows a generated Song ci.

    Constructing the class creates both generators, loads the tune table from
    ``Ci.txt``, builds the widgets and then enters the Tk main loop, so the
    constructor only returns once the window has been closed.
    """

    # Punctuation that separates the lines of a poem inside ``Ci.txt``.
    _SEPARATORS = (',', '。', '!', '、', '?', '【', '】')

    def __init__(self):
        self.window = Tk()
        self.window.geometry('720x680')
        self.window.title('宋词生成器')
        self.sentence_For_cook = ''
        self.v1 = IntVar()  # selected algorithm: 0 = none, 1 = random, 2 = bi-gram
        self.v1.set(0)
        self.v2 = StringVar()  # tune name typed by the user
        self.ci = {}  # tune name -> list of per-line character counts
        self.algorithmrandom = algorithmrandom()
        self.BGram = BGram()
        self.cipai = []  # tune names in file order
        self._load_ci('Ci.txt')
        self._build_widgets()
        self.window.mainloop()

    def _load_ci(self, path):
        """Fill ``self.cipai`` and ``self.ci`` from the GBK-encoded poem file.

        A line without '。' is taken as a tune name; any other non-empty line
        is a poem body whose punctuation is replaced by spaces and whose
        pieces are measured.  The trailing empty piece yields a final ``0``
        that the generators use as an end-of-poem marker.
        """
        word_nums = []
        with open(path, 'rb') as res_file:
            for raw in res_file:
                # rstrip copes with CRLF, LF and a last line without newline.
                text = raw.decode('gbk', 'ignore').rstrip('\r\n')
                if not text:
                    continue
                if '。' not in text:
                    self.cipai.append(text)
                    continue
                for mark in self._SEPARATORS:
                    text = text.replace(mark, ' ')
                word_nums.append([len(piece) for piece in text.split(' ')])
        # The i-th tune name is paired with the i-th poem body (assumes the
        # file alternates name/body).  zip() also maps the last pair and never
        # indexes past the shorter list; the first poem of a tune wins.
        for name, counts in zip(self.cipai, word_nums):
            self.ci.setdefault(name, counts)

    def _build_widgets(self):
        """Create and lay out every widget of the window."""
        frame = Frame(self.window)
        frame1 = Frame(self.window)
        input_Sentence = Entry(frame1, textvariable=self.v2, justify=LEFT,
                               width=25, font=('宋体', 13))
        frame.pack()
        frame1.pack()
        welcome = Label(frame, text='苏轼模拟器', font=('黑体', 20),
                        anchor='center', bg='red')
        please_input = Label(frame1, text='·请输入词牌名:', font=('黑体', 13),
                             anchor='n')
        algo_choose = Label(frame1, text='·请选择生成算法:', font=('黑体', 13))
        execute_Button = Button(frame1, text='开始', font=('黑体', 15),
                                command=self.printWords)
        exit_Button = Button(frame1, text='退出', font=('黑体', 15),
                             command=self.exit)
        words_output = Label(frame1, text='以下为生成结果:', font=('黑体', 13))
        # A font is a (family, size) tuple; the former '楷体,16' was one plain
        # string and therefore not a valid family/size pair.
        self.result_print = Text(frame1, font=('楷体', 16), width=50, height=25)
        self.result_print.insert(INSERT, '词牌名举例:酒泉子、苏幕遮、甘草子、送征衣、昼夜乐、西江月、玉楼春、惜春郎、永遇乐、卜算子、尉迟杯、巫山一段云、婆罗门令 and so on...')
        welcome.grid()
        please_input.grid(row=1, column=0)
        input_Sentence.grid(row=2)
        algo_choose.grid()
        algos = [('基于概率的随机生成', 1), ('B-gram', 2)]
        for algo, num in algos:
            # indicatoron only changes how the button is drawn
            b = Radiobutton(frame1, text=algo, variable=self.v1, value=num,
                            indicatoron=True)
            b.grid(column=0)
        execute_Button.grid(row=7, column=0)
        exit_Button.grid(row=7, column=1)
        words_output.grid()
        self.result_print.grid()

    def exit(self):
        """Terminate the program with exit status 0."""
        raise SystemExit(0)

    def __getAlgo(self):
        """Return the value of the selected algorithm radio button."""
        return self.v1.get()

    def printWords(self):
        """Generate a poem for the entered tune name and show it.

        Clears the output box first.  A message box is shown, and nothing is
        generated, when no algorithm is selected, the tune name is empty, or
        the tune name is not in the loaded table.
        """
        self.result_print.delete('1.0', 'end')
        algo = self.__getAlgo()
        cipai = self.v2.get()
        ready = True
        if algo == 0:
            messagebox.showinfo('Error', '请先选择生成算法!')
            ready = False
        if cipai == '':
            messagebox.showinfo('Error', '请输入词牌名!')
            ready = False
        if not ready:
            return
        if cipai not in self.ci:
            # Previously this fell through to a KeyError inside the generator.
            messagebox.showinfo('Error', '未收录该词牌名,请重新输入!')
            return
        if algo == 1:
            result = self.algorithmrandom.ran(cipai, self.ci)
        elif algo == 2:
            result = self.BGram.B_gram(cipai, self.ci)
        else:
            return
        # Join explicitly instead of relying on Tcl's list-to-string conversion.
        self.result_print.insert(INSERT, ' '.join(result))
- 对文本进行处理后,则可以专注于编写保留的算法接口。在这里,我设计了基于词频概率的随机生成与bi-gram生成算法。
对于随机生成,我统计了单双字每个词的出现概率后(即用总数作分母来除各词的对应词频),开始使用轮盘赌的方式填词。首先,检测到输入词牌后,依据其每句字数填充,每句首字选取单字的轮盘赌生成,后续只要有两字的空间,就选取两字的轮盘赌随机生成,空间不够则选取单字的轮盘赌生成,此为随机生成算法。
随机生成算法实现:
class algorithmrandom:
    """Roulette-wheel generator: every piece is drawn by corpus frequency.

    The first character of a line is drawn from the single-character table;
    after that two-character words are drawn while at least two slots remain,
    and a last single character fills a remaining slot.

    Attributes set by the constructor:
        key_n1List / key_n2List: keys of the 1- and 2-character tables.
        chance1 / chance2: probability of each key, in the same order.
        charlist1_chance / charlist2_chance: key -> probability.
        sum1 / sum2: total counts of each table.
    """

    def __init__(self):
        charlist1 = self._load_counts('char_result_n1.txt', 1)
        charlist2 = self._load_counts('char_result_n2.txt', 2)
        self.key_n1List = list(charlist1)
        self.key_n2List = list(charlist2)
        self.sum1 = sum(charlist1.values())
        self.sum2 = sum(charlist2.values())
        self.chance1 = [count / self.sum1 for count in charlist1.values()]
        self.chance2 = [count / self.sum2 for count in charlist2.values()]
        # zip() pairs every key with its probability, including the last one.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))

    @staticmethod
    def _load_counts(path, key_len):
        """Read a GBK frequency file into an ordered ``{key: count}`` dict.

        Each line holds a key of ``key_len`` characters, one separator
        character and the count.  Lines shorter than the key are skipped and a
        missing count is read as 0.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as handle:
            for raw in handle:
                # Strip the newline explicitly so a final line without one
                # does not lose its last digit.
                line = raw.rstrip('\r\n')
                if len(line) < key_len:
                    continue
                value = line[key_len + 1:]
                counts[line[:key_len]] = int(value) if value else 0
        return counts

    @staticmethod
    def _draw(keys, chances):
        """Return one key drawn with the given probabilities."""
        return str(np.random.choice(keys, p=chances))

    def ran(self, cipai, dict_cipai):
        """Generate one random line per entry of the tune's length pattern.

        Args:
            cipai: tune name, a key of ``dict_cipai``.
            dict_cipai: tune name -> list of line lengths; a 0 ends the poem.

        Returns:
            List of generated lines (strings).

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum == 0:
                break
            # The first character always comes from the single-character wheel.
            words = [self._draw(self.key_n1List, self.chance1)]
            filled = 1
            # Use two-character words for as long as two slots remain.
            while eachnum - filled > 1:
                words.append(self._draw(self.key_n2List, self.chance2))
                filled += 2
            if filled < eachnum:
                words.append(self._draw(self.key_n1List, self.chance1))
            sentences.append(''.join(words))
        return sentences
- 对于bi-gram,则稍显复杂。同样的,每句首字选取单字的轮盘赌,后续即遍历两字的词典,如若sentence[-1]==word[0],即上一个字与双字词的首字相同,则将这个双字词压入待选列表,最终选取列表中词频最高的双字词填入句子。
bi-gram算法实现:
class BGram:
    """Bi-gram generator: continue a character with its most frequent bigram.

    Attributes set by the constructor:
        key_n1List / key_n2List: keys of the 1- and 2-character tables.
        chance1 / chance2: probability of each key, in the same order.
        charlist1_chance / charlist2_chance: key -> probability.
        sum1 / sum2: total counts of each table.
    """

    def __init__(self):
        charlist1 = self._load_counts('char_result_n1.txt', 1)
        charlist2 = self._load_counts('char_result_n2.txt', 2)
        self.key_n1List = list(charlist1)
        self.key_n2List = list(charlist2)
        self.sum1 = sum(charlist1.values())
        self.sum2 = sum(charlist2.values())
        self.chance1 = [count / self.sum1 for count in charlist1.values()]
        self.chance2 = [count / self.sum2 for count in charlist2.values()]
        # zip() pairs every key with its probability, including the last one.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))
        # first character -> second character of its most frequent bigram.
        # Built once so generation does not rescan every bigram per position;
        # on equal frequency the bigram listed first in the file wins.
        self._best_next = {}
        best_score = {}
        for pair in self.key_n2List:
            score = self.charlist2_chance[pair]
            head = pair[0]
            if head not in best_score or score > best_score[head]:
                best_score[head] = score
                self._best_next[head] = pair[1]

    @staticmethod
    def _load_counts(path, key_len):
        """Read a GBK frequency file into an ordered ``{key: count}`` dict.

        Each line holds a key of ``key_len`` characters, one separator
        character and the count.  Lines shorter than the key are skipped and a
        missing count is read as 0.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as handle:
            for raw in handle:
                # Strip the newline explicitly so a final line without one
                # does not lose its last digit.
                line = raw.rstrip('\r\n')
                if len(line) < key_len:
                    continue
                value = line[key_len + 1:]
                counts[line[:key_len]] = int(value) if value else 0
        return counts

    def _random_char(self):
        """Draw one character from the single-character roulette wheel."""
        return str(np.random.choice(self.key_n1List, p=self.chance1))

    def _next_char(self, previous):
        """Return the best bigram successor of ``previous``.

        Falls back to a random single character when no bigram starts with
        ``previous`` (this case used to raise IndexError).
        """
        follower = self._best_next.get(previous)
        if follower is None:
            return self._random_char()
        return follower

    def B_gram(self, cipai, dict_cipai):
        """Generate one line per entry of the tune's length pattern.

        Each line starts with a random character.  While more than one slot
        remains, the next character is the bigram successor of the previous
        one, followed by another random character; the final slot of a line
        is always a random character.

        Args:
            cipai: tune name, a key of ``dict_cipai``.
            dict_cipai: tune name -> list of line lengths; a 0 ends the poem.

        Returns:
            List of generated lines (strings).

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum == 0:
                break
            words = []
            i = 0
            while i < eachnum:
                if i == 0:
                    words.append(self._random_char())
                    i += 1
                if eachnum - i > 1:
                    # Candidates depend only on the current previous character,
                    # never on candidates collected for earlier positions.
                    words.append(self._next_char(words[-1]))
                    i += 1
                if i == eachnum:
                    break
                words.append(self._random_char())
                i += 1
            sentences.append(''.join(words))
        return sentences
效果演示
random:
BI-GRAM:
如上图,可以看到,bi-gram的生成效果还是要优于随机生成的。
由于实验一中的分词的原因,在实验二中不可避免地,出现了乱码问题:
同样的,想要消除上述问题,只有在文本处理上下功夫,处理方式如实验一中所说的,直接使用编码的方式来排除,应该可以彻底规避上述问题。
完整代码附录
import tkinter.messagebox as messagebox
from tkinter import *
import numpy as np
class GUI:
    """Tkinter window that asks for a tune name and shows a generated Song ci.

    Constructing the class creates both generators, loads the tune table from
    ``Ci.txt``, builds the widgets and then enters the Tk main loop, so the
    constructor only returns once the window has been closed.
    """

    # Punctuation that separates the lines of a poem inside ``Ci.txt``.
    _SEPARATORS = (',', '。', '!', '、', '?', '【', '】')

    def __init__(self):
        self.window = Tk()
        self.window.geometry('720x680')
        self.window.title('宋词生成器')
        self.sentence_For_cook = ''
        self.v1 = IntVar()  # selected algorithm: 0 = none, 1 = random, 2 = bi-gram
        self.v1.set(0)
        self.v2 = StringVar()  # tune name typed by the user
        self.ci = {}  # tune name -> list of per-line character counts
        self.algorithmrandom = algorithmrandom()
        self.BGram = BGram()
        self.cipai = []  # tune names in file order
        self._load_ci('Ci.txt')
        self._build_widgets()
        self.window.mainloop()

    def _load_ci(self, path):
        """Fill ``self.cipai`` and ``self.ci`` from the GBK-encoded poem file.

        A line without '。' is taken as a tune name; any other non-empty line
        is a poem body whose punctuation is replaced by spaces and whose
        pieces are measured.  The trailing empty piece yields a final ``0``
        that the generators use as an end-of-poem marker.
        """
        word_nums = []
        with open(path, 'rb') as res_file:
            for raw in res_file:
                # rstrip copes with CRLF, LF and a last line without newline.
                text = raw.decode('gbk', 'ignore').rstrip('\r\n')
                if not text:
                    continue
                if '。' not in text:
                    self.cipai.append(text)
                    continue
                for mark in self._SEPARATORS:
                    text = text.replace(mark, ' ')
                word_nums.append([len(piece) for piece in text.split(' ')])
        # The i-th tune name is paired with the i-th poem body (assumes the
        # file alternates name/body).  zip() also maps the last pair and never
        # indexes past the shorter list; the first poem of a tune wins.
        for name, counts in zip(self.cipai, word_nums):
            self.ci.setdefault(name, counts)

    def _build_widgets(self):
        """Create and lay out every widget of the window."""
        frame = Frame(self.window)
        frame1 = Frame(self.window)
        input_Sentence = Entry(frame1, textvariable=self.v2, justify=LEFT,
                               width=25, font=('宋体', 13))
        frame.pack()
        frame1.pack()
        welcome = Label(frame, text='苏轼模拟器', font=('黑体', 20),
                        anchor='center', bg='red')
        please_input = Label(frame1, text='·请输入词牌名:', font=('黑体', 13),
                             anchor='n')
        algo_choose = Label(frame1, text='·请选择生成算法:', font=('黑体', 13))
        execute_Button = Button(frame1, text='开始', font=('黑体', 15),
                                command=self.printWords)
        exit_Button = Button(frame1, text='退出', font=('黑体', 15),
                             command=self.exit)
        words_output = Label(frame1, text='以下为生成结果:', font=('黑体', 13))
        # A font is a (family, size) tuple; the former '楷体,16' was one plain
        # string and therefore not a valid family/size pair.
        self.result_print = Text(frame1, font=('楷体', 16), width=50, height=25)
        self.result_print.insert(INSERT, '词牌名举例:酒泉子、苏幕遮、甘草子、送征衣、昼夜乐、西江月、玉楼春、惜春郎、永遇乐、卜算子、尉迟杯、巫山一段云、婆罗门令 and so on...')
        welcome.grid()
        please_input.grid(row=1, column=0)
        input_Sentence.grid(row=2)
        algo_choose.grid()
        algos = [('基于概率的随机生成', 1), ('B-gram', 2)]
        for algo, num in algos:
            # indicatoron only changes how the button is drawn
            b = Radiobutton(frame1, text=algo, variable=self.v1, value=num,
                            indicatoron=True)
            b.grid(column=0)
        execute_Button.grid(row=7, column=0)
        exit_Button.grid(row=7, column=1)
        words_output.grid()
        self.result_print.grid()

    def exit(self):
        """Terminate the program with exit status 0."""
        raise SystemExit(0)

    def __getAlgo(self):
        """Return the value of the selected algorithm radio button."""
        return self.v1.get()

    def printWords(self):
        """Generate a poem for the entered tune name and show it.

        Clears the output box first.  A message box is shown, and nothing is
        generated, when no algorithm is selected, the tune name is empty, or
        the tune name is not in the loaded table.
        """
        self.result_print.delete('1.0', 'end')
        algo = self.__getAlgo()
        cipai = self.v2.get()
        ready = True
        if algo == 0:
            messagebox.showinfo('Error', '请先选择生成算法!')
            ready = False
        if cipai == '':
            messagebox.showinfo('Error', '请输入词牌名!')
            ready = False
        if not ready:
            return
        if cipai not in self.ci:
            # Previously this fell through to a KeyError inside the generator.
            messagebox.showinfo('Error', '未收录该词牌名,请重新输入!')
            return
        if algo == 1:
            result = self.algorithmrandom.ran(cipai, self.ci)
        elif algo == 2:
            result = self.BGram.B_gram(cipai, self.ci)
        else:
            return
        # Join explicitly instead of relying on Tcl's list-to-string conversion.
        self.result_print.insert(INSERT, ' '.join(result))
class algorithmrandom:
    """Roulette-wheel generator: every piece is drawn by corpus frequency.

    The first character of a line is drawn from the single-character table;
    after that two-character words are drawn while at least two slots remain,
    and a last single character fills a remaining slot.

    Attributes set by the constructor:
        key_n1List / key_n2List: keys of the 1- and 2-character tables.
        chance1 / chance2: probability of each key, in the same order.
        charlist1_chance / charlist2_chance: key -> probability.
        sum1 / sum2: total counts of each table.
    """

    def __init__(self):
        charlist1 = self._load_counts('char_result_n1.txt', 1)
        charlist2 = self._load_counts('char_result_n2.txt', 2)
        self.key_n1List = list(charlist1)
        self.key_n2List = list(charlist2)
        self.sum1 = sum(charlist1.values())
        self.sum2 = sum(charlist2.values())
        self.chance1 = [count / self.sum1 for count in charlist1.values()]
        self.chance2 = [count / self.sum2 for count in charlist2.values()]
        # zip() pairs every key with its probability, including the last one.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))

    @staticmethod
    def _load_counts(path, key_len):
        """Read a GBK frequency file into an ordered ``{key: count}`` dict.

        Each line holds a key of ``key_len`` characters, one separator
        character and the count.  Lines shorter than the key are skipped and a
        missing count is read as 0.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as handle:
            for raw in handle:
                # Strip the newline explicitly so a final line without one
                # does not lose its last digit.
                line = raw.rstrip('\r\n')
                if len(line) < key_len:
                    continue
                value = line[key_len + 1:]
                counts[line[:key_len]] = int(value) if value else 0
        return counts

    @staticmethod
    def _draw(keys, chances):
        """Return one key drawn with the given probabilities."""
        return str(np.random.choice(keys, p=chances))

    def ran(self, cipai, dict_cipai):
        """Generate one random line per entry of the tune's length pattern.

        Args:
            cipai: tune name, a key of ``dict_cipai``.
            dict_cipai: tune name -> list of line lengths; a 0 ends the poem.

        Returns:
            List of generated lines (strings).

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum == 0:
                break
            # The first character always comes from the single-character wheel.
            words = [self._draw(self.key_n1List, self.chance1)]
            filled = 1
            # Use two-character words for as long as two slots remain.
            while eachnum - filled > 1:
                words.append(self._draw(self.key_n2List, self.chance2))
                filled += 2
            if filled < eachnum:
                words.append(self._draw(self.key_n1List, self.chance1))
            sentences.append(''.join(words))
        return sentences
class BGram:
    """Bi-gram generator: continue a character with its most frequent bigram.

    Attributes set by the constructor:
        key_n1List / key_n2List: keys of the 1- and 2-character tables.
        chance1 / chance2: probability of each key, in the same order.
        charlist1_chance / charlist2_chance: key -> probability.
        sum1 / sum2: total counts of each table.
    """

    def __init__(self):
        charlist1 = self._load_counts('char_result_n1.txt', 1)
        charlist2 = self._load_counts('char_result_n2.txt', 2)
        self.key_n1List = list(charlist1)
        self.key_n2List = list(charlist2)
        self.sum1 = sum(charlist1.values())
        self.sum2 = sum(charlist2.values())
        self.chance1 = [count / self.sum1 for count in charlist1.values()]
        self.chance2 = [count / self.sum2 for count in charlist2.values()]
        # zip() pairs every key with its probability, including the last one.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))
        # first character -> second character of its most frequent bigram.
        # Built once so generation does not rescan every bigram per position;
        # on equal frequency the bigram listed first in the file wins.
        self._best_next = {}
        best_score = {}
        for pair in self.key_n2List:
            score = self.charlist2_chance[pair]
            head = pair[0]
            if head not in best_score or score > best_score[head]:
                best_score[head] = score
                self._best_next[head] = pair[1]

    @staticmethod
    def _load_counts(path, key_len):
        """Read a GBK frequency file into an ordered ``{key: count}`` dict.

        Each line holds a key of ``key_len`` characters, one separator
        character and the count.  Lines shorter than the key are skipped and a
        missing count is read as 0.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as handle:
            for raw in handle:
                # Strip the newline explicitly so a final line without one
                # does not lose its last digit.
                line = raw.rstrip('\r\n')
                if len(line) < key_len:
                    continue
                value = line[key_len + 1:]
                counts[line[:key_len]] = int(value) if value else 0
        return counts

    def _random_char(self):
        """Draw one character from the single-character roulette wheel."""
        return str(np.random.choice(self.key_n1List, p=self.chance1))

    def _next_char(self, previous):
        """Return the best bigram successor of ``previous``.

        Falls back to a random single character when no bigram starts with
        ``previous`` (this case used to raise IndexError).
        """
        follower = self._best_next.get(previous)
        if follower is None:
            return self._random_char()
        return follower

    def B_gram(self, cipai, dict_cipai):
        """Generate one line per entry of the tune's length pattern.

        Each line starts with a random character.  While more than one slot
        remains, the next character is the bigram successor of the previous
        one, followed by another random character; the final slot of a line
        is always a random character.

        Args:
            cipai: tune name, a key of ``dict_cipai``.
            dict_cipai: tune name -> list of line lengths; a 0 ends the poem.

        Returns:
            List of generated lines (strings).

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum == 0:
                break
            words = []
            i = 0
            while i < eachnum:
                if i == 0:
                    words.append(self._random_char())
                    i += 1
                if eachnum - i > 1:
                    # Candidates depend only on the current previous character,
                    # never on candidates collected for earlier positions.
                    words.append(self._next_char(words[-1]))
                    i += 1
                if i == eachnum:
                    break
                words.append(self._random_char())
                i += 1
            sentences.append(''.join(words))
        return sentences
# Build the window; GUI.__init__ runs the Tk main loop, so this call only
# returns after the window has been closed.
gui = GUI()