Python实现宋词生成(带界面)

Autumn_begins

已于 2023-06-11 14:46:52 修改

阅读量1k

点赞数 2

分类专栏： Python实现宋词生成(NLP小白) 文章标签： python nlp 自然语言处理中文分词

于 2023-06-03 15:09:03 首次发布

本文链接：https://blog.csdn.net/m0_51373023/article/details/131021296

版权

Python实现宋词生成(NLP小白) 专栏收录该内容

2 篇文章

订阅专栏

辛苦手码不易, 如若有帮助烦请不吝收藏点赞 😃
本文主要在学校实验的基础上，按照实现顺序对实验的实现进行了讲解
在这里主要分享的是生成宋词的部分

模型方法

在本次分享中，我选择基于上一篇博客得到的单双字宋词词频，自动生成宋词。
上一篇博客地址:
分词&统计词频

我选取了两种宋词生成方式。一是基于已有词频，选取轮盘赌的方式，基于词频概率进行随机生成；二是参考Bi-gram的思想，基于上文文本选取出现概率最高的进行生成。
定义：
所谓bi-gram，即假设下一个词的出现只依赖于上一个词，并用极大似然估计（词频之比）来估计这一条件概率。在本实验中，我假设下一个字的出现仅和上一个字有关，即公式(1)中，B的出现仅和A有关。参照此思想，我设计了相关选词逻辑，在下一节进行说明。
$$P(B \mid A) = \frac{\mathrm{count}(AB)}{\mathrm{count}(A)} \tag{1}$$

系统设计

众所周知，宋词的创作顺序为，选好相应词牌名后，根据词牌名的字数、平仄、韵律等的要求，向规定的空位中填词，在这一点上，本实验的宋词生成器与其如出一辙，并使用Tkinter设计了图形界面。
我将系统主要设计为以下三步：

设计界面交互逻辑，本次实验我设计的交互逻辑为，由用户在输入框中输入词牌名，并选择文本生成算法（随机生成或bi-gram），选择完毕后点击“开始”按钮，即在下方文本框中显示宋词内容。交互逻辑确定后，设计好图形界面并保留对应接口，以便呈现结果。
对源文本进行进一步处理，将词牌名与词内容的句数、字数对应。同样对文本文件按行读取，在这里，对每行遍历解码后，为规避实验一中出现的问题，我选择以有无句号（“。”）作为区分词牌名与词内容的关键：若无句号，则直接压入词牌列表；若有句号，则将全部标点符号换为空格，并读取划分出的各单句字数，以一首词的各句字数作为一个列表，装有字数的列表再作为新的元素压入新的列表。由于理论上，字数列表与词牌名列表应是一一对应的，故直接将其用字典结构拼接起来，由此，一个词牌对应一个字数列表，例如：‘浣溪沙’= [3,3,4,5,7,4,5,3,3,4,5,7,4,5,0]（末尾的零是读取字数的逻辑造成的：句末标点被换成空格后，按空格切分会在末尾多出一个空串，其长度为0；这里刚好可以让其作为一首词结尾的标识位）。
GUI代码实现:

class GUI:
    """Tkinter window that asks for a cipai (tune name) and shows a generated ci.

    On construction the window is built, ``Ci.txt`` is parsed into a mapping
    from tune name to per-line character counts, both generators are created,
    and the Tk main loop is entered (so the constructor only returns once the
    window is closed).
    """

    def __init__(self):

        self.window = Tk()
        self.window.geometry('720x680')
        self.window.title('宋词生成器')
        self.sentence_For_cook = ''
        self.v1 = IntVar()  # selected algorithm: 0 = none, 1 = random, 2 = bi-gram
        self.v1.set(0)
        self.v2 = StringVar()  # tune name typed by the user
        self.algorithmrandom = algorithmrandom()
        self.BGram = BGram()
        with open('Ci.txt', 'rb') as res_file:
            # self.cipai: every title line in file order (duplicates kept);
            # self.ci: tune name -> list of per-line character counts.
            self.cipai, self.ci = self._parse_ci(res_file)

        frame = Frame(self.window)
        frame1 = Frame(self.window)

        input_Sentence = Entry(frame1, textvariable=self.v2, justify=LEFT, width=25, font=('宋体', 13))
        frame.pack()
        frame1.pack()

        welcome = Label(frame, text='苏轼模拟器', font=('黑体', 20), anchor='center', bg='red')

        please_input = Label(frame1, text='·请输入词牌名：', font=('黑体', 13), anchor='n')
        algo_choose = Label(frame1, text='·请选择生成算法：', font=('黑体', 13))
        execute_Button = Button(frame1, text='开始', font=('黑体', 15), command=self.printWords)
        # Use the bound method: the bare name `exit` is only injected by the
        # `site` module and is not guaranteed to exist (e.g. frozen builds).
        exit_Button = Button(frame1, text='退出', font=('黑体', 15), command=self.exit)
        words_output = Label(frame1, text='以下为生成结果：', font=('黑体', 13))
        # The font must be a (family, size) tuple; ('楷体,16') was a single string.
        self.result_print = Text(frame1, font=('楷体', 16), width=50, height=25)
        self.result_print.insert(INSERT, '词牌名举例：酒泉子、苏幕遮、甘草子、送征衣、昼夜乐、西江月、玉楼春、惜春郎、永遇乐、卜算子、尉迟杯、巫山一段云、婆罗门令 and so on...')

        # The grid() calls below rely on Tk's automatic row assignment, so
        # their order determines the layout.
        welcome.grid()

        please_input.grid(row=1, column=0)
        input_Sentence.grid(row=2)
        algo_choose.grid()
        algos = [('基于概率的随机生成', 1), ('B-gram', 2)]
        for algo, num in algos:
            b = Radiobutton(frame1, text=algo, variable=self.v1, value=num, indicatoron=True)
            b.grid(column=0)
        execute_Button.grid(row=7, column=0)

        exit_Button.grid(row=7, column=1)
        words_output.grid()
        self.result_print.grid()
        self.window.mainloop()

    @staticmethod
    def _parse_ci(raw_lines):
        """Split the corpus into title lines and per-line character counts.

        Args:
            raw_lines: iterable of GBK-encoded ``bytes`` lines.

        Returns:
            ``(names, table)``. ``names`` lists every title line in order.
            ``table`` maps each title to the character counts of the ci that
            follows it; when a title occurs more than once the first
            occurrence wins.
        """
        punctuation = ('，', '。', '！', '、', '？', '【', '】')
        names = []
        word_nums = []
        for raw in raw_lines:
            # Strip the line ending explicitly instead of slicing off two
            # characters, which corrupted titles in files with LF endings.
            line = raw.decode('gbk', 'ignore').rstrip('\r\n')
            if not line:
                continue
            if '。' not in line:
                # A line without a full stop is treated as a title.
                names.append(line)
                continue
            for mark in punctuation:
                line = line.replace(mark, ' ')
            # The trailing punctuation mark leaves an empty last piece, whose
            # length 0 the generators use as the end-of-ci marker.
            word_nums.append([len(piece) for piece in line.split(' ')])

        table = {}
        # zip() pairs the i-th title with the i-th ci and stops at the shorter
        # list, so the last ci is no longer dropped.
        for name, counts in zip(names, word_nums):
            table.setdefault(name, counts)
        return names, table

    def exit(self):
        """Close the window, which ends the main loop started in ``__init__``."""
        self.window.destroy()

    def __getAlgo(self):
        """Return the value of the algorithm radio group (0 when unset)."""
        algo = self.v1.get()
        return algo

    def printWords(self):
        """Validate the form, run the chosen generator and display its output."""
        self.result_print.delete('1.0', 'end')
        algo = self.__getAlgo()
        cipai = self.v2.get().strip()
        # Stop after reporting a problem; falling through used to raise
        # KeyError inside the generators for empty or unknown tune names.
        if algo == 0:
            messagebox.showinfo('Error', '请先选择生成算法！')
            return
        if cipai == '':
            messagebox.showinfo('Error', '请输入词牌名！')
            return
        if cipai not in self.ci:
            messagebox.showinfo('Error', '未找到该词牌名！')
            return
        if algo == 1:
            result = self.algorithmrandom.ran(cipai, self.ci)
        elif algo == 2:
            result = self.BGram.B_gram(cipai, self.ci)
        else:
            return
        # Join explicitly rather than handing a Python list to Tk.
        self.result_print.insert(INSERT, ' '.join(result))

对文本进行处理后，则可以专注于编写保留的算法接口。在这里，我设计了基于词频概率的随机生成与bi-gram生成算法。
对于随机生成，我统计了单双字每个词的出现概率后（即用总数作分母来除各词的对应词频），开始使用轮盘赌的方式填词。首先，检测到输入词牌后，依据其每句字数填充，每句首字选取单字的轮盘赌生成，后续只要有两字的空间，就选取两字的轮盘赌随机生成，空间不够则选取单字的轮盘赌生成，此为随机生成算法。
随机生成算法实现:

class algorithmrandom:
    """Roulette-wheel ci generator.

    The first character of every line is drawn from the single-character
    table; after that two-character entries are drawn while at least two
    slots remain, and a final single character fills an odd leftover slot.
    """

    def __init__(self, unigram_path='char_result_n1.txt',
                 bigram_path='char_result_n2.txt'):
        """Load both frequency tables and turn the counts into probabilities.

        Args:
            unigram_path: GBK text file of single-character counts.
            bigram_path: GBK text file of two-character counts.

        Raises:
            ValueError: if a table has no positive counts, or a count is not
                an integer.
        """
        counts1 = self._load_counts(unigram_path, 1)
        counts2 = self._load_counts(bigram_path, 2)
        self.key_n1List = list(counts1)
        self.key_n2List = list(counts2)
        self.sum1 = sum(counts1.values())
        self.sum2 = sum(counts2.values())
        if self.sum1 == 0 or self.sum2 == 0:
            raise ValueError('frequency table is empty or all counts are zero')
        # chanceN[i] is the probability of key_nNList[i].
        self.chance1 = [count / self.sum1 for count in counts1.values()]
        self.chance2 = [count / self.sum2 for count in counts2.values()]
        # Build the lookup tables with zip so every key, including the last
        # one, receives its probability.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))

    @staticmethod
    def _load_counts(path, key_len):
        """Read ``key<sep>count`` lines into an ordered ``{key: int}`` dict.

        The key is the first ``key_len`` characters, the next character is
        skipped as the separator, and the rest of the line is the count. A
        missing count is read as 0. Later duplicates overwrite the count but
        keep the key's original position.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as table:
            for line in table:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                raw_count = line[key_len + 1:].strip()
                counts[line[:key_len]] = int(raw_count) if raw_count else 0
        return counts

    def _pick(self, keys, probabilities):
        """Draw one entry of ``keys`` according to ``probabilities``."""
        return str(np.random.choice(keys, p=probabilities))

    def ran(self, cipai, dict_cipai):
        """Generate one ci for the given tune name.

        Args:
            cipai: tune name; must be a key of ``dict_cipai``.
            dict_cipai: mapping from tune name to per-line character counts.
                A count of 0 marks the end of the ci.

        Returns:
            List of generated lines, one string per character count.

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum <= 0:
                break
            words = [self._pick(self.key_n1List, self.chance1)]
            filled = 1
            # Keep using two-character entries while two slots remain; the
            # previous loop inserted a single character after every pair.
            while eachnum - filled >= 2:
                words.append(self._pick(self.key_n2List, self.chance2))
                filled += 2
            if filled < eachnum:
                words.append(self._pick(self.key_n1List, self.chance1))
            sentences.append(''.join(words))
        return sentences

对于bi-gram，则稍显复杂。同样的，每句首字选取单字的轮盘赌，后续即遍历两字的词典，如若sentence[-1]==word[0]，即上一个字与双字词的首字相同，则将这个双字词压入待选列表，最终选取列表中词频最高的双字词填入句子。
bi-gram算法实现:

class BGram:
    """Ci generator that extends a line with the most frequent bigram follower.

    Each line starts with a roulette-wheel single character. Positions then
    alternate between the follower of the previous character (taken from the
    highest-probability two-character entry that starts with it) and another
    roulette-wheel single character.
    """

    def __init__(self, unigram_path='char_result_n1.txt',
                 bigram_path='char_result_n2.txt'):
        """Load both frequency tables and turn the counts into probabilities.

        Args:
            unigram_path: GBK text file of single-character counts.
            bigram_path: GBK text file of two-character counts.

        Raises:
            ValueError: if a table has no positive counts, or a count is not
                an integer.
        """
        counts1 = self._load_counts(unigram_path, 1)
        counts2 = self._load_counts(bigram_path, 2)
        self.key_n1List = list(counts1)
        self.key_n2List = list(counts2)
        self.sum1 = sum(counts1.values())
        self.sum2 = sum(counts2.values())
        if self.sum1 == 0 or self.sum2 == 0:
            raise ValueError('frequency table is empty or all counts are zero')
        # chanceN[i] is the probability of key_nNList[i].
        self.chance1 = [count / self.sum1 for count in counts1.values()]
        self.chance2 = [count / self.sum2 for count in counts2.values()]
        # Build the lookup tables with zip so every key, including the last
        # one, receives its probability.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))

    @staticmethod
    def _load_counts(path, key_len):
        """Read ``key<sep>count`` lines into an ordered ``{key: int}`` dict.

        The key is the first ``key_len`` characters, the next character is
        skipped as the separator, and the rest of the line is the count. A
        missing count is read as 0. Later duplicates overwrite the count but
        keep the key's original position.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as table:
            for line in table:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                raw_count = line[key_len + 1:].strip()
                counts[line[:key_len]] = int(raw_count) if raw_count else 0
        return counts

    def _pick_single(self):
        """Draw one single character according to ``self.chance1``."""
        return str(np.random.choice(self.key_n1List, p=self.chance1))

    def _best_follower(self, prev):
        """Return the second character of the likeliest bigram starting with ``prev``.

        Only bigrams that start with ``prev`` are considered, so candidates
        from earlier positions can no longer leak into the choice. Returns
        ``None`` when no two-character entry starts with ``prev``. Ties go to
        the entry that appears first in ``self.key_n2List``.
        """
        candidates = [
            pair for pair in self.key_n2List
            if len(pair) == 2 and pair[0] == prev
        ]
        if not candidates:
            return None
        best = max(candidates, key=lambda pair: self.charlist2_chance[pair])
        return best[1]

    def B_gram(self, cipai, dict_cipai):
        """Generate one ci for the given tune name.

        Args:
            cipai: tune name; must be a key of ``dict_cipai``.
            dict_cipai: mapping from tune name to per-line character counts.
                A count of 0 marks the end of the ci.

        Returns:
            List of generated lines, one string per character count.

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum <= 0:
                break
            words = []
            i = 0
            while i < eachnum:
                if i == 0:
                    words.append(self._pick_single())
                    i += 1
                if eachnum - i > 1:
                    follower = self._best_follower(words[-1])
                    # With no matching bigram, fall through to the roulette
                    # draw below instead of indexing an empty list.
                    if follower is not None:
                        words.append(follower)
                        i += 1
                if i == eachnum:
                    break
                words.append(self._pick_single())
                i += 1
            sentences.append(''.join(words))
        return sentences

效果演示

random:
在这里插入图片描述

BI-GRAM:
在这里插入图片描述

如上图，可以看到，bi-gram的生成效果还是要优于随机生成的。
由于实验一中的分词的原因，在实验二中不可避免地，出现了乱码问题：
在这里插入图片描述
同样的，想要消除上述问题，只有在文本处理上下功夫。处理方式如实验一中所说，直接使用编码的方式来排除乱码字符，应该可以彻底规避上述问题。

完整代码附录

import tkinter.messagebox as messagebox
from tkinter import *

import numpy as np


class GUI:
    """Tkinter window that asks for a cipai (tune name) and shows a generated ci.

    On construction the window is built, ``Ci.txt`` is parsed into a mapping
    from tune name to per-line character counts, both generators are created,
    and the Tk main loop is entered (so the constructor only returns once the
    window is closed).
    """

    def __init__(self):

        self.window = Tk()
        self.window.geometry('720x680')
        self.window.title('宋词生成器')
        self.sentence_For_cook = ''
        self.v1 = IntVar()  # selected algorithm: 0 = none, 1 = random, 2 = bi-gram
        self.v1.set(0)
        self.v2 = StringVar()  # tune name typed by the user
        self.algorithmrandom = algorithmrandom()
        self.BGram = BGram()
        with open('Ci.txt', 'rb') as res_file:
            # self.cipai: every title line in file order (duplicates kept);
            # self.ci: tune name -> list of per-line character counts.
            self.cipai, self.ci = self._parse_ci(res_file)

        frame = Frame(self.window)
        frame1 = Frame(self.window)

        input_Sentence = Entry(frame1, textvariable=self.v2, justify=LEFT, width=25, font=('宋体', 13))
        frame.pack()
        frame1.pack()

        welcome = Label(frame, text='苏轼模拟器', font=('黑体', 20), anchor='center', bg='red')

        please_input = Label(frame1, text='·请输入词牌名：', font=('黑体', 13), anchor='n')
        algo_choose = Label(frame1, text='·请选择生成算法：', font=('黑体', 13))
        execute_Button = Button(frame1, text='开始', font=('黑体', 15), command=self.printWords)
        # Use the bound method: the bare name `exit` is only injected by the
        # `site` module and is not guaranteed to exist (e.g. frozen builds).
        exit_Button = Button(frame1, text='退出', font=('黑体', 15), command=self.exit)
        words_output = Label(frame1, text='以下为生成结果：', font=('黑体', 13))
        # The font must be a (family, size) tuple; ('楷体,16') was a single string.
        self.result_print = Text(frame1, font=('楷体', 16), width=50, height=25)
        self.result_print.insert(INSERT, '词牌名举例：酒泉子、苏幕遮、甘草子、送征衣、昼夜乐、西江月、玉楼春、惜春郎、永遇乐、卜算子、尉迟杯、巫山一段云、婆罗门令 and so on...')

        # The grid() calls below rely on Tk's automatic row assignment, so
        # their order determines the layout.
        welcome.grid()

        please_input.grid(row=1, column=0)
        input_Sentence.grid(row=2)
        algo_choose.grid()
        algos = [('基于概率的随机生成', 1), ('B-gram', 2)]
        for algo, num in algos:
            b = Radiobutton(frame1, text=algo, variable=self.v1, value=num, indicatoron=True)
            b.grid(column=0)
        execute_Button.grid(row=7, column=0)

        exit_Button.grid(row=7, column=1)
        words_output.grid()
        self.result_print.grid()
        self.window.mainloop()

    @staticmethod
    def _parse_ci(raw_lines):
        """Split the corpus into title lines and per-line character counts.

        Args:
            raw_lines: iterable of GBK-encoded ``bytes`` lines.

        Returns:
            ``(names, table)``. ``names`` lists every title line in order.
            ``table`` maps each title to the character counts of the ci that
            follows it; when a title occurs more than once the first
            occurrence wins.
        """
        punctuation = ('，', '。', '！', '、', '？', '【', '】')
        names = []
        word_nums = []
        for raw in raw_lines:
            # Strip the line ending explicitly instead of slicing off two
            # characters, which corrupted titles in files with LF endings.
            line = raw.decode('gbk', 'ignore').rstrip('\r\n')
            if not line:
                continue
            if '。' not in line:
                # A line without a full stop is treated as a title.
                names.append(line)
                continue
            for mark in punctuation:
                line = line.replace(mark, ' ')
            # The trailing punctuation mark leaves an empty last piece, whose
            # length 0 the generators use as the end-of-ci marker.
            word_nums.append([len(piece) for piece in line.split(' ')])

        table = {}
        # zip() pairs the i-th title with the i-th ci and stops at the shorter
        # list, so the last ci is no longer dropped.
        for name, counts in zip(names, word_nums):
            table.setdefault(name, counts)
        return names, table

    def exit(self):
        """Close the window, which ends the main loop started in ``__init__``."""
        self.window.destroy()

    def __getAlgo(self):
        """Return the value of the algorithm radio group (0 when unset)."""
        algo = self.v1.get()
        return algo

    def printWords(self):
        """Validate the form, run the chosen generator and display its output."""
        self.result_print.delete('1.0', 'end')
        algo = self.__getAlgo()
        cipai = self.v2.get().strip()
        # Stop after reporting a problem; falling through used to raise
        # KeyError inside the generators for empty or unknown tune names.
        if algo == 0:
            messagebox.showinfo('Error', '请先选择生成算法！')
            return
        if cipai == '':
            messagebox.showinfo('Error', '请输入词牌名！')
            return
        if cipai not in self.ci:
            messagebox.showinfo('Error', '未找到该词牌名！')
            return
        if algo == 1:
            result = self.algorithmrandom.ran(cipai, self.ci)
        elif algo == 2:
            result = self.BGram.B_gram(cipai, self.ci)
        else:
            return
        # Join explicitly rather than handing a Python list to Tk.
        self.result_print.insert(INSERT, ' '.join(result))


class algorithmrandom:
    """Roulette-wheel ci generator.

    The first character of every line is drawn from the single-character
    table; after that two-character entries are drawn while at least two
    slots remain, and a final single character fills an odd leftover slot.
    """

    def __init__(self, unigram_path='char_result_n1.txt',
                 bigram_path='char_result_n2.txt'):
        """Load both frequency tables and turn the counts into probabilities.

        Args:
            unigram_path: GBK text file of single-character counts.
            bigram_path: GBK text file of two-character counts.

        Raises:
            ValueError: if a table has no positive counts, or a count is not
                an integer.
        """
        counts1 = self._load_counts(unigram_path, 1)
        counts2 = self._load_counts(bigram_path, 2)
        self.key_n1List = list(counts1)
        self.key_n2List = list(counts2)
        self.sum1 = sum(counts1.values())
        self.sum2 = sum(counts2.values())
        if self.sum1 == 0 or self.sum2 == 0:
            raise ValueError('frequency table is empty or all counts are zero')
        # chanceN[i] is the probability of key_nNList[i].
        self.chance1 = [count / self.sum1 for count in counts1.values()]
        self.chance2 = [count / self.sum2 for count in counts2.values()]
        # Build the lookup tables with zip so every key, including the last
        # one, receives its probability.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))

    @staticmethod
    def _load_counts(path, key_len):
        """Read ``key<sep>count`` lines into an ordered ``{key: int}`` dict.

        The key is the first ``key_len`` characters, the next character is
        skipped as the separator, and the rest of the line is the count. A
        missing count is read as 0. Later duplicates overwrite the count but
        keep the key's original position.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as table:
            for line in table:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                raw_count = line[key_len + 1:].strip()
                counts[line[:key_len]] = int(raw_count) if raw_count else 0
        return counts

    def _pick(self, keys, probabilities):
        """Draw one entry of ``keys`` according to ``probabilities``."""
        return str(np.random.choice(keys, p=probabilities))

    def ran(self, cipai, dict_cipai):
        """Generate one ci for the given tune name.

        Args:
            cipai: tune name; must be a key of ``dict_cipai``.
            dict_cipai: mapping from tune name to per-line character counts.
                A count of 0 marks the end of the ci.

        Returns:
            List of generated lines, one string per character count.

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum <= 0:
                break
            words = [self._pick(self.key_n1List, self.chance1)]
            filled = 1
            # Keep using two-character entries while two slots remain; the
            # previous loop inserted a single character after every pair.
            while eachnum - filled >= 2:
                words.append(self._pick(self.key_n2List, self.chance2))
                filled += 2
            if filled < eachnum:
                words.append(self._pick(self.key_n1List, self.chance1))
            sentences.append(''.join(words))
        return sentences

class BGram:
    """Ci generator that extends a line with the most frequent bigram follower.

    Each line starts with a roulette-wheel single character. Positions then
    alternate between the follower of the previous character (taken from the
    highest-probability two-character entry that starts with it) and another
    roulette-wheel single character.
    """

    def __init__(self, unigram_path='char_result_n1.txt',
                 bigram_path='char_result_n2.txt'):
        """Load both frequency tables and turn the counts into probabilities.

        Args:
            unigram_path: GBK text file of single-character counts.
            bigram_path: GBK text file of two-character counts.

        Raises:
            ValueError: if a table has no positive counts, or a count is not
                an integer.
        """
        counts1 = self._load_counts(unigram_path, 1)
        counts2 = self._load_counts(bigram_path, 2)
        self.key_n1List = list(counts1)
        self.key_n2List = list(counts2)
        self.sum1 = sum(counts1.values())
        self.sum2 = sum(counts2.values())
        if self.sum1 == 0 or self.sum2 == 0:
            raise ValueError('frequency table is empty or all counts are zero')
        # chanceN[i] is the probability of key_nNList[i].
        self.chance1 = [count / self.sum1 for count in counts1.values()]
        self.chance2 = [count / self.sum2 for count in counts2.values()]
        # Build the lookup tables with zip so every key, including the last
        # one, receives its probability.
        self.charlist1_chance = dict(zip(self.key_n1List, self.chance1))
        self.charlist2_chance = dict(zip(self.key_n2List, self.chance2))

    @staticmethod
    def _load_counts(path, key_len):
        """Read ``key<sep>count`` lines into an ordered ``{key: int}`` dict.

        The key is the first ``key_len`` characters, the next character is
        skipped as the separator, and the rest of the line is the count. A
        missing count is read as 0. Later duplicates overwrite the count but
        keep the key's original position.
        """
        counts = {}
        with open(path, 'r', encoding='gbk') as table:
            for line in table:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                raw_count = line[key_len + 1:].strip()
                counts[line[:key_len]] = int(raw_count) if raw_count else 0
        return counts

    def _pick_single(self):
        """Draw one single character according to ``self.chance1``."""
        return str(np.random.choice(self.key_n1List, p=self.chance1))

    def _best_follower(self, prev):
        """Return the second character of the likeliest bigram starting with ``prev``.

        Only bigrams that start with ``prev`` are considered, so candidates
        from earlier positions can no longer leak into the choice. Returns
        ``None`` when no two-character entry starts with ``prev``. Ties go to
        the entry that appears first in ``self.key_n2List``.
        """
        candidates = [
            pair for pair in self.key_n2List
            if len(pair) == 2 and pair[0] == prev
        ]
        if not candidates:
            return None
        best = max(candidates, key=lambda pair: self.charlist2_chance[pair])
        return best[1]

    def B_gram(self, cipai, dict_cipai):
        """Generate one ci for the given tune name.

        Args:
            cipai: tune name; must be a key of ``dict_cipai``.
            dict_cipai: mapping from tune name to per-line character counts.
                A count of 0 marks the end of the ci.

        Returns:
            List of generated lines, one string per character count.

        Raises:
            KeyError: if ``cipai`` is not in ``dict_cipai``.
        """
        sentences = []
        for eachnum in dict_cipai[cipai]:
            if eachnum <= 0:
                break
            words = []
            i = 0
            while i < eachnum:
                if i == 0:
                    words.append(self._pick_single())
                    i += 1
                if eachnum - i > 1:
                    follower = self._best_follower(words[-1])
                    # With no matching bigram, fall through to the roulette
                    # draw below instead of indexing an empty list.
                    if follower is not None:
                        words.append(follower)
                        i += 1
                if i == eachnum:
                    break
                words.append(self._pick_single())
                i += 1
            sentences.append(''.join(words))
        return sentences


# Only open the window when the file is run as a script, so importing the
# module (e.g. to reuse the generator classes) has no side effects.
if __name__ == '__main__':
    gui = GUI()