Project 3: Maximum-Probability Word Segmentation

1. Data Description

  • RenMinData.txt
    People's Daily text, already word-segmented

2. Code Description

CreateLexicon.py: converts tokens to ids and generates two files, RenMinData.id.txt and WordDic.rm.txt.
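Both outputs are plain text: each line of RenMinData.id.txt holds one sentence as space-separated word ids, and each line of WordDic.rm.txt holds one word-id pair, as the code below shows.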

"""
    tocken转化为id
        1. 构建word-id映射字典
        2. 构造id文件
        3. 构造word-id文件
"""
rawDataFile = "../data/RenMinData.txt"
idDataFile = "../data/RenMinData.id.txt"
wordDictFile = "../data/WordDic.rm.txt"

nextId = 1
wordIdDict = {}

# build the word-id mapping dictionary
inputFile = open(rawDataFile, 'r', encoding='utf-8')
s = inputFile.readline().strip()


while len(s) > 0:
    for word in s.split(' '):
        if word not in wordIdDict:
            wordIdDict[word] = nextId
            nextId += 1  # ids are assigned sequentially from 1
    s = inputFile.readline().strip()
inputFile.close()
print("Reading raw data file finished!")
print("Total number of words:", len(wordIdDict))

# Write the id file
inputFile = open(rawDataFile, 'r', encoding='utf-8')  # reopen so the file pointer is back at the start
outputFile = open(idDataFile, 'w', encoding='utf-8')
s = inputFile.readline().strip()
while len(s) > 0:
    words = s.split(' ')
    for i in range(len(words) - 1):
        if words[i] not in wordIdDict:
            print("OOV word found!")
        # otherwise write the word's id to the id file
        else:
            outputFile.write(str(wordIdDict[words[i]]))
            outputFile.write(' ')
    # the last word of the sentence is written without a trailing space
    word = words[len(words) - 1]
    if word not in wordIdDict:
        print("OOV word found!")
    else:
        outputFile.write(str(wordIdDict[word]))
    # newline after each sentence
    outputFile.write("\n")
    s = inputFile.readline().strip()
inputFile.close()
outputFile.close()
print("write id data file finished!")


# Write the word-id file
outputFile = open(wordDictFile, 'w', encoding='utf-8')

for word in wordIdDict.keys():
    outputFile.write(word)
    outputFile.write(' ')
    outputFile.write(str(wordIdDict[word]))
    outputFile.write('\n')

outputFile.close()
print('write word id table file finished')
Output:

Reading raw data file finished!
Total number of words: 25005
write id data file finished!
write word id table file finished

BiLMTrain.py: trains a bigram language model and writes it to the model file BiModel.rm.txt.
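In step 3 below, the raw counts are turned into probabilities with add-one (Laplace) smoothing. Writing $C(\cdot)$ for a count, $V$ for the vocabulary size (25005 here) and $N$ for the total token count, the model stores, for each word $w_1$,

$$P(w_2 \mid w_1) = \frac{C(w_1 w_2) + 1}{C(w_1) + V}, \qquad P_{\mathrm{smooth}}(w_1) = \frac{1}{C(w_1) + V}, \qquad P(w_1) = \frac{C(w_1)}{N},$$

where $P_{\mathrm{smooth}}(w_1)$ is the probability the decoder falls back to for bigrams it has never seen.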

"""
    训练二元语言模型:
        1.将word-id文件内容加载到字典中
        2. Unigram与Bigram计数
        3. 计算二元语言模型概率
"""
idDataFile = "../data/RenMinData.id.txt"
wordDictFile = "../data/WordDic.rm.txt"
biModelFile = "../data/BiModel.rm.txt"

wordIdDict = {}        # word-id pairs
BigramTableList = []   # bigram counts, turned into probabilities in step 3
UnigramCountList = []  # unigram counts, turned into relative frequencies in step 3
SmoothedProbList = []  # smoothed probability for unseen bigrams
TotalNum = 0           # total token count (not deduplicated)

# 1. Load the word-id file into a dictionary
inputFile = open(wordDictFile, 'r', encoding='utf-8')
s = inputFile.readline().strip()
while len(s) > 0:
    words = s.split(' ')
    if words[0] not in wordIdDict:
        wordIdDict[words[0]] = int(words[1])
    s = inputFile.readline().strip()
inputFile.close()
print('Reading word id file finished!')
print('Total number of words:', len(wordIdDict))

# 2. Count unigrams and bigrams (index 0 is unused: word ids start at 1)
for i in range(len(wordIdDict) + 1):
    BigramTableList.append({})
    UnigramCountList.append(0)
    SmoothedProbList.append(0)

inputFile = open(idDataFile, 'r', encoding='utf-8')
s = inputFile.readline().strip()

while len(s) > 0:
    wordList = []
    words = s.split(' ')
    TotalNum += len(words)
    for word in words:
        wordList.append(int(word))
    # count unigrams
    for word in wordList:
        UnigramCountList[word] += 1
    # count bigrams
    for i in range(len(wordList) - 1):
        tmpDict = BigramTableList[wordList[i]]
        if wordList[i + 1] not in tmpDict:
            tmpDict[wordList[i + 1]] = 1
        else:
            tmpDict[wordList[i + 1]] += 1
    # advance to the next sentence
    s = inputFile.readline().strip()
inputFile.close()
print("Reading id data file finished!")

# 3. Compute the bigram probabilities (add-one smoothing)
for wid in range(1, len(wordIdDict) + 1):
    # probability reserved for any bigram starting with wid that was never observed
    SmoothedProbList[wid] = 1 / float(UnigramCountList[wid] + len(wordIdDict))
    ht = BigramTableList[wid]
    for wid1 in ht.keys():
        ht[wid1] = float(ht[wid1] + 1) / float(UnigramCountList[wid] + len(wordIdDict))
    # turn the unigram count into a relative frequency
    UnigramCountList[wid] = float(UnigramCountList[wid]) / float(TotalNum)

# 4. Save the model
outputFile = open(biModelFile, 'w', encoding='utf-8')
outputFile.write(str(len(wordIdDict)) + " " + str(TotalNum) + '\n')
for wid1 in range(1, len(wordIdDict) + 1):
    outputFile.write(str(UnigramCountList[wid1]) + ' ')
    outputFile.write(str(SmoothedProbList[wid1]))
    ht = BigramTableList[wid1]
    for wid2 in ht.keys():
        outputFile.write(" " + str(wid2) + " " + str(ht[wid2]))
    outputFile.write('\n')
outputFile.close()
print('Writing model file finished!')
Output:

Reading word id file finished!
Total number of words: 25005
Reading id data file finished!
Writing model file finished!
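For reference, the first line of BiModel.rm.txt holds the vocabulary size and the total token count; each following line then holds, for one word id, its unigram relative frequency, its smoothed back-off probability, and an (id, probability) pair for every word observed after it.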

ViterbiCWS.py: segments an input sentence by building a word graph (DAG) over it and running the Viterbi algorithm with the bigram model.
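The decoder scores a candidate segmentation $w_1 \dots w_k$ by the sum of log transition scores and searches for the best-scoring path through the word graph:

$$\hat{W} = \arg\max_{w_1 \dots w_k} \sum_{i=1}^{k+1} \log \mathrm{score}(w_{i-1}, w_i),$$

where $w_0$ and $w_{k+1}$ are the start and end anchors of the graph, and score is GetScore's back-off chain: the smoothed bigram probability when the pair was observed; the previous word's smoothed back-off probability when the pair is unseen or the current word is unknown; and 1.0 when the previous word itself is unknown.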

"""
    动态规划--维比特算法
"""

import sys
import math


class Node:
    def __init__(self, word):
        self.bestScore = 0.0     # best path score (sum of log probabilities) ending at this node
        self.bestPreNode = None  # back-pointer to the previous node on the best path
        self.len = len(word)     # word length in characters
        self.word = word


class BiLM:
    def __init__(self, lexiconFile, biLMFile):
        """
        初始化并载入单词-id映射字典、一元语言模型概率、二元语言模型概率
        :param lexiconFile: 字典:单词-id映射文件
        :param biLMFile: 二元语言模型
        """
        self.wordNum = 0
        self.wordIDTable = {}
        self.unigramProb = []       # per-word back-off probability (the smoothed column of the model file)
        self.bigramProb = []        # per-word dict mapping successor id -> bigram probability
        self.unknownWordProb = 1.0  # score for OOV words (log 1.0 = 0, i.e. no penalty)

        # read the word-id mapping file
        infile = open(lexiconFile, 'r', encoding='utf-8')
        sline = infile.readline().strip()
        self.maxWordLen = 1
        while len(sline) > 0:
            items = sline.split(' ')
            if len(items) != 2:
                print("Lexicon format error!")
                sline = infile.readline().strip()
                continue
            self.wordIDTable[items[0]] = int(items[1])
            if len(items[0]) > self.maxWordLen:
                self.maxWordLen = len(items[0])
            sline = infile.readline().strip()
        infile.close()

        # read the bigram LM file; after the header line, each line holds: the unigram probability
        # of one word id, its smoothed back-off probability, then (id, probability) pairs for each
        # observed successor
        infile = open(biLMFile, 'r', encoding='utf-8')
        sline = infile.readline().strip()
        items = sline.split(' ')
        if len(items) == 2:
            self.wordNum = int(items[0])
        else:
            print("Bad format found in LM file!")
            sys.exit()
        sline = infile.readline().strip()
        # allocate storage for the unigram and bigram probabilities (index 0 is unused)
        for i in range(len(self.wordIDTable)):
            self.unigramProb.append(0.0)
            self.bigramProb.append({})
        self.unigramProb.append(0.0)
        self.bigramProb.append({})
        wid = 1
        while len(sline) > 0:
            items = sline.split(' ')
            self.unigramProb[wid] = float(items[1])  # items[1] is the smoothed back-off probability
            i = 2
            while i < len(items):
                self.bigramProb[wid][int(items[i])] = float(items[i + 1])
                i += 2
            sline = infile.readline().strip()
            wid += 1
        infile.close()
        print(len(self.wordIDTable), "words loaded")

    def GetScore(self, word1, word2):
        """
        Score one transition in the word graph.
        :param word1: the previous word
        :param word2: the current word
        :return: the bigram probability if both words are known and the pair was observed,
                 otherwise a back-off value
        """
        # the previous word is out of vocabulary
        if word1 not in self.wordIDTable:
            return self.unknownWordProb
        wid1 = self.wordIDTable[word1]
        # the current word is out of vocabulary
        if word2 not in self.wordIDTable:
            return self.unigramProb[wid1]
        wid2 = self.wordIDTable[word2]
        # the pair was never observed as a bigram: back off to the smoothed probability
        if wid2 not in self.bigramProb[wid1]:
            return self.unigramProb[wid1]
        # both words are known and the bigram was observed
        return self.bigramProb[wid1][wid2]


def CreateGraph(s):
    """
    构建有向无环图(DAG词图)
    :param s:句子
    :return:
    """
    WordGraph = []
    # anchor the sentence with a start marker and an end marker; each inner index holds the
    # list of candidate words ending at that character position
    # Start Node
    newNode = Node("")
    newNodeList = []
    newNodeList.append(newNode)
    WordGraph.append(newNodeList)

    for i in range(len(s)):
        WordGraph.append([])

    # End Node
    newNode = Node(" ")
    newNodeList = []
    newNodeList.append(newNode)
    WordGraph.append(newNodeList)

    # Other nodes
    for i in range(len(s)):
        j = myBiLM.maxWordLen   # longest word length in the lexicon
        if i + j > len(s):
            j = len(s) - i
        while j > 0:
            # if the substring s[i:i+j] is in the lexicon, add a node for it at its end position
            if s[i:i + j] in myBiLM.wordIDTable:
                newNode = Node(s[i:i + j])
                WordGraph[i + j].append(newNode)
            j -= 1
        if len(WordGraph[i + 1]) < 1:
            print("Unknown character found!", i, s[i])
            sys.exit()
    return WordGraph


def ViterbiSearch(WordGraph):
    """
    Dynamic programming -- the Viterbi algorithm.
    :param WordGraph: the word DAG, as a list of node lists
    :return:
    """
    for i in range(len(WordGraph) - 1):
        # visit every candidate word ending at this position
        for curNode in WordGraph[i + 1]:
            # index of the position where this word starts
            preLevel = i + 1 - curNode.len
            if preLevel < 0:
                print("running error!")
                sys.exit()
            # initialize with the first predecessor
            preNode = WordGraph[preLevel][0]
            score = myBiLM.GetScore(preNode.word, curNode.word)
            score = preNode.bestScore + math.log(score)
            maxScore = score
            curNode.bestScore = score
            curNode.bestPreNode = preNode
            # try the remaining predecessors and keep the best-scoring one
            for j in range(1, len(WordGraph[preLevel])):
                preNode = WordGraph[preLevel][j]
                score = myBiLM.GetScore(preNode.word, curNode.word)
                score = preNode.bestScore + math.log(score)
                if score > maxScore:
                    maxScore = score
                    curNode.bestScore = score
                    curNode.bestPreNode = preNode



def BackSearch(WordGraph):
    """
    Recover the best segmentation by following the back-pointers from the end node.
    :param WordGraph: the word DAG after ViterbiSearch has run
    :return: the list of words on the best path
    """
    resultList = []
    curNode = WordGraph[len(WordGraph) - 1][0].bestPreNode
    while curNode.bestPreNode is not None:
        resultList.insert(0, curNode.word)
        curNode = curNode.bestPreNode
    return resultList


LexiconFile = "../data/WordDic.rm.txt"
BiLMFile = "../data/BiModel.rm.txt"
myBiLM = BiLM(LexiconFile, BiLMFile)

inputStr = u"南京市长江大桥"

# build the word graph (DAG)
WordGraph = CreateGraph(inputStr)

for NodeList in WordGraph:
    for node in NodeList:
        print("CurNode Word: ", node.word)

# Viterbi search for the best path
ViterbiSearch(WordGraph)

# backtrace to recover the segmentation
resultList = BackSearch(WordGraph)
for word in resultList:
    print(word)
Output:

25005 words loaded
CurNode Word:  
CurNode Word:  南
CurNode Word:  南京
CurNode Word:  京
CurNode Word:  南京市
CurNode Word:  市
CurNode Word:  市长
CurNode Word:  长
CurNode Word:  长江
CurNode Word:  江
CurNode Word:  大
CurNode Word:  大桥
CurNode Word:  桥
CurNode Word:   
南京市
长江
大桥
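For convenience, the three steps can be chained into a single helper. The sketch below (the segment function is ours, not part of the original script) simply reuses the functions defined above:

def segment(sentence):
    graph = CreateGraph(sentence)  # build the word DAG over the sentence
    ViterbiSearch(graph)           # fill in best scores and back-pointers
    return BackSearch(graph)       # follow the back-pointers to recover the word list

print(' '.join(segment(u"南京市长江大桥")))  # prints: 南京市 长江 大桥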