1. Data Description
- RenMinData.txt
Pre-segmented People's Daily corpus: one sentence per line, words separated by spaces.
2. Code Description
CreateLexicon.py
: Converts tokens to ids and generates two files: RenMinData.id.txt and WordDic.rm.txt.
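For orientation, here is a hypothetical illustration of the three file formats (the words are taken from the test sentence used later in this section; the ids are made up):

RenMinData.txt    : 南京市 长江 大桥
RenMinData.id.txt : 137 2041 96
WordDic.rm.txt    : 南京市 137
                    长江 2041
                    大桥 96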
"""
tocken转化为id
1. 构建word-id映射字典
2. 构造id文件
3. 构造word-id文件
"""
import sys
rawDataFlie = "../data/RenMinData.txt"
idDataFile = "../data/RenMinData.id.txt"
wordDictFile = "../data/WordDic.rm.txt"
id = 1
wordIdDict = {}
# 构建wordIdDict映射字典
inputFile = open(rawDataFlie, 'r', encoding='utf-8')
s = inputFile.readline().strip()
while len(s) > 0:
for word in s.split(' '):
if word not in wordIdDict:
wordIdDict[word] = id
id += 1 # 生成id---累加
s = inputFile.readline().strip()
inputFile.close()
print("Reading raw data file finished!")
print("Total number of words:", len(wordIdDict))
# 生成id文件
inputFile = open(rawDataFlie, 'r', encoding='utf-8') # 重新读取,使得指针指向开头
outputFile = open(idDataFile, 'w')
s = inputFile.readline().strip()
while len(s) > 0:
words = s.split(' ')
for i in range(len(words) - 1):
if words[i] not in wordIdDict:
print("OOV word found!")
# 否则,将对应的单词编号写到id文件中
else:
outputFile.write(str(wordIdDict[words[i]]))
outputFile.write(' ')
word = words[len(words) - 1]
if word not in wordIdDict:
print("OOV word found!")
else:
outputFile.write(str(wordIdDict[word]))
# 一句话处理完后换行
outputFile.write("\n")
s = inputFile.readline().strip()
inputFile.close()
outputFile.close()
print("write id data file finished!")
# 构造word-id文件
outputFile = open(wordDictFile, 'w', encoding='utf-8')
for word in wordIdDict.keys():
# outputFile.write(word.encode('gb2312'))
outputFile.write(word)
outputFile.write(' ')
outputFile.write(str(wordIdDict[word]))
outputFile.write('\n')
outputFile.close()
print('write word id table file finished')
Output:
Reading raw data file finished!
Total number of words: 25005
write id data file finished!
write word id table file finished
BiLMTrain.py
: Trains the bigram language model and generates the model file BiModel.rm.txt.
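The probability estimates below use add-one (Laplace) smoothing. Reading the computation straight off the code, with V the vocabulary size, N the total (non-unique) token count, count(w1) the unigram count, and count(w1 w2) the bigram count, the script stores for every word w1:

P(w1)        = count(w1) / N                          (unigram probability)
P_smooth(w1) = 1 / (count(w1) + V)                    (backoff for unseen successors of w1)
P(w2 | w1)   = (count(w1 w2) + 1) / (count(w1) + V)   (smoothed bigram probability)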
"""
训练二元语言模型:
1.将word-id文件内容加载到字典中
2. Unigram与Bigram计数
3. 计算二元语言模型概率
"""
import sys
idDataFile = "../data/RenMinData.id.txt"
wordDictFile = "../data/WordDic.rm.txt"
biModelFile = "../data/BiModel.rm.txt"
wordIdDict = {} # word-id对
BigramTableList = [] # 二元概率
UnigramCountList = [] # 一元概率
SmoothedProbList = [] # 平滑概率
TotalNum = 0 # 总词数(不去重)
# 1.将word-id文件内容加载到字典中
inputFile = open(wordDictFile, 'r', encoding='utf-8')
s = inputFile.readline().strip()
while len(s) > 0:
words = s.split(' ')
if words[0] not in wordIdDict:
wordIdDict[words[0]] = int(words[1])
s = inputFile.readline().strip()
inputFile.close()
print('Reading word id file finished!')
print('Total number of words:', len(wordIdDict))
# 2.Unigram与Bigram计数
for i in range(len(wordIdDict) + 1):
BigramTableList.append({})
UnigramCountList.append(0)
SmoothedProbList.append(0)
inputFile = open(idDataFile, 'r', encoding='utf-8')
s = inputFile.readline().strip()
while len(s) > 0:
wordList = []
words = s.split(' ')
TotalNum += len(words)
for word in words:
wordList.append(int(word))
# 统计Unigram
for word in wordList:
UnigramCountList[word] += 1
# 统计Bigram
for i in range(len(wordList) - 1):
tmpDict = BigramTableList[wordList[i]]
if wordList[i + 1] not in tmpDict:
tmpDict[wordList[i + 1]] = 1
else:
tmpDict[wordList[i + 1]] += 1
# 迭代
s = inputFile.readline().strip()
inputFile.close()
print("Reading id data file finished!")
# 3.计算二元语言模型概率
for wid in range(1, len(wordIdDict) + 1):
SmoothedProbList[wid] = 1 / float(UnigramCountList[wid] + len(wordIdDict))
ht = BigramTableList[wid]
for wid1 in ht.keys():
ht[wid1] = float(ht[wid1] + 1) / float(UnigramCountList[wid] + len(wordIdDict))
UnigramCountList[wid] = float(UnigramCountList[wid]) / float(TotalNum)
# 4.保存计算结果
outputFile = open(biModelFile, 'w')
outputFile.write(str(len(wordIdDict)) + " " + str(TotalNum) + '\n')
for wid1 in range(1, len(wordIdDict) + 1):
outputFile.write(str(UnigramCountList[wid1]) + ' ')
outputFile.write(str(SmoothedProbList[wid1]))
ht = BigramTableList[wid1]
for wid2 in ht.keys():
outputFile.write(" " + str(wid2) + " " + str(ht[wid2]))
outputFile.write('\n')
outputFile.close()
print('Writing model file finished!')
Output:
Reading word id file finished!
Total number of words: 25005
Reading id data file finished!
Writing model file finished!
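For reference, the resulting BiModel.rm.txt looks like the sketch below; every number except the vocabulary size 25005 is made up for illustration. Line 1 holds the vocabulary size and the total token count; each following line corresponds to word id 1, 2, ... and holds P(w), P_smooth(w), then pairs of successor id and smoothed bigram probability.

25005 1234567
0.00081 3.9e-05 12 7.8e-05 8751 3.9e-05
...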
ViterbiCWS.py
: Segments a sentence with the Viterbi algorithm over a word lattice, using the trained bigram model.
"""
动态规划--维比特算法
"""
import sys
import math
class Node:
def __init__(self, word):
self.bestScore = 0.0
self.bestPreNode = None
self.len = len(word)
self.word = word
class BiLM:
def __init__(self, lexiconFile, biLMFile):
"""
初始化并载入单词-id映射字典、一元语言模型概率、二元语言模型概率
:param lexiconFile: 字典:单词-id映射文件
:param biLMFile: 二元语言模型
"""
self.wordNum = 0
self.wordIDTable = {}
self.unigramProb = [] # 一元语言模型概率
self.bigramProb = [] # 二元语言模型概率
self.unknownWordProb = 1.0
# 读取单词-id映射文件
infile = open(lexiconFile, 'r', encoding='utf-8')
sline = infile.readline().strip()
self.maxWordLen = 1
while len(sline) > 0:
# sline = sline.decode("gb2312")
items = sline.split(' ')
if len(items) != 2:
print("Lexicon format error!")
sline = infile.readline().strip()
continue
self.wordIDTable[items[0]] = int(items[1])
if len(items[0]) > self.maxWordLen:
self.maxWordLen = len(items[0])
sline = infile.readline().strip()
infile.close()
# 读取二元语言模型文件;每行分别是:每行索引作为单词id对应的一元语言模型概率;平滑概率;二元语言模型对应id及其对应二元语言模型的概率
infile = open(biLMFile, 'r')
sline = infile.readline().strip()
items = sline.split(' ')
if len(items) == 2:
self.wordNum = int(items[0])
else:
print("Bad format found in LM file!")
sys.exit()
sline = infile.readline().strip()
# 构建一元与二元语言模型的存储格式
for i in range(len(self.wordIDTable)):
self.unigramProb.append(0.0)
self.bigramProb.append({})
self.unigramProb.append(0.0)
self.bigramProb.append({})
wid = 1
while len(sline) > 0:
items = sline.split(' ')
self.unigramProb[wid] = float(items[1])
i = 2
while i < len(items):
self.bigramProb[wid][int(items[i])] = float(items[i + 1])
i += 2
sline = infile.readline().strip()
wid += 1
infile.close()
print(len(self.wordIDTable), "words loaded")
def GetScore(self, word1, word2):
"""
计算该步骤的分数
:param word1: 前一步的单词
:param word2: 后一步的单词
:return: 如果两个单词都在词典中,则返回Bigram的概率
"""
wid1 = -1
wid2 = -1
# 这个单词是未登录词
if word1 not in self.wordIDTable:
return self.unknownWordProb
wid1 = self.wordIDTable[word1]
# 这个单词后一步单词是未登录词
if word2 not in self.wordIDTable:
return self.unigramProb[wid1]
wid2 = self.wordIDTable[word2]
# 如果在二元语言模型中,这两个单词不是前后关系
if wid2 not in self.bigramProb[wid1]:
return self.unigramProb[wid1]
# 如果两个单词都在词典中且为前后关系,则返回Bigram的概率
return self.bigramProb[wid1][wid2]
def CreateGraph(s):
"""
构建有向无环图(DAG词图)
:param s:句子
:return:
"""
WordGraph = []
# 在一个句子首尾部分用标记做锚定,并且为真实文字留有空间,每个索引背后都是列表,列表中存着当前时刻的几种选择
# Start Node
newNode = Node("")
newNodeList = []
newNodeList.append(newNode)
WordGraph.append(newNodeList)
for i in range(len(s)):
WordGraph.append([])
# End Node
newNode = Node(" ")
newNodeList = []
newNodeList.append(newNode)
WordGraph.append(newNodeList)
# Other nodes
for i in range(len(s)):
j = myBiLM.maxWordLen # 最大单词长度
if i + j > len(s):
j = len(s) - i
while j > 0:
# 判断当前词是否在字典中,如果在的话,则将这个词追加到wordGraph对应索引的列表中
if s[i:i + j] in myBiLM.wordIDTable:
newNode = Node(s[i:i + j])
WordGraph[i + j].append(newNode)
j -= 1
if len(WordGraph[i + 1]) < 1:
print("Unknown character found!", i, s[i])
sys.exit()
return WordGraph
def ViterbiSearch(WordGraph):
"""
动态规划---维特比算法
:param WordGraph: DAG有环无环图,以列表形式显示
:return:
"""
for i in range(len(WordGraph) - 1):
# 遍历每一步的所有选择
for curNode in WordGraph[i + 1]:
# WordGraph中前一步对应的索引
preLevel = i + 1 - curNode.len
if preLevel < 0:
print("running error!")
sys.exit()
# 得到前一步的节点
preNode = WordGraph[preLevel][0]
# 计算前后两个词的分数
score = myBiLM.GetScore(preNode.word, curNode.word)
score = preNode.bestScore + math.log(score)
maxScore = score
curNode.bestPreNode = preNode
for j in range(1, len(WordGraph[preLevel])):
preNode = WordGraph[preLevel][j]
score = myBiLM.GetScore(preNode.word, curNode.word)
score = preNode.bestScore + math.log(score)
if score > maxScore:
curNode.bestScore = score
curNode.bestPreNode = preNode
def BackSearch(WordGraph):
"""
基于有向无环图进行回溯
:param WordGraph:
:return:
"""
resultList = []
curNode = WordGraph[len(WordGraph) - 1][0].bestPreNode
while curNode.bestPreNode != None:
resultList.insert(0, curNode.word)
curNode = curNode.bestPreNode
return resultList
LexiconFile = "../data/WordDic.rm.txt"
BiLMFile = "../data/BiModel.rm.txt"
myBiLM = BiLM(LexiconFile, BiLMFile)
inputStr = u"南京市长江大桥"
# 构建有向无环图(DAG词图)
WordGraph = CreateGraph(inputStr)
for NodeList in WordGraph:
for Node in NodeList:
print("CurNode Word: ", Node.word)
# 维比特算法寻优
ViterbiSearch(WordGraph)
# 回溯
resultList = BackSearch(WordGraph)
for word in resultList:
print(word, )
Output:
25005 words loaded
CurNode Word:
CurNode Word: 南
CurNode Word: 南京
CurNode Word: 京
CurNode Word: 南京市
CurNode Word: 市
CurNode Word: 市长
CurNode Word: 长
CurNode Word: 长江
CurNode Word: 江
CurNode Word: 大
CurNode Word: 大桥
CurNode Word: 桥
CurNode Word:
南京市
长江
大桥
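The three steps above compose naturally into one call. Below is a minimal convenience wrapper, not part of the original script, assuming CreateGraph, ViterbiSearch, BackSearch, and the global myBiLM are in scope as defined above:

def segment(sentence):
    """Segment one sentence with the bigram Viterbi pipeline defined above."""
    graph = CreateGraph(sentence)   # build the word lattice
    ViterbiSearch(graph)            # fill in best scores and back-pointers
    return BackSearch(graph)        # recover the best path

print(' '.join(segment("南京市长江大桥")))  # expected: 南京市 长江 大桥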