过程不再赘述,用的是动态规划的方法。详细说明有空再补充,这里直接贴代码。
代码有些地方可以继续优化。如果有不合理之处,请多多指教。
可供参考,禁止抄袭,大家一起学习,一起进步
import copy
class Model:
    """BMES character tagger used to segment a sentence into words.

    Statistics are collected from a training corpus that holds one
    ``char|tag`` record per line, where ``tag`` is one of ``B``, ``M``,
    ``E`` or ``S``.  A dynamic-programming search then assigns one tag to
    every character of the sentence.

    Lines that are blank, have no ``|`` separator, or carry an unknown tag
    are ignored by every statistic, but they still separate their
    neighbours (adjacency is always measured in corpus lines).
    """

    def __init__(self, sentence, corpus_path="./data/msr_training.txt"):
        """Store the sentence and open the training corpus.

        Args:
            sentence: Text to segment; it is split into single characters.
            corpus_path: Location of the ``char|tag`` training file.

        Raises:
            OSError: If the corpus file cannot be opened.
        """
        self.sentence = list(sentence)
        self.state = ['B', 'M', 'E', 'S']
        # Corpus occurrences of the character at each sentence position.
        # Starts at zero so the counts double as unbiased denominators.
        self.count_one = [0] * len(self.sentence)
        # Closed by Cutsentence(); callers that only use the statistics
        # methods are responsible for closing it themselves.
        self.fp = open(corpus_path, 'r', encoding="UTF-8")
        # Per position: share of each tag among the character's occurrences.
        self.word_P = [[0, 0, 0, 0] for _ in self.sentence]
        # Raw (un-normalised) copy of the per-position tag counts.
        self.word_P_fuben = None
        # char -> 4 x [previous-neighbour ratio, next-neighbour ratio].
        self.Dist_1 = None
        # char -> 4 x 4 ratios: tag of the character -> tag of the next line.
        self.Dist_2 = None

    @staticmethod
    def _ratio(count, total):
        """Return ``count / total``, or 0.0 when the character never occurred."""
        return count / total if total else 0.0

    def _read_records(self):
        """Parse the corpus into one entry per line.

        Returns:
            A list with ``(char, tag_index)`` for every well-formed line and
            ``None`` for every other line.  ``char`` is replaced by ``None``
            when the character is not part of the sentence, because such
            lines only matter for their tag; identical tuples are shared so
            the list costs one reference per corpus line.
        """
        wanted = set(self.sentence)
        pool = {}
        records = []
        self.fp.seek(0)
        for raw in self.fp:
            parts = raw.strip().split("|")
            if len(parts) < 2 or parts[1] not in self.state:
                records.append(None)
                continue
            key = (
                parts[0] if parts[0] in wanted else None,
                self.state.index(parts[1]),
            )
            records.append(pool.setdefault(key, key))
        return records

    def statisticsP(self, records=None):
        """Count how often each sentence character carries each tag.

        Fills ``count_one``, ``word_P_fuben`` (raw counts) and ``word_P``
        (counts divided by ``row total + 0.1``).  All three are rebuilt from
        scratch, so calling the method again does not accumulate.

        Args:
            records: Optional pre-parsed corpus from ``_read_records``;
                the corpus file is read when omitted.
        """
        if records is None:
            records = self._read_records()
        size = len(self.sentence)
        # A repeated character must update every position it occupies.
        positions = {}
        for pos, char in enumerate(self.sentence):
            positions.setdefault(char, []).append(pos)

        raw = [[0, 0, 0, 0] for _ in range(size)]
        totals = [0] * size
        for record in records:
            if record is None:
                continue
            char, tag = record
            for pos in positions.get(char, ()):
                raw[pos][tag] += 1
                totals[pos] += 1

        self.count_one = totals
        self.word_P_fuben = copy.deepcopy(raw)
        # The 0.1 keeps the division defined for characters never seen.
        self.word_P = [[c / (sum(row) + 0.1) for c in row] for row in raw]

    def statisticsW(self, records=None):
        """Measure how often the sentence neighbours also appear in the corpus.

        For a corpus line holding a sentence character with tag ``t``,
        ``Dist_1[char][t][0]`` counts the lines whose previous line holds the
        sentence's previous character with the same tag ``t``, and
        ``Dist_1[char][t][1]`` does the same for the next line and next
        character.  Counts are divided by the character's corpus frequency.

        Args:
            records: Optional pre-parsed corpus from ``_read_records``;
                the corpus file is read when omitted.
        """
        if records is None:
            records = self._read_records()
        # NOTE: Dist_1 is keyed by character, so a repeated character uses
        # the neighbours of its first occurrence in the sentence.
        first_index = {}
        for pos, char in enumerate(self.sentence):
            first_index.setdefault(char, pos)
        counts = {char: [[0, 0] for _ in range(4)] for char in first_index}
        totals = dict.fromkeys(first_index, 0)
        last = len(self.sentence) - 1
        line_count = len(records)

        for j, record in enumerate(records):
            if record is None or record[0] is None:
                continue
            char, tag = record
            totals[char] += 1
            pos = first_index[char]
            before = self.sentence[pos - 1] if pos > 0 else None
            after = self.sentence[pos + 1] if pos < last else None
            # Tags are compared index to index; comparing the tag text with
            # an index could never match.
            # NOTE(review): requiring the neighbour to carry the *same* tag
            # follows the condition as originally written - confirm intent.
            # `j > 0` stops the first line from wrapping to the last one.
            if before is not None and j > 0 and records[j - 1] == (before, tag):
                counts[char][tag][0] += 1
            if (
                after is not None
                and j + 1 < line_count
                and records[j + 1] == (after, tag)
            ):
                counts[char][tag][1] += 1

        self.Dist_1 = {
            char: [[self._ratio(v, totals[char]) for v in pair] for pair in rows]
            for char, rows in counts.items()
        }

    def statisticsneststate(self, records=None):
        """Count tag transitions from each sentence character to the next line.

        ``Dist_2[char][t][u]`` is the number of corpus lines where ``char``
        has tag ``t`` and the following line has tag ``u``, divided by the
        character's corpus frequency.  A following line that is blank or
        malformed contributes nothing.

        Args:
            records: Optional pre-parsed corpus from ``_read_records``;
                the corpus file is read when omitted.
        """
        if records is None:
            records = self._read_records()
        chars = dict.fromkeys(self.sentence)
        counts = {char: [[0, 0, 0, 0] for _ in range(4)] for char in chars}
        totals = dict.fromkeys(chars, 0)
        line_count = len(records)

        for j, record in enumerate(records):
            if record is None or record[0] is None:
                continue
            char, tag = record
            totals[char] += 1
            following = records[j + 1] if j + 1 < line_count else None
            if following is not None:
                counts[char][tag][following[1]] += 1

        self.Dist_2 = {
            char: [[self._ratio(v, totals[char]) for v in row] for row in rows]
            for char, rows in counts.items()
        }

    def _decode(self):
        """Return the best tag for every character, or [] for an empty sentence."""
        if not self.sentence:
            return []
        tags = range(4)
        first = self.sentence[0]
        scores = [[self.word_P[0][t] + sum(self.Dist_1[first][t]) for t in tags]]
        back = [[0, 0, 0, 0]]
        for pos in range(1, len(self.sentence)):
            trans = self.Dist_2[self.sentence[pos - 1]]
            context = self.Dist_1[self.sentence[pos]]
            previous = scores[-1]
            row = []
            pointers = []
            for tag in tags:
                candidates = [
                    trans[z][tag] * previous[z] + sum(context[tag]) + self.word_P[pos][tag]
                    for z in tags
                ]
                best = max(candidates)
                row.append(best)
                # Ties resolve to the lowest tag index.
                pointers.append(candidates.index(best))
            scores.append(row)
            back.append(pointers)

        # Walk the back-pointers from the best final tag to the first one.
        current = scores[-1].index(max(scores[-1]))
        path = []
        for pointers in reversed(back):
            path.append(self.state[current])
            current = pointers[current]
        path.reverse()
        return path

    def Cutsentence(self):
        """Train on the corpus, tag the sentence and print ``char / tag`` pairs.

        The corpus is parsed once and shared by the three statistics passes.
        The corpus handle is closed afterwards, even if training fails, so
        the method is meant to be called once per instance.
        """
        print("正在训练.....")
        try:
            records = self._read_records()
            self.statisticsP(records)
            self.statisticsW(records)
            self.statisticsneststate(records)
        finally:
            self.fp.close()
        res_path = self._decode()
        print("分词结果:")
        for char, tag in zip(self.sentence, res_path):
            print(char, "/", tag, end=" ")
# Usage example: build a Model for a sample sentence and run Cutsentence(),
# which prints its output to stdout.
# NOTE(review): assumes the class above is saved as a module named
# Cutwordmodel and that this snippet lives in a separate script - confirm.
import Cutwordmodel
model=Cutwordmodel.Model("中国的经济发展较快速")
model.Cutsentence()