一、递归的方法
import xlrd
import math
# Load the segmentation vocabulary: one entry per row, taken from the first
# column of the first sheet of the workbook.
# NOTE(review): xlrd 2.x only reads legacy .xls files -- confirm that the
# installed xlrd version can still open this .xlsx workbook.
dw = xlrd.open_workbook('中文词库.xlsx')
rows = dw.sheet_by_index(0).get_rows()
dic_words = [row[0].value for row in rows]
print(dic_words)
# Unigram probabilities for the demo vocabulary. They are turned into
# negative-log costs right below, so a lower value means a more likely word.
word_prob = {
    "北京": 0.03,
    "的": 0.08,
    "天": 0.005,
    "气": 0.005,
    "天气": 0.06,
    "真": 0.04,
    "好": 0.05,
    "真好": 0.04,
    "啊": 0.01,
    "真好啊": 0.02,
    "今": 0.01,
    "今天": 0.07,
    "课程": 0.06,
    "内容": 0.06,
    "有": 0.05,
    "很": 0.03,
    "很有": 0.04,
    "意思": 0.06,
    "有意思": 0.005,
    "课": 0.01,
    "程": 0.005,
    "经常": 0.08,
    "意见": 0.08,
    "意": 0.01,
    "见": 0.005,
    "有意见": 0.02,
    "分歧": 0.04,
    "分": 0.02,
    "歧": 0.005,
}

# Print the total probability mass of the table (before conversion).
print(sum(word_prob.values()))

# Replace every probability p with -log(p), rounded to two decimals, updating
# the same dict object in place.
word_prob.update(
    {token: round(-math.log(prob), 2) for token, prob in word_prob.items()}
)
def word_segment_naive(input_str, words):
    """Enumerate every way to split ``input_str`` into dictionary words.

    Args:
        input_str: The text to segment (a ``str``).
        words: The vocabulary. Any collection supporting ``in`` works; a
            list or tuple of hashable entries is converted to a hash-based
            lookup once per call.

    Returns:
        A list of segmentations, each a list of words whose concatenation
        equals ``input_str``. Segmentations are ordered by increasing length
        of the leading word, then recursively by the remainder. The empty
        string yields ``[[]]``; text that cannot be fully covered by the
        vocabulary yields ``[]``.
    """
    # A linear scan of a long word list on every prefix test dominates the
    # running time, so build a hash-based lookup once. Strings, bytes and
    # containers that are already hash-based are left untouched so their
    # ``in`` semantics stay exactly as before.
    if isinstance(words, (str, bytes, set, frozenset, dict)):
        vocab = words
    else:
        try:
            vocab = frozenset(words)
        except TypeError:
            # Unhashable entries: fall back to the caller's container.
            vocab = words

    length = len(input_str)
    # Segmentations of the suffix starting at each index. The empty suffix
    # has exactly one segmentation: the empty one.
    memo = {length: [[]]}

    def _segment_from(start):
        # Each suffix is solved once; without this cache the same suffix is
        # re-segmented for every prefix path that reaches it, which is
        # exponential even when the final answer is empty.
        cached = memo.get(start)
        if cached is not None:
            return cached
        found = []
        for end in range(start + 1, length + 1):
            head = input_str[start:end]
            if head in vocab:
                for tail in _segment_from(end):
                    # Concatenation builds a new list, so cached tails are
                    # never shared with the returned segmentations.
                    found.append([head] + tail)
        memo[start] = found
        return found

    return _segment_from(0)
# Enumerate all candidate segmentations of the sample sentence, then keep the
# one with the lowest total negative-log cost.
segments = word_segment_naive("北京的天气真好啊", dic_words)

# Cost charged for a word that has no entry in word_prob.
unknown_word_cost = round(-math.log(0.00001), 2)

best_segment, best_score = [], math.inf
for candidate in segments:
    total = 0
    for token in candidate:
        total += word_prob.get(token, unknown_word_cost)
    # Strict comparison: on a tie the earliest candidate is kept.
    if total < best_score:
        best_score, best_segment = total, candidate
print(best_segment)