import numpy as np
# Vocabulary: the set of valid words, read one per line from vocab.txt.
# The file name is written without a trailing space: a name ending in a blank
# is silently accepted on Windows but does not resolve on POSIX systems.
with open('./vocab.txt') as vocab_file:
    vocab = {entry.rstrip() for entry in vocab_file}
# 生成所有候选集
def generate_candidates(word, vocabulary=None):
    """Return the valid words that are one edit away from ``word``.

    Candidates are built with three single-character operations: inserting a
    lowercase ASCII letter, deleting a character, and replacing a character
    with a lowercase ASCII letter.  Edit distance 2 could be reached by
    applying the same step to each result; that is intentionally not done.

    Args:
        word: The input string (normally a misspelled word).
        vocabulary: Container of valid words used to filter the candidates.
            Defaults to the module-level ``vocab``.

    Returns:
        A sorted list of the distinct candidates present in the vocabulary.
        The list is empty when no candidate is a valid word.
    """
    if vocabulary is None:
        vocabulary = vocab

    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word in two, including the empty prefix/suffix.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    # insert: put one letter at the cut point
    inserts = [left + c + right for left, right in splits for c in letters]
    # delete: drop the first character of the right part
    deletes = [left + right[1:] for left, right in splits if right]
    # replace: swap the first character of the right part for another letter
    replaces = [left + c + right[1:]
                for left, right in splits if right
                for c in letters]

    candidates = set(inserts + deletes + replaces)
    # Sorting makes the result independent of set iteration order, so callers
    # that break ties by position behave the same on every run.
    return sorted(cand for cand in candidates if cand in vocabulary)
from nltk.corpus import reuters
# Load the corpus that the language model is estimated from.
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

# Build the bigram language model.
#   term_count[w]         -> number of bigrams whose first word is w
#   bigram_count["w1 w2"] -> number of times w1 is directly followed by w2
term_count = {}
bigram_count = {}
for sentence in corpus:
    tokens = ['<s>'] + sentence  # prepend the sentence-start marker
    # Walk over every adjacent pair (tokens[i], tokens[i + 1]).
    for first, second in zip(tokens, tokens[1:]):
        term_count[first] = term_count.get(first, 0) + 1
        pair = ' '.join([first, second])
        bigram_count[pair] = bigram_count.get(pair, 0) + 1
# Channel model: channel_prob[correct][mistake] is the probability that the
# user typed `mistake` when `correct` was intended.  Each listed misspelling
# of a word receives the same share of the probability mass.
# NOTE(review): lines are assumed to look like "correct: mis1, mis2, ..." --
# confirm against spell-errors.txt.
channel_prob = {}
# The file name is written without a trailing space: a name ending in a blank
# does not resolve on POSIX systems.
with open('./spell-errors.txt') as errors_file:
    for line in errors_file:
        items = line.split(':')
        if len(items) < 2:
            # No "correct: mistakes" separator (e.g. a blank line); skip it.
            continue
        correct = items[0].strip()
        # Drop empty entries so a stray comma does not create a '' misspelling
        # and dilute the probabilities of the real ones.
        mistakes = [item.strip() for item in items[1].strip().split(',')
                    if item.strip()]
        channel_prob[correct] = {}
        for mis in mistakes:
            channel_prob[correct][mis] = 1.0 / len(mistakes)
# Number of distinct terms in the language model, used for add-one smoothing.
V = len(term_count)

# For every out-of-vocabulary word in the test sentences, score each candidate
# correction c with log p(word | c) + log p(c | previous token) and print the
# best one next to the original word.
# NOTE(review): each line of testdata.txt is assumed to be tab-separated with
# the sentence in the third field -- confirm against the data file.
with open('./testdata.txt', 'r') as test_file:
    for line in test_file:
        items = line.rstrip().split('\t')
        if len(items) < 3:
            # Blank or malformed line: there is no sentence field to check.
            continue
        tokens = items[2].split()
        for pos, word in enumerate(tokens):
            if word in vocab:
                continue
            candidates = generate_candidates(word)
            if len(candidates) < 1:
                continue
            # The bigram context is the preceding *token* (not a character of
            # the raw sentence string); '<s>' stands in at sentence start,
            # matching how the language model was built.
            prev = tokens[pos - 1] if pos > 0 else '<s>'
            probs = []
            for candi in candidates:
                prob = 0
                # a. channel probability log p(s|c)
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    # Unseen (candidate, misspelling) pair: small constant.
                    prob += np.log(0.00001)
                # b. language-model probability log p(c | prev), add-one
                #    smoothed.  bigram_count is keyed by "w1 w2" strings, so
                #    the key is built the same way here; unseen bigrams and
                #    unseen contexts fall back to a count of 0.
                bigram = prev + ' ' + candi
                prob += np.log((bigram_count.get(bigram, 0) + 1.0) /
                               (term_count.get(prev, 0) + V))
                probs.append(prob)
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
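# Spelling correction (title of the article this script was copied from).
# Page footer of that article: "latest recommended article published 2022-01-10 18:20:27".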