本程序涉及对整段文本的机器学习:通过学习语料,根据词频计算每个单词的先验概率;当输入一个拼写错误的单词时,在编辑距离最近的候选词中选择先验概率最高的词作为结果。
import re,collections
def words(text):
    """Return the lowercase alphabetic tokens of *text*, in order.

    The text is lowercased first; every maximal run of the letters a-z is
    then one token, so digits, punctuation and whitespace act as separators.
    """
    lowered = text.lower()
    token_pattern = re.compile('[a-z]+')
    return token_pattern.findall(lowered)
def train(features):
    """Build a smoothed frequency model from an iterable of features.

    Returns a ``collections.defaultdict`` whose default value is 1.  Each
    feature seen in *features* is stored with (number of occurrences + 1);
    indexing an unseen feature yields 1.
    """
    tally = collections.Counter(features)
    model = collections.defaultdict(lambda: 1)
    # Stored value is occurrences + 1: the count starts from the default of 1.
    model.update((feature, seen + 1) for feature, seen in tally.items())
    return model
# Word-frequency model learned from the training corpus 'big.txt'.
# The `with` block closes the file as soon as its text has been read,
# instead of leaving the handle open until garbage collection.
with open('big.txt') as corpus:
    NW = train(words(corpus.read()))
# Letters that may be inserted or substituted when generating edits.
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """Return the set of strings reachable from *word* by one simple edit.

    The edits are: deleting one character, transposing two adjacent
    characters, replacing one character with a letter of ``alphabet``, and
    inserting a letter of ``alphabet`` at any position.  Because a character
    may be replaced by itself, a non-empty *word* is part of the result.
    """
    # Every way of cutting the word into a (head, tail) pair.
    pieces = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]

    removed = {head + tail[1:] for head, tail in pieces if tail}
    swapped = {
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in pieces
        if len(tail) > 1
    }
    substituted = {
        head + letter + tail[1:]
        for head, tail in pieces
        if tail
        for letter in alphabet
    }
    added = {head + letter + tail for head, tail in pieces for letter in alphabet}

    return removed | swapped | substituted | added
def edits2(word):
    """Return the known words that are two edits away from *word*.

    Applies ``edits1`` to every result of ``edits1(word)`` and keeps only
    the strings that are present in the trained model ``NW``.
    """
    reachable = set()
    for first_step in edits1(word):
        for second_step in edits1(first_step):
            # Keep only words that appear in the training corpus.
            if second_step in NW:
                reachable.add(second_step)
    return reachable
def known(words):
    """Return the subset of *words* present in the trained model ``NW``, as a set."""
    return set(filter(lambda token: token in NW, words))
def correct(word):
    """Return the most likely spelling correction for *word*.

    Candidate groups are tried in order of increasing edit distance: the
    word itself if it is in the model, otherwise known words one edit away,
    otherwise known words two edits away.  From the first non-empty group,
    the word with the highest count in ``NW`` is returned.

    If no known word lies within two edits, *word* is returned unchanged
    instead of letting ``max()`` raise ``ValueError`` on an empty set.
    """
    candidates = known([word]) or known(edits1(word)) or known(edits2(word))
    if not candidates:
        # Nothing in the model is close enough; hand the input back as-is.
        # Returning early also avoids indexing NW (a defaultdict) with an
        # unseen word, which would insert that word into the model.
        return word
    # Pick the candidate with the highest learned frequency.
    return max(candidates, key=lambda w: NW[w])
if __name__ == '__main__':
    # Prompt for a word and print its correction.  The result was previously
    # computed and discarded, so the script produced no output.  The guard
    # keeps the prompt from firing when this module is imported.
    # NOTE: raw_input is the Python 2 built-in.
    print(correct(raw_input('input')))