import re, collections
def words(text):
    """Return every lowercase word found in *text*, in order of appearance.

    The text is lowercased first; a "word" is a maximal run of the ASCII
    letters a-z, so digits, punctuation and any other characters act as
    separators and are dropped.
    """
    return re.findall(r'[a-z]+', text.lower())


def train(features):
    """Count how often each item of *features* occurs.

    Returns a ``collections.defaultdict`` whose default value is 1. Every
    count therefore starts at 1 rather than 0 (an item seen k times is
    stored as k + 1), so words that never appeared in the training data do
    not get a zero prior, which would force their overall probability to
    zero.
    """
    model = collections.defaultdict(lambda: 1)
    for f in features:
        # First access creates the entry with the default 1, then adds 1.
        model[f] += 1
    return model
# Word-frequency model, built once at import time from the corpus file
# 'big.txt' (resolved relative to the current working directory).
# The context manager closes the file as soon as it has been read instead of
# leaving the handle open until it happens to be garbage-collected.
with open('big.txt') as _corpus_file:
    NWORDS = train(words(_corpus_file.read()))
# Letters tried for alterations and insertions.
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    """Return the set of all strings exactly one simple edit away from *word*.

    A simple edit is one of:
      * deletion      - remove one character,
      * transposition - swap two adjacent characters,
      * alteration    - replace one character with a letter of ``alphabet``,
      * insertion     - insert a letter of ``alphabet`` at any position.

    The result can contain *word* itself, because altering a character to
    the letter it already is counts as an alteration.
    """
    n = len(word)
    deletions = [word[:i] + word[i + 1:] for i in range(n)]
    transpositions = [
        word[:i] + word[i + 1] + word[i] + word[i + 2:] for i in range(n - 1)
    ]
    alterations = [
        word[:i] + c + word[i + 1:] for i in range(n) for c in alphabet
    ]
    insertions = [
        word[:i] + c + word[i:] for i in range(n + 1) for c in alphabet
    ]
    return set(deletions + transpositions + alterations + insertions)


def known_edits2(word):
    """Return the known words that are two simple edits away from *word*.

    Applies ``edits1`` to every result of ``edits1(word)`` and keeps only the
    strings that are keys of ``NWORDS``.
    """
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)


def known(words):
    """Return the subset of *words* that are keys of ``NWORDS``."""
    return set(w for w in words if w in NWORDS)


def correct(word):
    """Return the most frequent known candidate correction for *word*.

    Candidates are tried in order of increasing edit distance: the word
    itself, then known words one edit away, then known words two edits away.
    The ``or`` chain stops at the first non-empty set, so closer matches
    always win over more distant ones. If nothing is known, *word* is
    returned unchanged.
    """
    candidates = (
        known([word])
        or known(edits1(word))
        or known_edits2(word)
        or [word]
    )
    # Use .get() rather than NWORDS[w]: indexing a defaultdict inserts missing
    # keys, which would silently add an unknown word to the model whenever the
    # [word] fallback is ranked. The default of 1 mirrors the count that
    # train() assigns to unseen words.
    return max(candidates, key=lambda w: NWORDS.get(w, 1))