# Step 1: Build the vocabulary (a word list found online or one you crawl yourself both work)
vocab = set([line.rstrip() for line in open('./vocab.txt')])
vocab
Output:
{'widths', 'truer', ...}
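# (Aside: if no vocab.txt is at hand, NLTK's bundled word list is one possible
# stand-in. This is a sketch, not part of the original project; its coverage
# differs from a crawled vocabulary, so downstream corrections may differ too.)
import nltk
nltk.download('words')
from nltk.corpus import words
vocab = set(w.lower() for w in words.words())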
# Step 2: Generate valid words at edit distance 1
# Define a function that generates all candidate words at edit distance 1
def generate_candidates(word):
    """
    word: the given (misspelled) input
    Returns all valid candidates (those found in vocab)
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # delete: drop one character (only when there is a character left to drop)
    deletes = [l + r[1:] for l, r in splits if r]
    # insert: add one character at any position
    inserts = [l + c + r for l, r in splits for c in letters]
    # replace: substitute one character (only when there is a character to substitute)
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    words = set(deletes + inserts + replaces)
    candidates = [w for w in words if w in vocab]
    return candidates
generate_candidates('apple')
['apples', 'apply', 'apple', 'ample']
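# (Aside: Step 5 below notes that when no edit-distance-1 candidate exists, it
# would be better to fall back to edit-distance-2 candidates. A minimal sketch;
# generate_edits and generate_candidates_ed2 are hypothetical helpers, not part
# of the original project:)
def generate_edits(word):
    """All strings (valid or not) at edit distance 1 from word."""
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    inserts = [l + c + r for l, r in splits for c in letters]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    return set(deletes + inserts + replaces)

def generate_candidates_ed2(word):
    """All valid candidates within edit distance 2 of word."""
    ed1 = generate_edits(word)
    ed2 = set(e2 for e1 in ed1 for e2 in generate_edits(e1))
    return [w for w in ed1 | ed2 if w in vocab]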
# Step 3: Build a language model (bigram) from a corpus
from nltk.corpus import reuters

categories = reuters.categories()
corpus = reuters.sents(categories=categories)

term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc  # sentence-start token, so the first word also has a left context
    for i in range(0, len(doc) - 1):
        term = doc[i]
        # join with a space so 'a'+'bc' and 'ab'+'c' cannot collide into the same key
        bigram = ' '.join(doc[i:i + 2])
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
print(bigram)  # sanity check: the last bigram counted
6 mln
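# (Aside: the add-one smoothed bigram probability that Step 5 computes inline
# can be wrapped in a small helper. bigram_prob is a hypothetical convenience
# function, assuming the term_count / bigram_count dictionaries built above:)
def bigram_prob(pre_word, word):
    """P(word | pre_word) with add-one (Laplace) smoothing."""
    V = len(term_count)
    bigram = ' '.join([pre_word, word])
    return (bigram_count.get(bigram, 0) + 1) / (term_count.get(pre_word, 0) + V)
# e.g. bigram_prob('<s>', 'The') gives the smoothed probability of 'The'
# starting a sentence.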
# Step 4: Estimate the probability of users' typos (the channel model)
# (In practice this comes from user logs: for each correct word, count the
# kinds and frequencies of its misspellings to get P(mistake1|correct),
# P(mistake2|correct), ...
# e.g. {'raining': {'rainning': 0.5, 'raning': 0.5}, ...}
# This project assumes every misspelling of a word is equally likely.)
channel_prob = {}
for line in open('spell-errors.txt'):
    item = line.split(':')
    correct = item[0].strip()
    mistakes = [misword.strip() for misword in item[1].strip().split(',')]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1 / len(mistakes)
{'raining': {'rainning': 0.5, 'raning': 0.5}, 'writings': {'writtings': 1.0}, 'disparagingly': {'disparingly': 1.0}, 'yellow': {'yello': 1.0}, 'four': {'forer': 0.2, 'fours': 0.2, 'fuore': 0.2, 'fore*5': 0.2, 'for*4': 0.2}, 'woods': {'woodes': 1.0}, 'hanging': {'haing': 1.0}, 'aggression': {'agression': 1.0}, 'looking': {'loking': 0.1, 'begining': 0.1, 'luing': 0.1, 'look*2': 0.1, 'locking': 0.1, 'lucking': 0.1, 'louk': 0.1, 'looing': 0.1, 'lookin': 0.1, 'liking': 0.1}, 'misdemeanors': {'misdameanors': 0.5, 'misdemenors': 0.5}, ...}
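# (Aside: the '*N' suffixes visible in the output above, e.g. 'fore*5', appear
# to be how spell-errors.txt records that a misspelling was observed N times.
# A sketch of a frequency-weighted alternative to the uniform assumption; the
# '*N' parsing is an assumption about the file format, not original code:)
channel_prob_weighted = {}
for line in open('spell-errors.txt'):
    correct, _, mistakes = line.partition(':')
    counts = {}
    for item in mistakes.strip().split(','):
        item = item.strip()
        if '*' in item:
            mis, _, n = item.partition('*')
            counts[mis] = int(n)  # observed frequency of this misspelling
        else:
            counts[item] = 1
    total = sum(counts.values())
    channel_prob_weighted[correct.strip()] = {mis: n / total for mis, n in counts.items()}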
# Step 5: Correct the misspelled words in the test data.
import numpy as np

V = len(term_count.keys())  # number of distinct word types in the corpus
with open("testdata.txt", 'r') as f:
    for line in f:
        items = line.rstrip().split('\t')
        sent = items[2].rstrip('.').split()  # e.g. sent = ["I", "like", "playing"]
        for idx, word in enumerate(sent):
            # A word missing from the vocabulary is taken to be a spelling error
            if word not in vocab:
                # 1. Generate all valid candidates
                candidates = generate_candidates(word)
                if len(candidates) < 1:
                    # (Better: fall back to edit-distance-2 candidates, e.g. the
                    # generate_candidates_ed2 sketch above; here we simply skip.)
                    continue
                probs = []
                # 2. For each candidate, compute its score:
                #        score = P(correct) * P(mistake|correct)
                #    log score = log P(correct) + log P(mistake|correct)
                #    and return the candidate with the highest score.
                for candi in candidates:
                    prob = 0
                    # a. channel probability
                    if candi in channel_prob and word in channel_prob[candi]:
                        prob += np.log(channel_prob[candi][word])
                    else:
                        prob += np.log(0.0001)  # small floor for unseen (correct, mistake) pairs
                    # b. language-model probability for [pre_word, candi]:
                    #    P(candi|pre_word) = count(pre_word, candi) / count(pre_word),
                    #    with add-one smoothing; the first word's left context is '<s>'
                    pre_word = sent[idx - 1] if idx > 0 else '<s>'
                    bigram = ' '.join([pre_word, candi])
                    if bigram in bigram_count and pre_word in term_count:
                        prob += np.log((bigram_count[bigram] + 1) / (term_count[pre_word] + V))
                    else:
                        prob += np.log(1.0 / V)
                    #    ... and for [candi, post_word]
                    if idx + 1 < len(sent):  # skipped when the misspelled word is the last one
                        bigram = ' '.join([candi, sent[idx + 1]])
                        if bigram in bigram_count and candi in term_count:
                            prob += np.log((bigram_count[bigram] + 1) / (term_count[candi] + V))
                        else:
                            prob += np.log(1.0 / V)
                    # prob is this candidate's score; collecting them keeps scores
                    # aligned with candidates, e.g.
                    #   candidates: ['apples', 'apply', 'apple', 'ample']
                    #   probs:      [0.8, 0.2, 0.1, 0.1]
                    # and the candidate with the largest score wins.
                    probs.append(prob)
                max_idx = probs.index(max(probs))
                print(word, candidates[max_idx])