from nltk.corpus import reuters
from docx import Document
from nltk import sent_tokenize, word_tokenize
import re
import numpy as np
# Vocabulary: the set of correctly spelled words
vocab = []
for line in open("data/spell-testset1.txt"):
    items = line.split(":")
    item = items[0].strip()
    vocab.append(item)
vocabs = set(vocab)
# print(vocabs)
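# Assumed file format (illustrative, inferred from how the file is parsed below):
# each line holds a correct word, a colon, then its observed misspellings, e.g.
#   raining: rainning raning      (hypothetical entry)
# Only the part before the colon is added to the vocabulary here.
# print(len(vocabs))  # sanity check: number of distinct correct words loaded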
# Generate all candidate strings within edit distance 1
def generate_candidates(word):
    """
    word: the given input (a possibly misspelled word)
    returns the set of all candidate strings (filtered against the vocabulary by the caller)
    """
    # Generate strings at edit distance 1: 1.insert 2.delete 3.replace
    # Assume the 26 lowercase letters
    letters = "abcdefghijklmnopqrstuvwxyz"
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # print(splits)
    # 1. insert
    inserts = [L + c + R for L, R in splits for c in letters]
    # print(len(inserts))
    # 2. delete
    deletes = [L + R[1:] for L, R in splits]
    # print(len(deletes))
    # 3. replace
    replaces = [L + c + R[1:] for L, R in splits for c in letters]
    # print(len(replaces))
    candidate = set(inserts + deletes + replaces)
    return candidate
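# Illustrative sanity check: each of insert/delete/replace contributes at most
# 26*(len(word)+1) strings, so a short word yields a few hundred raw candidates
# before set() removes duplicates.
# Example (commented out so the script's output stays unchanged):
# print(len(generate_candidates("apple")))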
def generate_edit_two(word):
    """
    Given a word, generate all strings within edit distance 2
    and keep only those that appear in the vocabulary.
    """
    candi = []
    for e1 in generate_candidates(word):
        candi += generate_candidates(e1)
    return [w for w in candi if w in vocabs]
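# Illustrative usage: although the raw edit-distance-2 search space grows to tens of
# thousands of strings, only real vocabulary words survive the final filter, so the
# returned list is usually short. The exact output depends on the loaded vocabulary.
# print(generate_edit_two("protectin"))  # hypothetical misspelling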
# Load the Reuters corpus
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
# Build the language model: bigram counts
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ["<s>"] + doc
    for i in range(0, len(doc) - 1):
        term = doc[i]
        bigram = doc[i:i + 2]  # bigram: tokens [i, i+1]
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        bigram = " ".join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
# print(term_count)
# print(bigram_count)
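# Shape of the counts built above (actual numbers depend on the Reuters corpus):
# term_count maps a single token to its frequency, bigram_count maps the
# space-joined pair to its frequency, e.g.
#   term_count["would"]         -> count of "would"
#   bigram_count["would like"]  -> count of the pair "would like"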
# Channel probability: how likely the user types each misspelling given the correct word
channel_prob = {}
for line in open("data/spell-testset1.txt"):
    items = line.split(":")
    correct = items[0].strip()
    mistakes = [item.strip() for item in items[1].strip().split(" ")]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1.0 / len(mistakes)
# print(channel_prob)
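# Illustrative shape of channel_prob (values depend on spell-testset1.txt):
#   channel_prob["raining"] == {"rainning": 0.5, "raning": 0.5}   # hypothetical entry
# i.e. each listed misspelling of a correct word gets a uniform share of the probability.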
punctuation = '.!,;:?"\''
def removePunctuation(text):
    text = re.sub(r'[{}]+'.format(punctuation), '', text)
    return text.strip().lower()
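# Illustrative usage:
#   removePunctuation('Hello, World!')  ->  'hello world'
# punctuation characters are stripped and the text is lower-cased before tokenization.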
# Test: correct spelling errors in the Word document
V = len(term_count.keys())
doc = Document("data/Spelling Error.docx")
for i in range(len(doc.paragraphs)):
    # Content of each paragraph
    paragraph = doc.paragraphs[i].text.strip()
    # Split into sentences
    sentences = sent_tokenize(text=paragraph)
    # Split into words
    words_list = [word_tokenize(removePunctuation(sentence)) for sentence in sentences]
    document = Document()
    p = document.add_paragraph(' ' * 7)  # paragraph handle
    # print(words_list)
    # words_list = [["I", "like", "apple"], ["I", "would", "like", "to", "have", "a", "party"]]
    for word_list in words_list:
        for word in word_list:
            if word not in vocabs:
                # word needs to be replaced with the correct word
                # step 1: generate all valid candidates
                # if candidates == [], try edit distance 2 (or 3) to generate more candidates
                candidate = generate_candidates(word)
                # Filter out candidates that are not in the vocabulary
                words = [w for w in candidate if w in vocabs]
                if len(words) < 3:
                    candidates = generate_edit_two(word)
                else:
                    candidates = words
                if not candidates:
                    # no usable candidate found; keep the original word
                    continue
                probs = []
                # For each candidate, compute its score:
                # score = p(correct) * p(mistake|correct)
                #       = log p(correct) + log p(mistake|correct)
                # return the candidate with the highest score
                for candi in candidates:
                    prob = 0
                    # a. channel probability
                    if candi in channel_prob and word in channel_prob[candi]:
                        prob += np.log(channel_prob[candi][word])
                    else:
                        prob += np.log(0.00001)
                    # b. language-model probability with add-one smoothing, e.g.
                    # p(like | would) = (c(would like) + 1) / (c(would) + V)
                    # in log space: log(c(would like) + 1) - log(c(would) + V)
                    idx = word_list.index(word)
                    prev = word_list[idx - 1] if idx > 0 else "<s>"
                    bigram = prev + " " + candi
                    if bigram in bigram_count and prev in term_count:
                        prob += np.log((bigram_count[bigram] + 1.0) /
                                       (term_count[prev] + V))
                    else:
                        prob += np.log(1.0 / V)
                    probs.append(prob)
                max_idx = probs.index(max(probs))
                print(word, candidates[max_idx])
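# Worked example of the scoring above (numbers are illustrative, not from the real data):
# for the misspelling "rainning" after the word "was", the candidate "raining" scores
#   log p("rainning" | "raining") + log[(c("was raining") + 1) / (c("was") + V)]
# and the candidate with the highest total log-probability is printed next to the
# original misspelling.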