Spelling Correction

from nltk.corpus import reuters
from docx import Document
from nltk import sent_tokenize, word_tokenize
import re
import numpy as np

# Vocabulary: collect the correct words from the left side of each line
vocab = []
for line in open("data/spell-testset1.txt"):
    items = line.split(":")
    item = items[0].strip()
    vocab.append(item)
vocabs = set(vocab)
# print(vocabs)

# Generate all candidate strings
def generate_candidates(word):
    """
    word: the given (misspelled) input
    returns the set of all candidate strings at edit distance 1
    """
    # generate words at edit distance 1: 1.insert 2.delete 3.replace
    # assume a 26-letter alphabet
    letters = "abcdefghijklmnopqrstuvwxyz"

    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    # print(splits)
    # 1. insert
    inserts = [L + c + R for L, R in splits for c in letters]
    # print(len(inserts))
    # 2. delete
    deletes = [L + R[1:] for L, R in splits]
    # print(len(deletes))
    # 3. replace
    replaces = [L + c + R[1:] for L, R in splits for c in letters]
    # print(len(replaces))

    candidate = set(inserts + deletes + replaces)

    return candidate
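
A quick sanity check of the generator: for a short word it produces several hundred edit-distance-1 strings, of which only a handful survive the vocabulary filter. The misspelling "appl" below is only an illustrative input, not part of the test data.

# Illustrative check; the misspelled input "appl" is just an example.
cands = generate_candidates("appl")
print(len(cands))                            # several hundred edit-distance-1 strings
print([w for w in cands if w in vocabs])     # the few that are also in the vocabulary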

def generate_edit_two(word):
    """
    Given a string, generate all vocabulary words within edit distance 2.
    :param word: the (misspelled) input string
    :return: list of in-vocabulary candidates
    """
    candi = []
    for e1 in generate_candidates(word):
        candi += generate_candidates(e1)
    return [w for w in candi if w in vocabs]

# Load the Reuters corpus
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

# Build the language model: bigram
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ["<s>"] + doc
    for i in range(0, len(doc)-1):
        term = doc[i]
        bigram = doc[i:i+2]  # bigram: [i, i+1]

        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        bigram = " ".join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
# print(term_count)
# print(bigram_count)
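
With term_count and bigram_count filled, an add-one smoothed bigram probability can be read off directly; this is exactly the quantity used in the scoring loop further down. The word pair "he said" is only an assumed example of words that occur in Reuters; any pair works.

# Minimal sketch: add-one smoothed p("said" | "he"); the word pair is an assumed example.
prev, cur = "he", "said"
p = (bigram_count.get(prev + " " + cur, 0) + 1.0) / (term_count.get(prev, 0) + len(term_count))
print(p)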

# Channel probability: how likely the user mistypes each correct word
channel_prob = {}
for line in open("data/spell-testset1.txt"):
    items = line.split(":")
    correct = items[0].strip()
    mistake = [item.strip() for item in items[1].strip().split(" ")]
    channel_prob[correct] = {}
    for mis in mistake:
        channel_prob[correct][mis] = 1.0/len(mistake)
# print(channel_prob)
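
Each line of spell-testset1.txt is expected to look like "correct: mistake1 mistake2 ...", which is what the parsing above assumes; every listed misspelling gets an equal share of the probability mass. For instance, a hypothetical line would yield:

# Hypothetical line "raining: rainning raning" ->
# channel_prob["raining"] == {"rainning": 0.5, "raning": 0.5}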

punctuation = '.!,;:?"\''
def removePunctuation(text):
    # strip punctuation, then trim and lowercase the text
    text = re.sub(r'[{}]+'.format(punctuation), '', text)
    return text.strip().lower()
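
A quick check of the cleaner (the sample sentence is made up):

print(removePunctuation("Hello, World!"))   # -> hello world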

# Test: correct the spelling errors in the .docx document
V = len(term_count.keys())
doc = Document("data/Spelling Error.docx")
for i in range(len(doc.paragraphs)):
    # text of the current paragraph
    paragraph = doc.paragraphs[i].text.strip()
    # split into sentences
    sentences = sent_tokenize(text=paragraph)
    # split into words
    words_list = [word_tokenize(removePunctuation(sentence)) for sentence in sentences]

    document = Document()
    p = document.add_paragraph(' '*7)  # paragraph handle (prepared for writing output; never saved here)
    # print(words_list)

    #words_list=[["I","like","apple"],["I","would","like","to","have","a","party"]]
    for word_list in words_list:
        for word in word_list:
            if word not in vocabs:
                # the word needs to be replaced with a correct one
                # step 1: generate all valid candidates
                # if candidates is empty, fall back to edit distance 2 (or 3) to generate more
                candidate = generate_candidates(word)
                # keep only candidates that exist in the vocabulary
                words = [w for w in candidate if w in vocabs]
                if len(words) < 3:
                    candidates = generate_edit_two(word)
                else:
                    candidates = words
                if len(candidates) == 0:
                    # no in-vocabulary candidate found; leave the word as-is
                    continue

                probs = []
                # For each candidate, compute its score
                # score = p(correct) * p(mistake|correct)
                #       = log p(correct) + log p(mistake|correct)
                # return the candidate with the highest score
                for candi in candidates:
                    prob = 0
                    # a. channel probability
                    if candi in channel_prob and word in channel_prob[candi]:
                        prob += np.log(channel_prob[candi][word])
                    else:
                        prob += np.log(0.00001)
                    # b. language-model probability with add-one smoothing
                    #    p(candi|prev) = (c(prev candi) + 1) / (c(prev) + V)
                    #    in log space: log(c(prev candi) + 1) - log(c(prev) + V)
                    idx = word_list.index(word)
                    prev = word_list[idx-1] if idx > 0 else "<s>"
                    bigram = prev + " " + candi
                    if bigram in bigram_count and prev in term_count:
                        prob += np.log((bigram_count[bigram] + 1.0) /
                                       (term_count[prev] + V))
                    else:
                        prob += np.log(1.0 / V)

                    probs.append(prob)

                # pick the candidate with the highest score
                max_idx = probs.index(max(probs))
                print(word, candidates[max_idx])
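
The Document() and paragraph handle created inside the loop are never written out. If the corrections should also be saved to a new .docx file, a minimal sketch could look like the following; the helper name, the (wrong, right) pair format, and the output path "data/corrected.docx" are assumptions for illustration, not part of the original script.

# Minimal sketch, assuming corrections are collected as (wrong, right) pairs
# while the loop above runs; the output path is an assumed example.
from docx import Document

def save_corrections(pairs, path="data/corrected.docx"):
    out_doc = Document()
    for wrong, right in pairs:
        out_doc.add_paragraph("%s -> %s" % (wrong, right))
    out_doc.save(path)

# usage: save_corrections([("protectionst", "protectionist")])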

