NLP Basics: A Spelling Correction Implementation

# Step 1: build the vocabulary (any word list will do: download one or crawl your own)
vocab = set([line.rstrip() for line in open('./vocab.txt')])
vocab

Output:
{'widths', 'truer', …}
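If no ready-made vocab.txt is at hand, one alternative is to derive the word list from an NLTK corpus. A minimal sketch of that option (not the original setup; nltk.corpus.words is just one possible source):

```python
# Sketch: build the vocabulary from NLTK's English word list instead of a
# downloaded vocab.txt. May require nltk.download('words') beforehand.
from nltk.corpus import words

vocab = set(w.lower() for w in words.words())
```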

# Step 2: generate all valid words at edit distance 1
# Define a function that generates every candidate one edit away
def generate_candidates(word):
    """
    word: the given (misspelled) input
    Returns all valid candidates, i.e. edits found in the vocabulary.
    """

    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # delete one character
    deletes = [l + r[1:] for l, r in splits]
    # insert one character
    inserts = [l + c + r for l, r in splits for c in letters]
    # replace one character
    replaces = [l + c + r[1:] for l, r in splits for c in letters]

    words = set(deletes + inserts + replaces)
    candidates = [w for w in words if w in vocab]
    return candidates

generate_candidates('apple')
['apples', 'apply', 'apple', 'ample']
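When nothing at edit distance 1 is in the vocabulary (the situation Step 5 below skips over), a common extension is to search at edit distance 2 by applying the single-character edits twice. A minimal sketch of that fallback; `edits1` and `generate_candidates_2` are hypothetical helpers, not part of the original post:

```python
# Sketch: edit-distance-2 candidates. The vocabulary filter is deferred to
# the end, since the intermediate string one edit away need not be a word.
def edits1(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    inserts = [l + c + r for l, r in splits for c in letters]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    return set(deletes + inserts + replaces)

def generate_candidates_2(word):
    # every string reachable by two single-character edits
    words = set(e2 for e1 in edits1(word) for e2 in edits1(e1))
    return [w for w in words if w in vocab]
```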
# Step 3: build a bigram language model (LM) from a corpus
from nltk.corpus import reuters

categories = reuters.categories()
corpus = reuters.sents(categories=categories)

term_count = {}    # count of each token as a bigram history (left element)
bigram_count = {}  # count of each bigram, keyed as a concatenated string
for doc in corpus:
    doc = ['<s>'] + doc  # sentence-start token
    for i in range(0, len(doc) - 1):
        term = doc[i]
        bigram = doc[i:i + 2]
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        bigram = ''.join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
            
print(bigram)  # sanity check: the last bigram key built in the loop
6mln
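For readability, the add-one (Laplace) smoothed bigram probability used in Step 5, P(w | w_prev) = (count(w_prev, w) + 1) / (count(w_prev) + V), can be wrapped in a small helper over the counts built above. A sketch; `bigram_logprob` is a hypothetical name, not in the original code:

```python
import numpy as np

V = len(term_count.keys())  # vocabulary size, same as defined in Step 5

def bigram_logprob(pre_word, word):
    """Add-one smoothed log P(word | pre_word) from the counts above (sketch)."""
    key = ''.join([pre_word, word])  # bigrams are keyed as concatenated strings
    if key in bigram_count and pre_word in term_count:
        return np.log((bigram_count[key] + 1) / (term_count[pre_word] + V))
    return np.log(1.0 / V)  # uniform floor for unseen bigrams
```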
# Step 4: estimate the channel model, i.e. the probability of each misspelling
# (In practice you would count, from user logs, how many distinct misspellings
# each correct word has and how often each occurs, to estimate
# P(mistake1|correct), P(mistake2|correct), ...
# e.g. {'raining': {'rainning': 0.5, 'raning': 0.5}, ...}
# This project simply assumes all misspellings of a word are equally likely.)
channel_prob = {}
for line in open('spell-errors.txt'):
    item = line.split(':')
    correct = item[0].strip()
    mistakes = [misword.strip() for misword in item[1].strip().split(',')]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1 / len(mistakes)
{'raining': {'rainning': 0.5, 'raning': 0.5}, 'writings': {'writtings': 1.0}, 'disparagingly': {'disparingly': 1.0}, 'yellow': {'yello': 1.0}, 'four': {'forer': 0.2, 'fours': 0.2, 'fuore': 0.2, 'fore*5': 0.2, 'for*4': 0.2}, 'woods': {'woodes': 1.0}, 'hanging': {'haing': 1.0}, 'aggression': {'agression': 1.0}, 'looking': {'loking': 0.1, 'begining': 0.1, 'luing': 0.1, 'look*2': 0.1, 'locking': 0.1, 'lucking': 0.1, 'louk': 0.1, 'looing': 0.1, 'lookin': 0.1, 'liking': 0.1},  'misdemeanors': {'misdameanors': 0.5, 'misdemenors': 0.5}
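Note the raw keys such as 'fore*5' and 'for*4' in the output above: they suggest spell-errors.txt marks repeated misspellings with a *N count suffix, which the uniform split ignores. If that reading of the file format is right (it is inferred from the output, not stated in the original), a count-weighted channel model would look like this:

```python
# Sketch: weight each misspelling by its observed count instead of uniformly.
# Assumes entries like 'fore*5' mean "seen 5 times" (inferred from the raw
# keys visible in the output above).
channel_prob = {}
for line in open('spell-errors.txt'):
    correct, _, rest = line.partition(':')
    correct = correct.strip()
    counts = {}
    for token in rest.strip().split(','):
        token = token.strip()
        if '*' in token:
            mis, _, n = token.partition('*')
            counts[mis.strip()] = int(n)
        else:
            counts[token] = 1
    total = sum(counts.values())
    channel_prob[correct] = {mis: n / total for mis, n in counts.items()}
```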
# Step 5: correct the misspelled words in the test data
import numpy as np
V = len(term_count.keys())  # number of distinct word types in the corpus

file = open("testdata.txt", 'r')
for line in file:
    items = line.rstrip().split('\t')
    line = items[2].rstrip('.').split()  # line = ["I", "like", "playing"]
    for idx, word in enumerate(line):  # enumerate so repeated words keep the right context
        # any word missing from the vocabulary is treated as a misspelling
        if word not in vocab:
            # Step 5a: generate all valid candidates
            candidates = generate_candidates(word)
            if len(candidates) < 1:
                continue  # ideally we would fall back to edit-distance-2
                          # candidates here (see the sketch in Step 2 and the
                          # helper after this block); for now we just skip
            probs = []
            # For each candidate, compute its score:
            #   score = p(correct) * p(mistake|correct)
            #         = log p(correct) + log p(mistake|correct)
            # and return the candidate with the highest score.
            for candi in candidates:
                prob = 0
                # a. channel probability
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)  # small floor for unseen errors
                # b. language-model probability for [pre_word, candi],
                #    with add-one smoothing:
                #    log P(candi|pre_word)
                #      = log (count(pre_word, candi) + 1) / (count(pre_word) + V)
                pre_word = line[idx - 1] if idx > 0 else '<s>'  # '<s>' matches the LM
                bigram = ''.join([pre_word, candi])
                if bigram in bigram_count and pre_word in term_count:
                    prob += np.log((bigram_count[bigram] + 1) / (term_count[pre_word] + V))
                else:
                    prob += np.log(1.0 / V)
                # likewise for the [candi, post_word] bigram
                if idx + 1 < len(line):  # skipped when the misspelling is the last word
                    bigram = ''.join([candi, line[idx + 1]])
                    if bigram in bigram_count and candi in term_count:
                        prob += np.log((bigram_count[bigram] + 1) / (term_count[candi] + V))
                    else:
                        prob += np.log(1.0 / V)
                # prob is this candidate's log score; the scores line up with
                # the candidates, e.g.
                #   candidates: ['apples', 'apply', 'apple', 'ample']
                #   probs:      [-0.8, -2.2, -1.1, -3.1]
                # and we pick the candidate with the highest score
                probs.append(prob)
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
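Combining the pieces, the `continue` branch above could first try the hypothetical edit-distance-2 generator sketched in Step 2 before giving up, e.g. via a small wrapper:

```python
# Sketch: try edit distance 1 first, then fall back to the (hypothetical)
# distance-2 generator from Step 2 before skipping the word entirely.
def candidates_with_fallback(word):
    cands = generate_candidates(word)
    return cands if cands else generate_candidates_2(word)
```

In the loop above, `candidates = generate_candidates(word)` would then simply become `candidates = candidates_with_fallback(word)`.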