# Step 1: Build the vocabulary (a word list found online or one you crawl yourself both work)
vocab = set([line.rstrip() for line in open('./vocab.txt')])
vocab
Output:
{'widths', 'truer', ...}
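# (Aside: if no vocab.txt is at hand, NLTK's bundled word list is one possible
# stand-in. This is a sketch, not part of the original project; its coverage
# differs from a crawled vocabulary, so downstream corrections may differ too.)
import nltk
nltk.download('words')
from nltk.corpus import words
vocab = set(w.lower() for w in words.words())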
# Step 2: Generate valid words at edit distance 1
# Define a function that generates all candidate words at edit distance 1
def generate_candidates(word):
    """
    word: the given (misspelled) input
    Returns all valid candidates (those found in vocab)
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # delete: drop one character (only when there is a character left to drop)
    deletes = [l + r[1:] for l, r in splits if r]
    # insert: add one character at any position
    inserts = [l + c + r for l, r in splits for c in letters]
    # replace: substitute one character (only when there is a character to substitute)
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    words = set(deletes + inserts + replaces)
    candidates = [w for w in words if w in vocab]
    return candidates
generate_candidates('apple')
['apples', 'apply', 'apple', 'ample']
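# (Aside: Step 5 below notes that when no edit-distance-1 candidate exists, it
# would be better to fall back to edit-distance-2 candidates. A minimal sketch;
# generate_edits and generate_candidates_ed2 are hypothetical helpers, not part
# of the original project:)
def generate_edits(word):
    """All strings (valid or not) at edit distance 1 from word."""
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    inserts = [l + c + r for l, r in splits for c in letters]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    return set(deletes + inserts + replaces)

def generate_candidates_ed2(word):
    """All valid candidates within edit distance 2 of word."""
    ed1 = generate_edits(word)
    ed2 = set(e2 for e1 in ed1 for e2 in generate_edits(e1))
    return [w for w in ed1 | ed2 if w in vocab]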
# Step 3: Build a language model (bigram) from a corpus
from nltk.corpus import reuters

categories = reuters.categories()
corpus = reuters.sents(categories=categories)

term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc  # sentence-start token, so the first word also has a left context
    for i in range(0, len(doc) - 1):
        term = doc[i]
        # join with a space so 'a'+'bc' and 'ab'+'c' cannot collide into the same key
        bigram = ' '.join(doc[i:i + 2])
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
print(bigram)  # sanity check: the last bigram counted
6 mln
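# (Aside: the add-one smoothed bigram probability that Step 5 computes inline
# can be wrapped in a small helper. bigram_prob is a hypothetical convenience
# function, assuming the term_count / bigram_count dictionaries built above:)
def bigram_prob(pre_word, word):
    """P(word | pre_word) with add-one (Laplace) smoothing."""
    V = len(term_count)
    bigram = ' '.join([pre_word, word])
    return (bigram_count.get(bigram, 0) + 1) / (term_count.get(pre_word, 0) + V)
# e.g. bigram_prob('<s>', 'The') gives the smoothed probability of 'The'
# starting a sentence.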
# Step 4: Estimate the probability of users' typos (the channel model)
# (In practice this comes from user logs: for each correct word, count the
# kinds and frequencies of its misspellings to get P(mistake1|correct),
# P(mistake2|correct), ...
# e.g. {'raining': {'rainning': 0.5, 'raning': 0.5}, ...}
# This project assumes every misspelling of a word is equally likely.)
channel_prob = {}
for line in open('spell-errors.txt'):
    item = line.split(':')
    correct = item[0].strip()
    mistakes = [misword.strip() for misword in item[1].strip().split(',')]
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis] = 1 / len(mistakes)
{'raining': {'rainning': 0.5, 'raning': 0.5}, 'writings': {'writtings': 1.0}, 'disparagingly': {'disparingly': 1.0}, 'yellow': {'yello': 1.0}, 'four': {'forer': 0.2, 'fours': 0.2, 'fuore': 0.2, 'fore*5': 0.2, 'for*4': 0.2}, 'woods': {'woodes': 1.0}, 'hanging': {'haing': 1.0}, 'aggression': {'agression': 1.0}, 'looking': {'loking': 0.1, 'begining': 0.1, 'luing': 0.1, 'look*2': 0.1, 'locking': 0.1, 'lucking': 0.1, 'louk': 0.1, 'looing': 0.1, 'lookin': 0.1, 'liking': 0.1}, 'misdemeanors': {'misdameanors': 0.5, 'misdemenors': 0.5}, ...}
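# (Aside: the '*N' suffixes visible in the output above, e.g. 'fore*5', appear
# to be how spell-errors.txt records that a misspelling was observed N times.
# A sketch of a frequency-weighted alternative to the uniform assumption; the
# '*N' parsing is an assumption about the file format, not original code:)
channel_prob_weighted = {}
for line in open('spell-errors.txt'):
    correct, _, mistakes = line.partition(':')
    counts = {}
    for item in mistakes.strip().split(','):
        item = item.strip()
        if '*' in item:
            mis, _, n = item.partition('*')
            counts[mis] = int(n)  # observed frequency of this misspelling
        else:
            counts[item] = 1
    total = sum(counts.values())
    channel_prob_weighted[correct.strip()] = {mis: n / total for mis, n in counts.items()}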
# Step 5: Correct the misspelled words in the test data.
import numpy as np

V = len(term_count.keys())  # number of distinct word types in the corpus
with open("testdata.txt", 'r') as f:
    for line in f:
        items = line.rstrip().split('\t')
        sent = items[2].rstrip('.').split()  # e.g. sent = ["I", "like", "playing"]
        for idx, word in enumerate(sent):
            # A word missing from the vocabulary is taken to be a spelling error
            if word not in vocab:
                # 1. Generate all valid candidates
                candidates = generate_candidates(word)
                if len(candidates) < 1:
                    # (Better: fall back to edit-distance-2 candidates, e.g. the
                    # generate_candidates_ed2 sketch above; here we simply skip.)
                    continue
                probs = []
                # 2. For each candidate, compute its score:
                #        score = P(correct) * P(mistake|correct)
                #    log score = log P(correct) + log P(mistake|correct)
                #    and return the candidate with the highest score.
                for candi in candidates:
                    prob = 0
                    # a. channel probability
                    if candi in channel_prob and word in channel_prob[candi]:
                        prob += np.log(channel_prob[candi][word])
                    else:
                        prob += np.log(0.0001)  # small floor for unseen (correct, mistake) pairs
                    # b. language-model probability for [pre_word, candi]:
                    #    P(candi|pre_word) = count(pre_word, candi) / count(pre_word),
                    #    with add-one smoothing; the first word's left context is '<s>'
                    pre_word = sent[idx - 1] if idx > 0 else '<s>'
                    bigram = ' '.join([pre_word, candi])
                    if bigram in bigram_count and pre_word in term_count:
                        prob += np.log((bigram_count[bigram] + 1) / (term_count[pre_word] + V))
                    else:
                        prob += np.log(1.0 / V)
                    #    ... and for [candi, post_word]
                    if idx + 1 < len(sent):  # skipped when the misspelled word is the last one
                        bigram = ' '.join([candi, sent[idx + 1]])
                        if bigram in bigram_count and candi in term_count:
                            prob += np.log((bigram_count[bigram] + 1) / (term_count[candi] + V))
                        else:
                            prob += np.log(1.0 / V)
                    # prob is this candidate's score; collecting them keeps scores
                    # aligned with candidates, e.g.
                    #   candidates: ['apples', 'apply', 'apple', 'ample']
                    #   probs:      [0.8, 0.2, 0.1, 0.1]
                    # and the candidate with the largest score wins.
                    probs.append(prob)
                max_idx = probs.index(max(probs))
                print(word, candidates[max_idx])