NLPCamp-SpellCorrection


SpellCorrection


# Vocabulary
vocab = set([line.rstrip() for line in open('vocab.txt')])
# Generate the full set of candidate corrections
def generate_candidates(word):
    """
    word: the given (misspelled) input
    Returns all valid candidates
    """
    # Generate words at edit distance 1
    # 1. insert  2. delete  3. replace
    # appl: replace: bppl, cppl, aapl, abpl...
    #       insert: bappl, cappl, abppl, acppl...
    #       delete: ppl, apl, app
    
    # Assume the 26 lowercase letters
    letters = 'abcdefghijklmnopqrstuvwxyz'
    
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    # insert
    inserts = [L+c+R for L, R in splits for c in letters]
    # delete
    deletes = [L+R[1:] for L, R in splits if R]
    # replace
    replaces = [L+c+R[1:] for L, R in splits if R for c in letters]
    
    candidates = set(inserts + deletes + replaces)
    
    # Filter out words that are not in the vocabulary
    return [word for word in candidates if word in vocab]
    
generate_candidates("apple")
['ample', 'apples', 'apply', 'apple']
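One way to cover edit distance 2 (needed below when no distance-1 candidate exists) is to compose two distance-1 passes. Here is a minimal sketch under that assumption; edits1 and generate_candidates2 are hypothetical names, and intermediate strings are deliberately left unfiltered:

def edits1(word):
    # All strings at edit distance 1 (insert/delete/replace), unfiltered
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    inserts = [L+c+R for L, R in splits for c in letters]
    deletes = [L+R[1:] for L, R in splits if R]
    replaces = [L+c+R[1:] for L, R in splits if R for c in letters]
    return set(inserts + deletes + replaces)

def generate_candidates2(word):
    # Valid vocabulary words within edit distance 2 of word
    e1 = edits1(word)
    e2 = set(w2 for w1 in e1 for w2 in edits1(w1))
    return [w for w in e1 | e2 if w in vocab]

Unlike the expansion in the main loop below, which only re-expands candidates that already passed the vocabulary filter, composing unfiltered edits also reaches distance-2 words whose intermediate string is not itself a word.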
from nltk.corpus import reuters

# Load the corpus
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
# Build the language model: bigram counts
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc
    for i in range(0, len(doc)-1):
        # bigram: [i,i+1]
        term = doc[i]
        bigram = doc[i:i+2]
        
        if term in term_count:
            term_count[term]+=1
        else:
            term_count[term]=1
        bigram = ' '.join(bigram)
        if bigram in bigram_count:
            bigram_count[bigram]+=1
        else:
            bigram_count[bigram]=1
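
The scoring step below turns these raw counts into probabilities with add-1 (Laplace) smoothing. For reference, a small hypothetical helper that mirrors the exact formula used there, p(w2|w1) = (count(w1 w2) + 1) / (count(w1) + V), with a 1/V fallback for unseen bigrams:

import numpy as np

def bigram_log_prob(w1, w2, term_count, bigram_count, V):
    # Add-1 smoothed log p(w2 | w1); V is the vocabulary size
    bigram = w1 + ' ' + w2
    if bigram in bigram_count:
        return np.log((bigram_count[bigram] + 1.0) / (term_count[w1] + V))
    # Unseen bigram: fall back to 1/V, matching the loop below
    return np.log(1.0 / V)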

# sklearn also ships ready-made packages for this kind of n-gram counting
# Statistics of users' typing errors - channel probability
channel_prob = {}

for line in open('spell-errors.txt'):
    items = line.split(":")
    correct = items[0].strip()
    
    # EDIT START
    # The '*' in the data likely marks the frequency of that misspelling
    mistakes = [item.strip().split("*") for item in items[1].strip().split(",")]
    mistakes = [[mis[0], 1] if len(mis) == 1 else [mis[0], int(mis[1])] for mis in mistakes]
    mis_count = sum([mis[1] for mis in mistakes])
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis[0]] = mis[1] / mis_count
    # EDIT END

print(channel_prob)   
{'raining': {'rainning': 0.5, 'raning': 0.5}, 'writings': {'writtings': 1.0}, 'disparagingly': {'disparingly': 1.0}, 'yellow': {'yello': 1.0}, 'four': {'forer': 0.08333333333333333, 'fours': 0.08333333333333333, 'fuore': 0.08333333333333333, 'fore': 0.4166666666666667, 'for': 0.3333333333333333}, 'woods': {'woodes': 1.0}, 'hanging': {'haing': 1.0}, 'aggression': {'agression': 1.0}, 'looking': {'loking': 0.09090909090909091, 'begining': 0.09090909090909091, 'luing': 0.09090909090909091, 'look': 0.18181818181818182, 'locking': 0.09090909090909091, 'lucking': 0.09090909090909091, 'louk': 0.09090909090909091, 'looing': 0.09090909090909091, 'lookin': 0.09090909090909091, 'liking': 0.09090909090909091},
	...
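To make the parsing above concrete: judging from the printed probabilities, lines in spell-errors.txt have the following shape (reconstructed here with the smallest counts consistent with the output; a missing *N suffix means a count of 1):

raining: rainning, raning
four: forer, fours, fuore, fore*5, for*4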
import numpy as np
V = len(term_count.keys())

file = open("testdata.txt", 'r')
for line in file:
    items = line.rstrip().split('\t')
    line = items[2].split()
    # line = ["I", "like", "playing"]
    for word in line:
        if word not in vocab:
            # Replace word with the correct spelling
            # Step 1: generate all valid candidates
            candidates = generate_candidates(word)
            
            # One option: if candidates == [], generate a few more candidates,
            # e.g. those within edit distance 2
            # TODO: generate more candidates as needed
            
            # EDIT START
            # Collect candidates within edit distance 2
            for i in range(len(candidates)):
                candidates += generate_candidates(candidates[i])
            candidates = list(set(candidates))
            # EDIT END
            
            if len(candidates) < 1:
                continue   # not recommended (this is wrong: the typo is silently left in place)
            
            probs = []
            # For each candidate, compute its score
            # score = p(correct) * p(mistake|correct)
            #       = log p(correct) + log p(mistake|correct)
            # Return the candidate with the highest score
            for candi in candidates:
                prob = 0
                # a. channel probability
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)
                
                # b. language model probability
                # EDIT START
                idx = line.index(word)  # first occurrence of word in the sentence
                bigram = ['<s>', candi] if idx == 0 else [line[idx - 1], candi]
                if " ".join(bigram) in bigram_count:
                    prob += np.log((bigram_count[" ".join(bigram)] + 1.0) / (
                            term_count[bigram[0]] + V))
                else:
                    prob += np.log(1.0 / V)
                # EDIT END
                # TODO: also consider the current bigram [word, post_word]
                #   prob += np.log(bigram probability)

                probs.append(prob)
            
#             # EDIT START
#             # The original code has a bug: probs holds essentially a single
#             # value, so taking the max is effectively meaningless
#             # Uncomment this block to inspect probs
#             print(set(probs))
#             # EDIT END
                
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
protectionst protectionist
products. products
long-run, longrun
gain. gain
17, 1
retaiation retaliation
 ...
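
As the TODO in the loop notes, only the preceding bigram [prev_word, candi] is scored; the following bigram carries signal too, since candidates sharing the same left context are otherwise tied on the language-model term. A hedged sketch of the missing term, reusing the same add-1 smoothing (following_bigram_log_prob is a hypothetical helper; inside the loop it would be added via prob += following_bigram_log_prob(candi, line, idx, bigram_count, term_count, V)):

import numpy as np

def following_bigram_log_prob(candi, line, idx, bigram_count, term_count, V):
    # Add-1 smoothed log p(next_word | candi); returns 0.0 when the
    # misspelled word is the last token of the sentence
    if idx + 1 >= len(line):
        return 0.0
    next_bigram = candi + ' ' + line[idx + 1]
    if next_bigram in bigram_count and candi in term_count:
        return np.log((bigram_count[next_bigram] + 1.0) /
                      (term_count[candi] + V))
    return np.log(1.0 / V)

Separately, corrections like "products. products" and "17, 1" in the output above arise because punctuation stays glued to the tokens; stripping punctuation before the vocabulary lookup would avoid flagging such tokens in the first place.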

P.S.

If reading the nltk.corpus corpora fails, try downloading all of nltk_data.
If errors occur while iterating over corpus, try manually unzipping punkt at the path shown in the error message and creating the corresponding folder structure.
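
For example, the required resources can be fetched from within Python (nltk.download is the standard entry point; 'all' pulls the full nltk_data set as suggested above):

import nltk
nltk.download('reuters')   # the corpus used above
nltk.download('punkt')     # tokenizer data needed by reuters.sents
# or, to grab everything:
# nltk.download('all')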

# EDIT START
# EDIT START -> END: Edit by ziuno, for reference only.
# EDIT END