文本纠错方法

文本纠错方法

字典模板

在某些垂直领域,文本的标准化表述范围有限,因此构建一个领域字典是比较有效且容易见效的方法。比如在医疗,金融等领域,可以建立相关字典文件,如下:

红细胞, 红细胸 虹细胞 红组胞

总结一个错误表述到正确表述的词典,然后进行匹配替换就行。

**优点:** 简单迅速、即插即用、0 误判
**缺点:** 覆盖度有限、需要大量人力、维护成本高、较为敏感(上游任何一个环节发生改变,词典可能就不再适配)

一旦纠正了,就一定能纠正对,不存在又纠正错了的情况。

统计模型-编辑距离

英文版

class Candidate(object):
    """Norvig-style spelling corrector for English words.

    WORDS_dict maps word -> raw frequency count; corrections are ranked
    by unigram probability (count / total count).
    """

    def __init__(self, WORDS_dict):
        #  WORDS_dict={word:freq}
        self.WORDS = WORDS_dict
        # Hoisted out of P(): the original summed all frequencies on every
        # probability lookup, i.e. O(len(WORDS)) per candidate word.
        self.N = sum(self.WORDS.values())

    def P(self, word):
        "Probability of `word` under the unigram frequency model (0 if unknown)."
        # .get(word, 0) fixes a KeyError: candidates() can fall back to the
        # original (unknown) word, which the old WORDS[word] lookup rejected.
        return self.WORDS.get(word, 0) / self.N

    def correction(self, word):
        "Most probable spelling correction for `word`."
        return max(self.candidates(word), key=self.P)

    def candidates(self, word):
        """Generate possible spelling corrections for `word`.

        Preference order: the word itself if known, then known words one
        edit away, then two edits away, finally the word unchanged.
        """
        return (self.known([word])
                or self.known(self.edits1(word))
                or self.known(self.edits2(word))
                or [word])

    def known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.WORDS)

    def edits1(self, word):
        "All edits that are one edit away from `word`."
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

中文版

class Candidate(object):
    """Chinese spelling corrector.

    Same structure as the English version, but the edit "alphabet" is a
    character inventory loaded from a file instead of 'a'-'z'.
    """

    def __init__(self, word_freq_path, char_path):
        # word_freq_path: one "word<TAB>count" pair per line.
        # char_path: one candidate character per line.
        self.WORDS = self._load_word_freq(word_freq_path)
        self.N = sum(self.WORDS.values())
        self.letters = self._load_letters(char_path)

    def _load_word_freq(self, word_freq_path):
        """Load the {word: frequency} dictionary.

        Bug fix: the original built WORDS but never returned it (so
        __init__ crashed on sum(None.values())) and leaked the file handle.
        """
        WORDS = {}
        with open(word_freq_path, "r", encoding='utf8') as f:
            for line in f:
                if line.strip():
                    word_freq = line.split('\t')
                    WORDS[word_freq[0]] = int(word_freq[1])
        return WORDS

    def _load_letters(self, char_path):
        "Load the candidate-character list, one character per line."
        with open(char_path, "r", encoding='utf8') as f:
            return [line.strip() for line in f if line.strip()]

    def P(self, word):
        "Probability of `word` (0 for unknown words, instead of KeyError)."
        return self.WORDS.get(word, 0) / self.N

    def know(self, words):
        "The subset of `words` present in the dictionary."
        return set(w for w in words if w in self.WORDS)

    def edits1(self, word):
        "All edits one edit away from `word`, using self.letters as alphabet."
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]      # split points
        deletes = [L + R[1:] for L, R in splits if R]                      # deletion
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]  # transposition
        # `if R` guard added for consistency with the English version; the
        # unguarded form only produced duplicates of inserts (deduped by set).
        replaces = [L + c + R[1:] for L, R in splits if R for c in self.letters]  # replacement
        inserts = [L + c + R for L, R in splits for c in self.letters]     # insertion
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        "All edits two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

    def candidates(self, word):
        "Known word itself, else known 1-edit words, else 2-edit, else as-is."
        return (self.know([word]) or self.know(self.edits1(word)) or self.know(self.edits2(word)) or [word])

    def correction(self, word):
        "Most probable correction for `word`."
        return max(self.candidates(word), key=self.P)

也可以通过拼音来进行纠正
char.txt文件形式

砏
䲢
䜣
䲡
凉

pinyin2word.model文件形式

'hun,shi': {'荤食': 3, '婚事': 376, '婚史': 9, '混世': 3, '混事': 9}, 'ban': {'扮': 980, '阪': 38, '辬': 2, '湴': 1211, '班': 4674, '绊': 361, '舨': 24, '岅': 2, '坂': 51, '钣': 46, '闆': 299, '半': 8201, '瓣': 466, '颁': 2636, '坢': 39, '攽': 12, '癍': 39, '瘢': 90, '拌': 1194, '版': 4626, '伴': 1567, '辦': 6, '扳': 656, '瓪': 2, '搬': 2869, '般': 3306, '斑': 1027, '板': 5049, '办': 10314}
from pypinyin import *

class WordCorrect:
    """Pinyin-based corrector: maps a word's pinyin key to the candidate
    words (with frequencies) sharing that pronunciation.
    """

    def __init__(self, char_path='char.txt', model_path='pinyin2word.model'):
        # Paths are now parameters; the original hard-coded values are kept
        # as defaults, so existing WordCorrect() callers are unaffected.
        self.char_path = char_path
        self.model_path = model_path
        # `with` fixes a leaked file handle from the bare open() in a listcomp.
        with open(self.char_path, 'r', encoding='utf-8') as f:
            self.charlist = [line.strip() for line in f if line.strip()]
        self.pinyin_dict = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Parse the model file: a Python dict literal {pinyin: {word: count}}.

        ast.literal_eval replaces eval(): it accepts the same literal syntax
        but cannot execute arbitrary code from an untrusted model file.
        """
        import ast  # local import: keeps the file's top-level imports untouched
        with open(model_path, 'r', encoding='utf-8') as f:
            return ast.literal_eval(f.read())

    def edit1(self, word):
        "All strings one edit away from `word`, using charlist as alphabet."
        n = len(word)
        return set([word[0:i]+word[i+1:] for i in range(n)] +                     # deletion
                   [word[0:i]+word[i+1]+word[i]+word[i+2:] for i in range(n-1)] + # transposition
                   [word[0:i]+c+word[i+1:] for i in range(n) for c in self.charlist] + # alteration
                   [word[0:i]+c+word[i:] for i in range(n+1) for c in self.charlist])  # insertion

def build_model(dict_path='dict.txt', model_path='pinyin2word.model'):
    """Build the pinyin -> {word: count} model file.

    Reads `dict_path` (one "word count" pair per line, space-separated),
    groups words by their comma-joined lazy pinyin, and writes the result
    to `model_path` as a Python dict literal (readable via literal_eval).
    Paths default to the original hard-coded names, so build_model()
    still works unchanged.
    """
    # Phase 1: accumulate "word_count" tokens per pinyin key.
    # `with` / `enumerate` replace the leaked handle and manual counter.
    word_dict = {}
    with open(dict_path) as f:
        for count, line in enumerate(f, 1):
            print(count)  # original progress output, kept as-is
            parts = line.strip().split(' ')
            word = parts[0]
            word_count = parts[1]
            word_pinyin = ','.join(lazy_pinyin(word))
            entry = word + '_' + word_count
            if word_pinyin not in word_dict:
                word_dict[word_pinyin] = entry
            else:
                word_dict[word_pinyin] += ';' + entry

    # Phase 2: explode the joined "word_count;word_count" strings into
    # {word: int(count)} maps per pinyin.
    data = {}
    for pinyin, words in word_dict.items():
        tmp = {}
        for token in words.split(';'):
            # rsplit guards against words that themselves contain '_'.
            word_word, word_count = token.rsplit('_', 1)
            tmp[word_word] = int(word_count)
        data[pinyin] = tmp

    with open(model_path, 'w') as f:
        f.write(str(data))

# Demo: look up corrections for '我门' (a typo of 我们, "we") by pinyin.
# NOTE(review): runs at import time and requires char.txt /
# pinyin2word.model to exist in the working directory.
corrector = WordCorrect()
word = '我门'
# Pinyin key for the dictionary lookup, e.g. 'wo,men'.
word_pinyin = ','.join(lazy_pinyin(word))
# All single-edit variants of the word (grows with the size of charlist).
candiwords = corrector.edit1(word)
print(candiwords)
print(word_pinyin)
# Candidate words sharing this pinyin, or 'na' if the pinyin is unseen.
print(corrector.pinyin_dict.get(word_pinyin, 'na'))

深度学习模型

参考

QueryCorrection

基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 基于的BERT的文本纠错模型python源码+项目说明+数据集+详细注释.zip 【资源说明】 1、该资源内项目代码都是经过测试运行成功,功能正常的情况下才上传的,请放心下载使用。 2、适用人群:主要针对计算机相关专业(如计科、信息安全、数据科学与大数据技术、人工智能、通信、物联网、数学、电子信息等)的同学或企业员工下载使用,具有较高的学习借鉴价值。 3、不仅适合小白学习实战练习,也可作为大作业、课程设计、毕设项目、初期项目立项演示等,欢迎下载,互相学习,共同进步!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

发呆的比目鱼

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值