文本纠错方法
字典模板
在某些垂直领域,文本的标准化表述范围有限,因此构建一个领域字典是简单且容易见效的方法。比如在医疗、金融等领域,可以建立相关字典文件(每行逗号前为正确表述,逗号后为以空格分隔的常见错误写法),如下:
红细胞, 红细胸 虹细胞 红组胞
也就是整理一份从错误表述到正确表述的映射词典,然后对文本进行匹配替换即可。
**优点**:简单迅速、即插即用、0误判
**缺点**:覆盖度有限、需要大量人力、维护成本高、较为敏感(上游任何一个环节发生改变,词典就可能不再适配)
注:优点中的"0误判"是指,词典方法一旦做出纠正,就一定能纠正对,不存在又纠正错了的情况。
统计模型-编辑距离
英文版
class Candidate(object):
    """Edit-distance spelling corrector for lowercase English words.

    Candidate spellings within one or two single-character edits of the
    input are looked up in a word-frequency table; the most frequent
    known candidate is chosen.

    Attributes:
        WORDS: mapping ``{word: frequency}`` supplied by the caller.
    """

    def __init__(self, WORDS_dict):
        """Store the ``{word: freq}`` table used for lookup and ranking."""
        self.WORDS = WORDS_dict

    def _probability(self, word, total):
        """Return ``freq(word) / total``; 0 for unseen words or an empty table."""
        if not total:
            return 0
        # .get() keeps unseen words at frequency 0 instead of raising
        # KeyError; `candidates` may hand back the unknown input word itself.
        return self.WORDS.get(word, 0) / total

    def P(self, word):
        """Return the relative frequency of `word` in the table.

        Unseen words (and any word when the table is empty) score 0.
        """
        return self._probability(word, sum(self.WORDS.values()))

    def correction(self, word):
        """Return the most probable spelling correction for `word`.

        If no known word lies within two edits, `word` is returned as is.
        """
        # Sum the table once here rather than once per candidate.
        total = sum(self.WORDS.values())
        return max(
            self.candidates(word),
            key=lambda cand: self._probability(cand, total),
        )

    def candidates(self, word):
        """Return the possible corrections for `word`.

        Preference order: the word itself if known, then known words one
        edit away, then known words two edits away, else ``[word]``.
        """
        return (
            self.known([word])
            or self.known(self.edits1(word))
            or self.known(self.edits2(word))
            or [word]
        )

    def known(self, words):
        """Return the subset of `words` present in the frequency table."""
        return set(w for w in words if w in self.WORDS)

    def edits1(self, word):
        """Return the set of strings exactly one edit away from `word`.

        An edit is a deletion, an adjacent transposition, a replacement
        or an insertion of one lowercase ASCII letter.
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        # Every way of cutting `word` into a (left, right) pair.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """Lazily yield every string two edits away from `word`."""
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))
中文版
class Candidate(object):
    """Edit-distance corrector whose edit alphabet is loaded from a file.

    Works like the English version, but replacement and insertion edits
    draw from a character list read from disk (e.g. Chinese characters).

    Attributes:
        WORDS: ``{word: frequency}`` loaded from `word_freq_path`.
        N: sum of all frequencies in `WORDS`.
        letters: list of characters loaded from `char_path`.
    """

    def __init__(self, word_freq_path, char_path):
        """Load the frequency table and the edit alphabet from disk."""
        self.WORDS = self._load_word_freq(word_freq_path)
        self.N = sum(self.WORDS.values())
        self.letters = self._load_letters(char_path)

    def _load_word_freq(self, word_freq_path):
        """Read a UTF-8 file of ``word<TAB>freq`` lines into a dict.

        Blank lines are skipped. A non-blank line without a tab-separated
        second field raises IndexError; a non-integer frequency raises
        ValueError.
        """
        words = {}
        with open(word_freq_path, "r", encoding='utf8') as f:
            for line in f:
                if line.strip():
                    fields = line.split('\t')
                    words[fields[0]] = int(fields[1])
        return words

    def _load_letters(self, char_path):
        """Read a UTF-8 file with one entry per line, skipping blank lines."""
        with open(char_path, "r", encoding='utf8') as f:
            return [line.strip() for line in f if line.strip()]

    def P(self, word):
        """Return the relative frequency of `word`.

        Unseen words (and any word when the table is empty) score 0.
        """
        if not self.N:
            return 0
        # .get() keeps unseen words at 0 instead of raising KeyError;
        # `candidates` may hand back the unknown input word itself.
        return self.WORDS.get(word, 0) / self.N

    def know(self, words):
        """Return the subset of `words` present in the frequency table."""
        return set(w for w in words if w in self.WORDS)

    # Alias matching the method name used by the English version.
    known = know

    def edits1(self, word):
        """Return the set of strings exactly one edit away from `word`.

        An edit is a deletion, an adjacent transposition, a replacement
        or an insertion of one entry of `self.letters`.
        """
        # Every way of cutting `word` into a (left, right) pair.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        # `if R`: with an empty right part a "replacement" would just be an
        # append, which `inserts` already produces.
        replaces = [L + c + R[1:] for L, R in splits if R for c in self.letters]
        inserts = [L + c + R for L, R in splits for c in self.letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """Lazily yield every string two edits away from `word`."""
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

    def candidates(self, word):
        """Return the possible corrections for `word`.

        Preference order: the word itself if known, then known words one
        edit away, then known words two edits away, else ``[word]``.
        """
        return (
            self.know([word])
            or self.know(self.edits1(word))
            or self.know(self.edits2(word))
            or [word]
        )

    def correction(self, word):
        """Return the most probable correction for `word`.

        If no known word lies within two edits, `word` is returned as is.
        """
        return max(self.candidates(word), key=self.P)
也可以通过拼音来进行纠正
char.txt
文件形式
砏
䲢
䜣
䲡
凉
pinyin2word.model
文件形式
'hun,shi': {'荤食': 3, '婚事': 376, '婚史': 9, '混世': 3, '混事': 9}, 'ban': {'扮': 980, '阪': 38, '辬': 2, '湴': 1211, '班': 4674, '绊': 361, '舨': 24, '岅': 2, '坂': 51, '钣': 46, '闆': 299, '半': 8201, '瓣': 466, '颁': 2636, '坢': 39, '攽': 12, '癍': 39, '瘢': 90, '拌': 1194, '版': 4626, '伴': 1567, '辦': 6, '扳': 656, '瓪': 2, '搬': 2869, '般': 3306, '斑': 1027, '板': 5049, '办': 10314}
from pypinyin import *
class WordCorrect:
    """Candidate generator backed by a character list and a pinyin model.

    Attributes:
        char_path: path of the character list file (one entry per line).
        model_path: path of the pinyin-to-words model file.
        charlist: non-blank, stripped lines read from `char_path`.
        pinyin_dict: object parsed from `model_path` by `load_model`.
    """

    def __init__(self, char_path='char.txt', model_path='pinyin2word.model'):
        """Load the character list and the pinyin model.

        Args:
            char_path: UTF-8 text file with one character per line.
            model_path: UTF-8 text file holding a Python literal.
        """
        self.char_path = char_path
        self.model_path = model_path
        with open(self.char_path, 'r', encoding='utf-8') as f:
            self.charlist = [line.strip() for line in f if line.strip()]
        self.pinyin_dict = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Parse the Python literal stored in `model_path` and return it.

        Raises:
            ValueError or SyntaxError: if the file content is not a plain
                Python literal.
        """
        import ast

        with open(model_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # SECURITY: parsed with ast.literal_eval rather than eval so that a
        # tampered model file cannot execute arbitrary code. Only literal
        # structures (dict/list/str/int/...) are accepted.
        return ast.literal_eval(text.strip())

    def edit1(self, word):
        """Return the set of strings exactly one edit away from `word`.

        An edit is a deletion, an adjacent transposition, a replacement
        or an insertion of one entry of `self.charlist`.
        """
        n = len(word)
        deletions = [word[:i] + word[i + 1:] for i in range(n)]
        transpositions = [
            word[:i] + word[i + 1] + word[i] + word[i + 2:]
            for i in range(n - 1)
        ]
        alterations = [
            word[:i] + c + word[i + 1:]
            for i in range(n)
            for c in self.charlist
        ]
        insertions = [
            word[:i] + c + word[i:]
            for i in range(n + 1)
            for c in self.charlist
        ]
        return set(deletions + transpositions + alterations + insertions)
def build_model(dict_path='dict.txt', model_path='pinyin2word.model'):
    """Build the pinyin-to-words model file from a word-frequency list.

    Each non-blank line of `dict_path` is split on single spaces; the first
    field is the word and the second its integer count. Words are grouped by
    the comma-joined result of ``lazy_pinyin(word)``, and the resulting
    ``{pinyin: {word: count}}`` dict is written to `model_path` as its
    ``str()`` representation.

    Args:
        dict_path: UTF-8 input file of ``word count ...`` lines.
        model_path: output file, written as UTF-8.

    Raises:
        IndexError: if a non-blank line has no second field.
        ValueError: if a count is not an integer.
    """
    data = {}
    with open(dict_path, 'r', encoding='utf-8') as src:
        for line in src:
            fields = line.strip().split(' ')
            if not fields[0]:
                # Blank line: nothing to record.
                continue
            word = fields[0]
            word_count = int(fields[1])
            word_pinyin = ','.join(lazy_pinyin(word))
            # Group directly into nested dicts; no string packing, so words
            # containing '_' or ';' are handled correctly.
            data.setdefault(word_pinyin, {})[word] = word_count
    # Written as UTF-8 explicitly so the file can be read back with
    # encoding='utf-8' regardless of the platform default.
    with open(model_path, 'w', encoding='utf-8') as out:
        out.write(str(data))
# Demo: generate one-edit candidates for a sample word and look up the words
# stored in the model under the sample's pinyin key.
word = '我门'
corrector = WordCorrect()

# The two derived values are independent of each other.
candiwords = corrector.edit1(word)
word_pinyin = ','.join(lazy_pinyin(word))

print(candiwords)
print(word_pinyin)
# Prints 'na' when the pinyin key is absent from the model.
print(corrector.pinyin_dict.get(word_pinyin, 'na'))