Intelligent Error Correction (Optimized Version)

import jieba

from pypinyin import lazy_pinyin
from common.basicInfo import BasicInfo
"""纠错模块"""


class ErrorRecovery(object):

    def __init__(self, dataServer):

        self.dataServer = dataServer
        self.symbol_list = self.dataServer.symbol_list
        self.entity_list = self.dataServer.entity_list

    """ ===============一定要带标点符号==========="""
    def recoveryMain(self, sentence,entity_list=None):

        word_list, word_gram_list = self.gram2Main(sentence)
        user_pin_list = []
        user_chinese_list = []
        for item in word_gram_list:
            user_chinese_list.append(item)
            pin_list = lazy_pinyin(item)  # errors='ignore' could be passed to drop non-Chinese characters
            user_pin_list.append(pin_list)
        mohu_list = []    # fuzzy matches against keywords of 4+ characters
        mohu_list2 = []   # fuzzy matches against 3-character keywords
        
        for k, v in self.dataServer.dict_gupiao_pin.items():
            for index, user_pin in enumerate(user_pin_list):
                if v == user_pin:
                    # Exact pinyin match: return immediately, exact matches have the highest priority
                    if self.common_Chinese(k, index, user_chinese_list) >= 2:
                        accurate_list = [[k, user_pin]]
                        self.sensitiveWord(word_list)
                        return self.restoreQue(None, accurate_list, user_pin_list, word_gram_list, word_list)
                else:
                    # Fuzzy match: do not return here, an exact match elsewhere takes priority
                    if len(v) >= 4 and self.unionlen(user_pin, v) >= 3:
                        # Keywords of 4+ characters outrank 3-character keywords
                        if self.common_Chinese(k, index, user_chinese_list) >= 2:
                            mohu_list.append([k, user_pin])
                    elif len(v) == 3 and self.unionlen(user_pin, v) >= 2:
                        if self.common_Chinese(k, index, user_chinese_list) >= 1:
                            mohu_list2.append([k, user_pin])

        if len(mohu_list) > 0:
            self.sensitiveWord(word_list)
            return self.restoreQue(None, mohu_list, user_pin_list, word_gram_list, word_list)
        elif len(mohu_list2) > 0:
            self.sensitiveWord(word_list)
            return self.restoreQue(None, mohu_list2, user_pin_list, word_gram_list, word_list)

    def common_Chinese(self, k, index, user_chinese_list):
        # Count how many characters of the dictionary keyword appear in the user's 2-gram
        user_word = user_chinese_list[index]
        original_word = BasicInfo.get_value(self.dataServer.new_gupiao, k)
        ret = [i for i in original_word if i in user_word]
        return len(ret)

    def sensitiveWord(self, word_list):
        # Restore the placeholder that gram2Main substituted for the keyword '好股'
        if 'englishA' in word_list:
            word_list[word_list.index('englishA')] = "好股"

    # After locating the error, rebuild the question around the corrected keyword
    def restoreQue(self, id=None, accu_list=None, user_pin_list=None, word_gram_list=None, word_list=None):
        # id selects which candidate match to use; default to the first one
        y = 0 if id is None else id
        k = accu_list[y][0]
        user_pin = accu_list[y][1]
        index = user_pin_list.index(user_pin)
        shortence = word_gram_list[index]
        seg_list = jieba.cut(shortence, cut_all=False, HMM=True)
        short_list = [e for e in seg_list]
        ret = [k]
        if short_list[0] in word_list:
            # The 2-gram spans two consecutive tokens: replace the first token with
            # the entity placeholder and drop the second
            word_list[word_list.index(short_list[0])] = self.entity_list[0]
            del word_list[word_list.index(self.entity_list[0]) + 1]
        else:
            print("========[log]errorRecovery.py====correction error===========")
        # Drop punctuation and unicode-escape debris; iterate over a copy because
        # removing items from a list while iterating over it skips elements
        for word in word_list[:]:
            if any(symbol in word for symbol in self.symbol_list) or '\\ue' in word:
                word_list.remove(word)
        ret.append(word_list)
        return ret

    def gram2Main(self, sentence):
        sentence = ''.join(sentence.split())
        seg_list = jieba.cut(sentence, cut_all=False, HMM=True)
        word_list = [e for e in seg_list]
        if '好股' in word_list:
            # Temporary placeholder; sensitiveWord() restores the original keyword later
            word_list[word_list.index('好股')] = "englishA"
        # print("correction=====" + str(word_list))
        return self.test2gram(word_list)

    def test2gram(self, list2=None):
        # Build overlapping 2-grams from adjacent tokens
        word_gram_list = []
        for i in range(len(list2) - 1):
            word_gram_list.append(list2[i] + list2[i + 1])
        return list2, word_gram_list

    def test3gram(self, list2=None):
        # TODO: use 3-grams as an optimization: try an exact 3-gram match first and,
        # only if nothing matches, fall back to fuzzy 2-gram matching (see the sketch
        # after the class definition)
        word_gram3_list = []
        for i in range(len(list2) - 2):
            word_gram3_list.append(list2[i] + list2[i + 1] + list2[i + 2])
        return list2, word_gram3_list

    #"""**该方法被遗弃"""
    # def findok(self, v, user_pin):
    #     if len(v) >= 4 and self.unionlen(user_pin, v) >= 3:
    #         return True
    #     elif len(v) == 3 and self.unionlen(user_pin, v) >= 2:
    #         return True
    #     return False

    """**blist 为字典,len(blist)为每项字典长度列表(该方法被遗弃)"""
    def unionlen2(self, alist, blist):
        ret = []
        if len(alist) <= len(blist):
            for i in range(len(alist)):
                if alist[i] in blist:
                    ret.append(alist)
            return len(ret)

        else:
            for i in range(len(blist)):
                if alist[i] in blist:
                    ret.append(alist)
            return len(ret)

    def unionlen(self, alist, blist):
        # Count the positions at which the two pinyin lists agree
        length = 0
        for i in range(min(len(alist), len(blist))):
            if alist[i] == blist[i]:
                length += 1
        return length
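
# --- Illustration (not part of the original class) ----------------------------
# The TODO on test3gram describes trying an exact 3-gram match first and only
# falling back to fuzzy 2-gram matching when nothing is found; recoveryMain
# currently matches 2-grams only.  The standalone helper below is a minimal
# hypothetical sketch of that lookup order.  It assumes dict_gupiao_pin maps a
# keyword to its pinyin list (the shape lazy_pinyin produces); the function
# name and the toy dictionary in the usage note are invented for illustration.
def match_with_3gram_fallback(tokens, dict_gupiao_pin, min_fuzzy=2):
    grams3 = [tokens[i] + tokens[i + 1] + tokens[i + 2] for i in range(len(tokens) - 2)]
    grams2 = [tokens[i] + tokens[i + 1] for i in range(len(tokens) - 1)]
    # Pass 1: exact pinyin match on 3-grams (highest priority)
    for gram in grams3:
        pin = lazy_pinyin(gram)
        for key, key_pin in dict_gupiao_pin.items():
            if pin == key_pin:
                return key, gram
    # Pass 2: positional fuzzy match on 2-grams (same idea as unionlen)
    for gram in grams2:
        pin = lazy_pinyin(gram)
        for key, key_pin in dict_gupiao_pin.items():
            same = sum(1 for a, b in zip(pin, key_pin) if a == b)
            if same >= min_fuzzy:
                return key, gram
    return None, None
# Usage with a toy dictionary:
#   toy_dict = {'阿里巴巴': lazy_pinyin('阿里巴巴')}
#   match_with_3gram_fallback(['啊里', '粑粑', '怎么样'], toy_dict)  # -> ('阿里巴巴', '啊里粑粑')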


# if __name__ == '__main__':
#     dataServer = DataServer()
#     erRecovery = ErrorRecovery(dataServer)
#     entity_list=['gegu','bankuai']
#     words = erRecovery.recoveryMain("有什么好股可?")  # other mistyped test queries: 大脸有一,这只股票怎么样?  浙江现xian,怎么,样?  航茶集怎么样呢  爱第二怎么样
#     print(words)
#     # words = erRecovery.recoveryMain("大脸有一,这只股票,怎么样?")
#     # print(words)
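
# A quick self-contained demo of the first stage of recoveryMain: collapse
# whitespace, segment with jieba, build 2-grams, and convert each 2-gram to
# pinyin.  Only jieba and pypinyin are needed; no DataServer is involved, and
# the sentence below is one of the mistyped test queries listed above.
if __name__ == '__main__':
    sentence = ''.join("浙江现xian,怎么样?".split())
    tokens = [w for w in jieba.cut(sentence, cut_all=False, HMM=True)]
    grams = [tokens[i] + tokens[i + 1] for i in range(len(tokens) - 1)]
    for gram in grams:
        print(gram, lazy_pinyin(gram))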


