NLP中文分词-双向最大匹配算法(正向最大匹配 MM + 逆向最大匹配 RMM)

 

分类: Python/Ruby

2021-10-06 16:18:01

import time

import re

class Segment:
    """Dictionary-based Chinese word segmenter (bidirectional maximum matching).

    The sentence is segmented twice: left-to-right by :meth:`MM` (forward
    maximum matching) and right-to-left by :meth:`RMM` (reverse maximum
    matching).  :meth:`printFinalResult` compares both segmentations and
    stores the chosen one in ``final_res``.

    Result formats (kept from the original implementation):

    * ``result_MM``  -- ``"w1/ w2/ ... wn/ "``, words in sentence order.
    * ``result_RMM`` -- ``"/wn/.../w2/w1/"``, words in reverse order.
    * ``final_res``  -- same format as ``result_MM``.

    NOTE(review): ``"/"`` and ``" "`` are used as delimiters, so a sentence
    that itself contains those characters cannot be represented faithfully.
    """

    # Class-level defaults so every attribute exists even on an instance
    # that was created without running __init__.
    sentence = ""
    MaxLen = 0
    pos = 0
    len = 0
    result_MM = ""  # forward (MM) segmentation result
    result_RMM = ""  # reverse (RMM) segmentation result
    final_res = ""
    # Immutable default: the original shared one mutable list between all
    # instances, so each new Segment appended to everybody's dictionary.
    dict = frozenset()

    def __init__(self, sentence, MaxLen, dictPath="chineseDic.txt"):
        """Store the sentence and window size, then load the dictionary.

        Args:
            sentence: Text to segment.
            MaxLen: Maximum word length (matching window) in characters.
            dictPath: Dictionary file to load; defaults to the file name the
                original implementation hard-coded.
        """
        self.sentence = sentence
        self.MaxLen = MaxLen
        self.pos = 0
        self.len = self.MaxLen
        self.result_MM = ""
        self.result_RMM = ""
        self.final_res = ""
        self.dict = set()  # per-instance vocabulary
        self.readDict(dictPath)

    def readDict(self, path="chineseDic.txt"):
        """Add the first comma-separated field of every line of *path* to the vocabulary.

        Blank entries are skipped and surrounding whitespace (including the
        line terminator) is stripped, so lines without a comma are stored
        without their trailing newline.
        """
        with open(path, "r", encoding="utf-8") as f:
            words = {line.split(",")[0].strip() for line in f}
        words.discard("")
        # Rebuild as a set: O(1) membership tests, and never mutate the
        # shared class-level default in place.
        self.dict = set(self.dict) | words

    def MM(self, nLen, nPos):
        """Forward maximum matching, appended to ``result_MM``.

        Args:
            nLen: Window size used for the first word; every following word
                starts again from ``self.MaxLen``.
            nPos: Index in ``self.sentence`` at which to start.

        At each position the longest dictionary word inside the window is
        taken; if none matches, a single character is emitted.  Implemented
        iteratively so long sentences cannot exhaust the recursion limit.
        """
        sentence = self.sentence
        total = len(sentence)
        vocab = self.dict
        nPos = max(nPos, 0)
        pieces = []
        # `<` (not `<=`): stopping exactly at the end avoids the spurious
        # empty trailing token the original produced.
        while nPos < total:
            # Never look past the end of the sentence; always take >= 1 char.
            width = max(1, min(nLen, total - nPos))
            while width > 1 and sentence[nPos:nPos + width] not in vocab:
                width -= 1
            pieces.append(sentence[nPos:nPos + width] + "/ ")
            nPos += width
            nLen = self.MaxLen
        self.result_MM += "".join(pieces)

    def RMM(self, nLen, nPos):
        """Reverse maximum matching, appended to ``result_RMM``.

        Args:
            nLen: Window size used for the first (right-most) word; every
                following word starts again from ``self.MaxLen``.
            nPos: Exclusive end index to start from, normally
                ``len(self.sentence)``.  A negative value is a no-op.

        Every word, dictionary hit or single-character fallback alike, is
        written as ``"/" + word`` and the run is closed with one ``"/"``.
        """
        if nPos < 0:
            return
        sentence = self.sentence
        vocab = self.dict
        nPos = min(nPos, len(sentence))
        pieces = []
        while nPos > 0:
            # Clamp the window so the slice start never goes negative
            # (a negative start would silently index from the end).
            width = max(1, min(nLen, nPos))
            while width > 1 and sentence[nPos - width:nPos] not in vocab:
                width -= 1
            pieces.append("/" + sentence[nPos - width:nPos])
            nPos -= width
            nLen = self.MaxLen
        self.result_RMM += "".join(pieces) + "/"

    def getMMResult(self):
        """Return the forward-matching result string."""
        return self.result_MM

    def getRMMResult(self):
        """Return the reverse-matching result string (words in reverse order)."""
        return self.result_RMM

    def getFinalResult(self):
        """Return the result chosen by :meth:`printFinalResult` ("" before that call)."""
        return self.final_res

    @staticmethod
    def _chooseSegmentation(forward, backward):
        """Pick between two differing segmentations.

        Heuristic: fewer words wins; on a tie, fewer single-character words
        wins; if still tied, the reverse-matching result is preferred.
        """
        if len(forward) != len(backward):
            return forward if len(forward) < len(backward) else backward
        singles_forward = sum(1 for word in forward if len(word) == 1)
        singles_backward = sum(1 for word in backward if len(word) == 1)
        return forward if singles_forward < singles_backward else backward

    def printFinalResult(self):
        """Print both segmentations, compare them and store the final one.

        The two word lists are compared in full (length and content).  When
        they agree, ``final_res`` is the forward result; when they differ,
        ``final_res`` is rebuilt from the list picked by
        :meth:`_chooseSegmentation`, so it is never left empty.
        """
        print("正向最大匹配结果:")
        seg_res_MM = self.result_MM.replace(" ", "")
        print(seg_res_MM)
        # Splitting on '/' leaves empty strings around the delimiters.
        seg_list_MM = [word for word in seg_res_MM.split('/') if word]
        print(seg_list_MM)

        print("逆向最大匹配结果:")
        seg_res_RMM = self.result_RMM.replace(" ", "")
        print(seg_res_RMM)
        # RMM stores words right-to-left; restore sentence order.
        seg_list_RMM = [word for word in reversed(seg_res_RMM.split('/')) if word]
        print(seg_list_RMM)

        if seg_list_MM == seg_list_RMM:
            print("两次分词结果一致。")
            final = self.result_MM
        else:
            print("两次分词结果不一致。")
            chosen = self._chooseSegmentation(seg_list_MM, seg_list_RMM)
            final = "".join(word + "/ " for word in chosen)

        print("最终的分词结果为:")
        self.final_res = final
        print(self.final_res)

def to_region(segmentation):
    """Convert a segmentation string into 1-based inclusive character intervals.

    Args:
        segmentation: Whitespace-separated tokens, each one a word followed
            by a single ``"/"`` (e.g. ``"中国/ 的/ "``).

    Returns:
        A list of ``(start, end)`` tuples, one per token, where a word of
        ``k`` characters starting at position ``i`` is ``(i, i + k - 1)``.
        Empty or whitespace-only input yields ``[]`` rather than a bogus
        ``(1, -1)`` interval.
    """
    region = []
    start = 1
    # str.split() with no argument splits on runs of whitespace and drops
    # leading/trailing whitespace, so an empty input produces no tokens.
    for word in segmentation.split():
        # len(word) counts the trailing "/": -1 for it, -1 for inclusiveness.
        end = start + len(word) - 2
        region.append((start, end))
        start = end + 1
    return region

def PRF(target, pred):
    """Print and return precision, recall and F1 of *pred* against *target*.

    Both arguments are iterables of hashable items (here: ``(start, end)``
    intervals); duplicates are ignored because the comparison is set-based.

    Returns:
        ``(p, r, f)`` as floats.  Any ratio whose denominator would be zero
        (empty prediction, empty target, or no overlap at all) is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    t_set, p_set = set(target), set(pred)
    cap_num = len(t_set & p_set)

    p = cap_num / len(p_set) if p_set else 0.0
    r = cap_num / len(t_set) if t_set else 0.0
    # p + r is zero exactly when there is no overlap.
    f = 2 * p * r / (p + r) if cap_num else 0.0

    print("P =", p)
    print("R =", r)
    print("F1 =", f)
    return p, r, f

def main():
    """Segment a sample sentence, time it, and score it against a reference.

    Runs forward and reverse maximum matching on the sample text, prints the
    comparison produced by ``Segment.printFinalResult``, then converts the
    final result and the hand-made reference segmentation into character
    intervals and prints precision / recall / F1.
    """
    max_len = 3  # matching window, shared by the constructor and both passes

    test_str = '在这一年中,中国的改革开放和现代化建设继续向前迈进。国民经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。'
    seg = Segment(test_str, max_len)

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    time_start = time.perf_counter()
    seg.MM(max_len, 0)
    seg.RMM(max_len, len(test_str))
    time_end = time.perf_counter()

    seg.printFinalResult()
    print('分词时间:', time_end - time_start, 's')

    target_str = "在/  这/  一/  年/  中/  ,/  中国/  的/  改革/  开放/  和/  现代化/  建设/  继续/  向前/  迈进/  。/  国民经济/  保持/  了/  “/  高/  增长/  、/  低/  通胀/  ”/  的/  良好/  发展/  态势/  。/  农业/  生产/  再次/  获得/  好/  的/  收成/  ,/  企业/  改革/  继续/  深化/  ,/  人民/  生活/  进一步/  改善/  。/  对外/  经济/  技术/  合作/  与/  交流/  不断/  扩大/  。/"

    # Each word is identified by its start/end position in the text, [i, j].
    re_pred = to_region(seg.getFinalResult())
    re_target = to_region(target_str)
    print("分词结果:", re_pred)
    print("标准答案:", re_target)

    PRF(re_target, re_pred)


if __name__ == '__main__':
    main()

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值