Conditional Random Field (CRF) Word Segmentation Model: A Python Implementation

I won't walk through the full derivation here; decoding uses dynamic programming (a Viterbi-style search over the B/M/E/S tag sequence). A proper write-up will come when I have time, so for now I'm posting the code directly.
Some parts of the code can still be optimized. If anything looks unreasonable, please point it out.
This is for reference only; please don't copy it verbatim. Let's learn and improve together.
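
Note on the training data: the code reads ./data/msr_training.txt and, judging from how it parses each line, expects one character per line in the form char|tag, where the tag is one of B/M/E/S. That format is my assumption inferred from the parsing code, not the layout of the original MSR corpus release. Below is a minimal sketch for producing such a file from whitespace-segmented text (function and file names are hypothetical):

# Hypothetical helper: convert whitespace-segmented text such as "中国  经济  发展"
# into one "char|tag" line per character, where the tag is one of
# B (word begin), M (word middle), E (word end), S (single-character word).
def word_to_tags(word):
    if len(word) == 1:
        return ['S']
    return ['B'] + ['M'] * (len(word) - 2) + ['E']

def convert(segmented_path, output_path):
    with open(segmented_path, 'r', encoding='UTF-8') as fin, \
         open(output_path, 'w', encoding='UTF-8') as fout:
        for line in fin:
            for word in line.split():
                for ch, tag in zip(word, word_to_tags(word)):
                    fout.write(ch + '|' + tag + '\n')

# Example (hypothetical file names):
# convert("./data/msr_training_segmented.txt", "./data/msr_training.txt")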


import copy

class Model:
    # Note: the implementation assumes the input sentence contains no repeated characters,
    # since per-character statistics are keyed by the character itself.
    def __init__(self, sentence):
        self.sentence = list(sentence)
        self.state = ['B', 'M', 'E', 'S']
        self.count_one = [0 for x in range(len(self.sentence))]  # occurrence count of each character in the corpus
        self.fp = open("./data/msr_training.txt", 'r', encoding="UTF-8")
        self.word_P = [[0, 0, 0, 0] for x in range(len(self.sentence))]  # per-character counts/probabilities of the four states
        self.word_P_fuben = None  # raw (unnormalized) copy of word_P
        self.Dist_1 = None  # previous/next-character context statistics for every character
        self.Dist_2 = None  # state-transition statistics for every character

    # Count how often each character of the sentence appears in each of the four states,
    # then normalize the counts into probabilities.
    def statisticsP(self):
        self.fp.seek(0)
        lines = self.fp.readlines()
        # per-character state counts
        for line in lines:
            line = line.strip().split("|")
            if len(line) < 2: continue
            if line[0] in self.sentence:
                if line[1] in self.state:
                    x1 = self.sentence.index(line[0])
                    x2 = self.state.index(line[1])
                    self.word_P[x1][x2] += 1
                    self.count_one[x1] += 1
        self.word_P_fuben = copy.deepcopy(self.word_P)
        # normalize; the +0.1 avoids division by zero for unseen characters
        for x in self.word_P:
            temp = sum(x)
            for xx in range(len(x)):
                x[xx] = x[xx] / (temp + 0.1)

    # For every character of the sentence, count how often its left/right neighbors in the
    # sentence also appear as its left/right corpus neighbors with the same tag, per state.
    def statisticsW(self):
        word_W = [[[0, 0] for x in range(4)] for y in range(len(self.sentence))]
        self.Dist_1 = dict(zip(self.sentence, word_W))
        self.fp.seek(0)
        w1 = None
        w2 = None
        w3 = None
        lines = self.fp.readlines()
        # previous/next-character context counts
        for j in range(len(lines) - 1):
            line = lines[j].strip().split("|")
            if len(line) < 2: continue
            if line[0] in self.sentence and line[1] in self.state:
                x2 = self.state.index(line[1])
                i = self.sentence.index(line[0])
                if i == 0:
                    w1 = None
                    w2 = self.sentence[i]
                    w3 = self.sentence[i + 1]
                elif i == len(self.sentence) - 1:
                    w1 = self.sentence[i - 1]
                    w2 = self.sentence[i]
                    w3 = None
                else:
                    w1 = self.sentence[i - 1]
                    w2 = self.sentence[i]
                    w3 = self.sentence[i + 1]

                # look at the previous corpus character
                if j > 0:
                    line_front = lines[j - 1].strip().split("|")
                    if len(line_front) >= 2 and line_front[0] == w1 and line_front[1] == self.state[x2]:
                        self.Dist_1[w2][x2][0] += 1
                # look at the next corpus character
                line_next = lines[j + 1].strip().split("|")
                if len(line_next) >= 2 and line_next[0] == w3 and line_next[1] == self.state[x2]:
                    self.Dist_1[w2][x2][1] += 1
        # normalize; the +0.1 avoids division by zero for unseen characters
        k = 0
        for x in self.Dist_1:
            for i in range(4):
                for j in range(2):
                    self.Dist_1[x][i][j] = self.Dist_1[x][i][j] / (self.count_one[k] + 0.1)
            k += 1


    # For every character of the sentence, count the state-transition frequencies
    # (current state -> next state) observed in the corpus.
    def statisticsneststate(self):
        self.fp.seek(0)
        word_neststate = [[[0, 0, 0, 0] for x in range(4)] for y in range(len(self.sentence))]
        self.Dist_2 = dict(zip(self.sentence, word_neststate))
        lines = self.fp.readlines()
        for j in range(len(lines) - 1):
            line = lines[j].strip().split('|')
            if len(line) < 2: continue
            if line[0] in self.sentence and line[1] in self.state:
                line_next = lines[j + 1].strip().split('|')
                x1 = self.state.index(line[1])
                if len(line_next) >= 2 and line_next[1] in self.state:
                    x2 = self.state.index(line_next[1])
                    self.Dist_2[line[0]][x1][x2] += 1
        # normalize; the +0.1 avoids division by zero for unseen characters
        k = 0
        for x in self.Dist_2:
            for i in range(4):
                for j in range(4):
                    self.Dist_2[x][i][j] = self.Dist_2[x][i][j] / (self.count_one[k] + 0.1)
            k += 1
    def Cutsentence(self):
        print("Training...")
        self.statisticsP()
        self.statisticsW()
        self.statisticsneststate()
        self.fp.close()
        # Viterbi-style dynamic programming over the four states
        word_matrix = [[0 for i in range(0, 4)] for j in range(len(self.sentence))]
        word_path = []
        # initialize the scores of the first character
        for j in range(4):
            word_matrix[0][j] = self.word_P[0][j] + sum(self.Dist_1[self.sentence[0]][j])
        word_path.append([0, 0, 0, 0])
        # recursion: for each position and state, keep the score and index of the best previous state
        for i in range(1, len(self.sentence)):
            Stemp_2 = []
            for j in range(4):
                Stemp_1 = []
                for z in range(4):
                    M1 = self.Dist_2[self.sentence[i - 1]][z][j] * word_matrix[i - 1][z] \
                         + sum(self.Dist_1[self.sentence[i]][j]) + self.word_P[i][j]
                    Stemp_1.append(M1)

                word_matrix[i][j] = max(Stemp_1)
                Stemp_2.append(Stemp_1.index(max(Stemp_1)))
            word_path.append(Stemp_2)
        # backtrack from the best final state
        max_index = word_matrix[len(self.sentence) - 1].index(max(word_matrix[len(self.sentence) - 1]))
        res_path = []
        for i in range(len(word_path) - 1, -1, -1):
            res_path.append(self.state[max_index])
            max_index = word_path[i][max_index]

        res_path.reverse()
        print("Segmentation result:")
        for i in range(len(res_path)):
            print(self.sentence[i], "/", res_path[i], end=" ")
# Usage (assuming the class above is saved as Cutwordmodel.py)
import Cutwordmodel
model = Cutwordmodel.Model("中国的经济发展较快速")
model.Cutsentence()
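
Cutsentence only prints each character with its predicted tag. To turn a B/M/E/S tag sequence back into actual words, a small helper like the sketch below can be used; it is my own addition rather than part of the original model (the function name is made up), and to wire it in, Cutsentence would need to return res_path instead of only printing it.

# Hypothetical helper: join characters into words according to their B/M/E/S tags.
# A word ends at an 'E' or 'S' tag.
def tags_to_words(chars, tags):
    words = []
    buffer = ""
    for ch, tag in zip(chars, tags):
        buffer += ch
        if tag in ('E', 'S'):
            words.append(buffer)
            buffer = ""
    if buffer:  # flush any trailing characters of an unfinished word
        words.append(buffer)
    return words

# Example:
# tags_to_words(list("中国的经济"), ['B', 'E', 'S', 'B', 'E'])  ->  ['中国', '的', '经济']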

