隐马尔可夫模型分词

hmm_cws.py

from pyhanlp import *
from ch03.msr import msr_dict, msr_train, msr_model, msr_test, msr_output, msr_gold

FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')
HMMSegmenter = JClass('com.hankcs.hanlp.model.hmm.HMMSegmenter')
CWSEvaluator = SafeJClass('com.hankcs.hanlp.seg.common.CWSEvaluator')

def train(corpus,model):
    segmenter = HMMSegmenter(model)
    segmenter.train(corpus)
    # print(segmenter.segment('商品和服务'))
    text_path = "./data.txt"
    big_text = ""
    with open(text_path,encoding='utf-8') as src:
        big_text += "".join(src.readlines())
        print(big_text)
        print(segmenter.segment(big_text))
    return segmenter.toSegment()

def evaluate(segment):
    result = CWSEvaluator.evaluate(segment,msr_test,msr_output,msr_gold,msr_dict)
    print(result)

if __name__ == '__main__':

    segment = train(msr_train, FirstOrderHiddenMarkovModel())
    evaluate(segment)
    segment = train(msr_train, SecondOrderHiddenMarkovModel())
    evaluate(segment)

msr.py

import os

from tests.test_utility import ensure_data, test_data_path

sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')
msr_model = os.path.join(test_data_path(), 'msr_cws')
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt')
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值