# ---- file: hmm_cws.py ----
from pyhanlp import *
from ch03.msr import msr_dict, msr_train, msr_model, msr_test, msr_output, msr_gold
# Handles to the HanLP Java classes used by this script, each looked up by its
# fully-qualified Java class name.
# JClass and SafeJClass are not defined in this file; presumably they are
# provided by the `from pyhanlp import *` star import -- TODO confirm.
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')
# Wraps a model instance; see train() for how it is trained and applied.
HMMSegmenter = JClass('com.hankcs.hanlp.model.hmm.HMMSegmenter')
# NOTE(review): this is the only lookup that goes through SafeJClass instead of
# JClass -- confirm the difference is intentional.
CWSEvaluator = SafeJClass('com.hankcs.hanlp.seg.common.CWSEvaluator')
def train(corpus, model, text_path="./data.txt"):
    """Train an HMM segmenter on ``corpus`` and return its segment object.

    After training, the text stored in ``text_path`` is segmented as a quick
    demonstration: the raw text and then its segmentation are printed.

    Args:
        corpus: Training corpus, passed unchanged to ``HMMSegmenter.train``
            (the script entry point passes ``msr_train``).
        model: Model instance wrapped by ``HMMSegmenter`` (the script entry
            point passes a first- or second-order HMM).
        text_path: UTF-8 text file to segment once training is done.
            Defaults to ``"./data.txt"``, the path that was previously
            hard-coded. Pass ``None`` to skip the demonstration entirely.

    Returns:
        The result of ``segmenter.toSegment()``.

    Raises:
        OSError: If ``text_path`` is given but cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Load the demo text *before* training so that a missing or unreadable
    # file fails immediately instead of after the training work is done.
    demo_text = None
    if text_path is not None:
        with open(text_path, encoding='utf-8') as src:
            demo_text = src.read()

    segmenter = HMMSegmenter(model)
    segmenter.train(corpus)

    if demo_text is not None:
        print(demo_text)
        print(segmenter.segment(demo_text))
    return segmenter.toSegment()
def evaluate(segment):
    """Score ``segment`` with ``CWSEvaluator`` on the MSR files and print it.

    The evaluator is called with ``segment`` followed by the module-level
    ``msr_test``, ``msr_output``, ``msr_gold`` and ``msr_dict`` paths, in
    that order. Whatever it returns is printed; nothing is returned.
    """
    print(CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict))
if __name__ == '__main__':
    # Train and evaluate one segmenter per HMM order, first-order first.
    # Each model is instantiated right before its own training run.
    for model_class in (FirstOrderHiddenMarkovModel, SecondOrderHiddenMarkovModel):
        segment = train(msr_train, model_class())
        evaluate(segment)
# ---- file: msr.py ----
import os
from tests.test_utility import ensure_data, test_data_path
# Root of the SIGHAN 2005 bakeoff data set (icwb2-data), as returned by
# ensure_data for the given name and archive URL.
# NOTE(review): ensure_data presumably downloads/unpacks the archive when it
# is not present locally -- confirm against tests.test_utility.
sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
# Paths of the MSR corpus files inside the icwb2-data tree.
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')
# Unlike the other paths, this one lives under test_data_path(), not sighan05.
msr_model = os.path.join(test_data_path(), 'msr_cws')
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
# NOTE(review): the file name says "bigram", yet hmm_cws.py hands this same
# path to CWSEvaluator.evaluate for its HMM runs -- verify that sharing the
# file is intended.
msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt')
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')