Part-of-Speech Tagging

Part-of-Speech Tagging Based on Hidden Markov Models

demo_hmm_pos.py

from pyhanlp import *
from ch07.pku import PKU199801_TRAIN

HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger')  # POS tagger
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')  # lexical analyzer
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')
def train_hmm_pos(corpus, model):
    tagger = HMMPOSTagger(model)  # create a POS tagger backed by the given HMM
    tagger.train(corpus)  # train
    print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict on a pre-segmented sentence
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)  # construct a lexical analyzer
    print(analyzer.analyze("李狗蛋的希望是上学").translateLabels())  # segmentation + POS tagging
    return tagger

if __name__ == '__main__':
    train_hmm_pos(PKU199801_TRAIN, FirstOrderHiddenMarkovModel())
    train_hmm_pos(PKU199801_TRAIN, SecondOrderHiddenMarkovModel())

pku.py

import os

from tests.test_utility import ensure_data

PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')
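
pku.py downloads the PKU98 corpus on first use via ensure_data and exposes the train/test splits along with output paths for the POS and NER models. If you want to peek at the annotation format before training, a minimal inspection sketch (assuming UTF-8 text with whitespace-separated 词语/pos tokens, one sentence per line; adjust if your copy differs) is:

from ch07.pku import PKU199801_TRAIN

with open(PKU199801_TRAIN, encoding='utf-8') as f:
    for i, line in enumerate(f):
        print(line.strip())  # tokens look like 词语/pos, separated by spaces
        if i == 2:  # the first few sentences are enough
            break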

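The demo above only eyeballs two sentences. For a rough tag-level accuracy on the held-out split, a sketch along the following lines can be used. This is not HanLP's built-in evaluation; it assumes plain 词语/pos tokens (bracketed compound annotations, if present, are not handled specially) and reuses the varargs tagger.tag call from demo_hmm_pos.py:

from pyhanlp import *
from ch07.pku import PKU199801_TRAIN, PKU199801_TEST

HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger')
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')


def load_pos_corpus(path):
    """Yield (words, gold_tags) per sentence using naive word/tag splitting."""
    with open(path, encoding='utf-8') as f:
        for line in f:
            words, tags = [], []
            for token in line.split():
                if '/' not in token:
                    continue  # skip anything that does not look like 词语/pos
                word, tag = token.rsplit('/', 1)
                words.append(word)
                tags.append(tag)
            if words:
                yield words, tags


def tag_accuracy(tagger, path):
    correct = total = 0
    for words, gold in load_pos_corpus(path):
        pred = tagger.tag(*words)  # same varargs call as in the demo
        correct += sum(str(p) == g for p, g in zip(pred, gold))
        total += len(gold)
    return correct / total


if __name__ == '__main__':
    tagger = HMMPOSTagger(FirstOrderHiddenMarkovModel())
    tagger.train(PKU199801_TRAIN)
    print('tag accuracy: %.2f%%' % (tag_accuracy(tagger, PKU199801_TEST) * 100))

Swapping in SecondOrderHiddenMarkovModel() makes it easy to check whether the extra context actually helps on this corpus.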

Part-of-Speech Tagging Based on the Perceptron

demo_perceptron_pos.py

from pyhanlp import *
from ch07.pku import PKU199801_TRAIN, POS_MODEL

POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')  # lexical analyzer
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')

def train_perceptron_pos(corpus):
    trainer = POSTrainer()
    trainer.train(corpus, POS_MODEL)  # train and save the model
    tagger = PerceptronPOSTagger(POS_MODEL)  # load the saved model
    print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict on a pre-segmented sentence
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)  # construct a lexical analyzer
    print(analyzer.analyze("李狗蛋的希望是希望上学"))  # segmentation + POS tagging
    return tagger


if __name__ == '__main__':
    train_perceptron_pos(PKU199801_TRAIN)
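
Because POSTrainer.train persists the model to POS_MODEL, a later session can reload the tagger without retraining; the tag_accuracy helper sketched in the HMM section can then be reused unchanged to compare the two models. A minimal reload sketch (assuming POS_MODEL already exists on disk):

from pyhanlp import *
from ch07.pku import POS_MODEL

PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')

tagger = PerceptronPOSTagger(POS_MODEL)  # reload the model saved by train_perceptron_pos
print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))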

Part-of-Speech Tagging Based on Conditional Random Fields

demo_crf_pos.py

from pyhanlp import *
from ch07.pku import PKU199801_TRAIN, POS_MODEL

CRFPOSTagger = JClass('com.hankcs.hanlp.model.crf.CRFPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')  # lexical analyzer
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')

def train_crf_pos(corpus):
    tagger = CRFPOSTagger(None)  # create a blank tagger
    tagger.train(corpus, POS_MODEL)  # train and save the model
    tagger = CRFPOSTagger(POS_MODEL)  # load the saved model
    print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict on a pre-segmented sentence
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)  # construct a lexical analyzer
    print(analyzer.analyze("李狗蛋的希望是希望上学"))  # segmentation + POS tagging
    return tagger

if __name__ == '__main__':
    train_crf_pos(PKU199801_TRAIN)
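
As in the HMM demo, translateLabels() can be chained onto the analysis result to print human-readable tag names instead of the raw PKU codes. Note that demo_perceptron_pos.py and demo_crf_pos.py both write to the same POS_MODEL path, so the sketch below assumes the CRF demo ran most recently:

from pyhanlp import *
from ch07.pku import POS_MODEL

CRFPOSTagger = JClass('com.hankcs.hanlp.model.crf.CRFPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')

analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), CRFPOSTagger(POS_MODEL))
print(analyzer.analyze("李狗蛋的希望是希望上学").translateLabels())  # readable tag names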