基于隐马尔可夫模型的词性标注
demo_hmm_pos.py
from pyhanlp import *
from ch07.pku import PKU199801_TRAIN
# HanLP Java classes, each looked up through JClass by its fully qualified name.
HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger') # part-of-speech tagger
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer') # lexical analyzer
# Passed as the first argument when the lexical analyzer is constructed below.
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
# The two HMM variants instantiated by the script entry point.
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')
def train_hmm_pos(corpus, model):
    """Train an HMM part-of-speech tagger on *corpus* and print two demos.

    Args:
        corpus: Training corpus handed to ``tagger.train``.
        model: Hidden Markov model instance the tagger is built around
            (e.g. ``FirstOrderHiddenMarkovModel()`` or
            ``SecondOrderHiddenMarkovModel()``). ``None`` falls back to the
            tagger's no-argument constructor.

    Returns:
        The trained ``HMMPOSTagger``.
    """
    # The model must be forwarded to the tagger; building the tagger without
    # it made every call train the same default model regardless of `model`.
    tagger = HMMPOSTagger(model) if model is not None else HMMPOSTagger()
    tagger.train(corpus)  # train
    # Tag a pre-segmented sentence in which "希望" occurs twice.
    print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict
    # Combine a segmenter with the tagger into a lexical analyzer.
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)
    # Same probe sentence as the perceptron and CRF demos, so that the
    # outputs of the three taggers can be compared directly.
    print(analyzer.analyze("李狗蛋的希望是希望上学").translateLabels())  # segment + tag
    return tagger
if __name__ == '__main__':
    # Run the demo twice, handing a freshly constructed model of each
    # HMM class to the training function in turn.
    for hmm_class in (FirstOrderHiddenMarkovModel, SecondOrderHiddenMarkovModel):
        train_hmm_pos(PKU199801_TRAIN, hmm_class())
pku.py
import os
from tests.test_utility import ensure_data
# Location of the PKU98 corpus as returned by the project helper ensure_data.
# NOTE(review): presumably this downloads/unpacks the archive on first use and
# returns the local directory — confirm in tests.test_utility.
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
# Every path below is built inside the PKU98 location.
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
# Model file paths; POS_MODEL is the one the POS-tagging demos train to and load from.
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')
基于感知机的词性标注
demo_perceptron_pos.py
from pyhanlp import *
from ch07.pku import PKU199801_TRAIN, POS_MODEL
# HanLP Java classes, each looked up through JClass by its fully qualified name.
POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer') # lexical analyzer
# Passed as the first argument when the lexical analyzer is constructed below.
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
def train_perceptron_pos(corpus):
    """Train a perceptron part-of-speech tagger on *corpus* and print two demos.

    Training is run through ``POSTrainer`` with ``POS_MODEL`` as the model
    path; a ``PerceptronPOSTagger`` is then constructed from that same path.

    Args:
        corpus: Training corpus handed to ``POSTrainer.train``.

    Returns:
        The ``PerceptronPOSTagger`` constructed from ``POS_MODEL``.
    """
    # Train, then load the tagger from the model path.
    POSTrainer().train(corpus, POS_MODEL)
    pos_tagger = PerceptronPOSTagger(POS_MODEL)

    # Demo 1: tag a pre-segmented sentence.
    words = ("他", "的", "希望", "是", "希望", "上学")
    labels = pos_tagger.tag(*words)
    print(', '.join(labels))

    # Demo 2: segmentation plus tagging through a lexical analyzer.
    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), pos_tagger)
    analyzed = lexical_analyzer.analyze("李狗蛋的希望是希望上学")
    print(analyzed)

    return pos_tagger
if __name__ == "__main__":
    # Script entry point: run the perceptron demo on the PKU training file.
    train_perceptron_pos(corpus=PKU199801_TRAIN)
基于条件随机场的词性标注
demo_crf_pos.py
from pyhanlp import *
from ch07.pku import PKU199801_TRAIN,POS_MODEL
# HanLP Java classes, each looked up through JClass by its fully qualified name.
CRFPOSTagger = JClass('com.hankcs.hanlp.model.crf.CRFPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer') # lexical analyzer
# Passed as the first argument when the lexical analyzer is constructed below.
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
def train_crf_pos(corpus):
    """Train a CRF part-of-speech tagger on *corpus* and print two demos.

    A blank tagger is created and trained with ``POS_MODEL`` as the model
    path, after which a new tagger is constructed from that same path.

    Args:
        corpus: Training corpus handed to ``tagger.train``.

    Returns:
        The ``CRFPOSTagger`` constructed from ``POS_MODEL``.
    """
    tagger = CRFPOSTagger(None)  # create a blank tagger
    tagger.train(corpus, POS_MODEL)  # train
    tagger = CRFPOSTagger(POS_MODEL)  # load
    # Tag the same six tokens as the HMM and perceptron demos. "希望" must
    # appear twice so the tokens match the sentence analysed below and the
    # three taggers can be compared on identical input.
    print(", ".join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict
    # Combine a segmenter with the tagger into a lexical analyzer.
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)
    print(analyzer.analyze("李狗蛋的希望是希望上学"))  # segment + tag
    return tagger
if __name__ == "__main__":
    # Script entry point: run the CRF demo on the PKU training file.
    train_crf_pos(corpus=PKU199801_TRAIN)