词性标注-利用HMM_完整项目_CodingPark编程公园

文章介绍

本文主要讲述如何使用隐马尔可夫模型(HMM)进行词性标注。

完整项目

这里我们使用NLTK自带的Brown语料库进行学习。


import nltk
import sys
from nltk.corpus import brown

print('--------------------------词性标注--------------------------')

# Flatten the tagged Brown corpus into a single list of (tag, word) pairs.
# Each sentence is framed by an artificial ("START", "START") pair and an
# ("END", "END") pair so that sentence boundaries show up in the data.
brown_tags_words = []
for sent in brown.tagged_sents():
    # The corpus yields (word, tag); pairs are stored as (tag, word) so the
    # tag acts as the condition of the distributions built below.
    # For simplicity every tag is shortened to its first two characters.
    brown_tags_words += (
        [("START", "START")]
        + [(tag[:2], word) for (word, tag) in sent]
        + [("END", "END")]
    )

# Conditional frequency distribution: per tag, the count of each word.
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# Conditional probability distribution P(word | tag), estimated by
# maximum likelihood from the counts above.
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

print(
    "The probability of an adjective (JJ) being 'new' is",
    cpd_tagwords["JJ"].prob("new"),
)
print(
    "The probability of a verb (VB) being 'duck' is",
    cpd_tagwords["VB"].prob("duck"),
)

# The tag of every (tag, word) pair, in corpus order (START/END included).
brown_tags = [tag for (tag, word) in brown_tags_words]

# count(t{i-1}, ti): adjacent tags are grouped into bigrams, and the first
# tag of each pair is the condition.
cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
# P(ti | t{i-1}), estimated by maximum likelihood from the bigram counts.
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

print()
print("If we have just seen 'DT', the probability of 'NN' is", cpd_tags["DT"].prob("NN"))
# NOTE: the label must name the tag that is actually queried ('DT').
print("If we have just seen 'VB', the probability of 'DT' is", cpd_tags["VB"].prob("DT"))
print("If we have just seen 'VB', the probability of 'NN' is", cpd_tags["VB"].prob("NN"))

# Problem 1
# Take the sentence "I want to race" and the tag sequence "PP VB TO VB".
# How well do the two match?
#
# P(START) * P(PP|START) * P(I | PP) *
#             P(VB | PP) * P(want | VB) *
#             P(TO | VB) * P(to | TO) *
#             P(VB | TO) * P(race | VB) *
#             P(END | VB)
print('-------------------问题1-------------------')

tagged_sentence = [("PP", "I"), ("VB", "want"), ("TO", "to"), ("VB", "race")]

# Collect the factors in formula order: for each word a transition
# probability followed by an emission probability, then the final
# transition into END.
factors = []
previous_tag = "START"
for tag, word in tagged_sentence:
    factors.append(cpd_tags[previous_tag].prob(tag))
    factors.append(cpd_tagwords[tag].prob(word))
    previous_tag = tag
factors.append(cpd_tags[previous_tag].prob("END"))

# Multiply strictly left to right, starting from the first factor.
prob_tagsequence = factors[0]
for factor in factors[1:]:
    prob_tagsequence = prob_tagsequence * factor

print("The probability of the tag sequence 'START PP VB TO VB END' for 'I want to race' is:", prob_tagsequence)

# Problem 2
# Implementing Viterbi:
# given a sentence, how do we find the tag sequence that fits it best?

print('-------------------问题2-------------------')

# The complete tag inventory: every distinct tag seen in the corpus.
distinct_tags = set(brown_tags)
sentence = ["I", "love", "you"]
sentlen = len(sentence)

# One dict per word: tag -> probability of the best tag sequence that
# ends in that tag at that word.
viterbi = []
# One dict per word: tag -> the previous tag on that best sequence.
backpointer = []

# Initialisation: the first word can only be reached from START, so each
# entry is P(tag | START) * P(first word | tag).
first_word = sentence[0]
first_viterbi = {
    tag: cpd_tags["START"].prob(tag) * cpd_tagwords[tag].prob(first_word)
    for tag in distinct_tags
    if tag != 'START'
}
first_backpointer = dict.fromkeys(first_viterbi, "START")

print('first_viterbi -> ', first_viterbi)
print('first_backpointer -> ', first_backpointer)

viterbi.append(first_viterbi)
backpointer.append(first_backpointer)

# Peek at the best tag so far.
currbest = max(first_viterbi, key=first_viterbi.get)
print(
    "Word",
    "'" + first_word + "'",
    "current best two-tag sequence:",
    first_backpointer[currbest],
    currbest,
)

# Recursion: extend the best paths one word at a time.
for wordindex in range(1, sentlen):
    current_word = sentence[wordindex]
    prev_viterbi = viterbi[-1]
    this_viterbi = {}
    this_backpointer = {}

    for tag in distinct_tags:
        if tag == "START":
            continue

        # P(current word | tag) does not depend on the previous tag.
        emission = cpd_tagwords[tag].prob(current_word)

        # Score of reaching `tag` through each possible previous tag.
        candidate_scores = {
            prevtag: prev_viterbi[prevtag] * cpd_tags[prevtag].prob(tag) * emission
            for prevtag in prev_viterbi
        }

        best_previous = max(candidate_scores, key=candidate_scores.get)
        this_viterbi[tag] = candidate_scores[best_previous]
        this_backpointer[tag] = best_previous

    # After each word, report the best tag found so far together with the
    # tag it was reached from.
    currbest = max(this_viterbi, key=this_viterbi.get)
    print(
        "Word",
        "'" + current_word + "'",
        "current best two-tag sequence:",
        this_backpointer[currbest],
        currbest,
    )

    viterbi.append(this_viterbi)
    backpointer.append(this_backpointer)

# Termination: score every path by how well its last tag continues into END.
prev_viterbi = viterbi[-1]
end_scores = {
    prevtag: prev_viterbi[prevtag] * cpd_tags[prevtag].prob("END")
    for prevtag in prev_viterbi
}
best_previous = max(end_scores, key=end_scores.get)
prob_tagsequence = end_scores[best_previous]

# The path is collected back to front (END first), so both the backpointer
# list and, at the end, the collected path are reversed.
best_tagsequence = ["END", best_previous]
backpointer.reverse()

current_best_tag = best_previous
for bp in backpointer:
    current_best_tag = bp[current_best_tag]
    best_tagsequence.append(current_best_tag)

best_tagsequence.reverse()

# Observed layer: the words.
print('-----显示层-----')
print("The sentence was:", *sentence, end=" \n")

# Hidden layer: the decoded tags.
print('-----隐含层-----')
print("The best tag sequence is:", *best_tagsequence, end=" \n")

# Probability of the decoded path.
print('-----准确率-----')
print("The probability of the best tag sequence is:", prob_tagsequence)





结果展示
在这里插入图片描述

在这里插入图片描述

评论将由博主筛选后显示,对所有人可见 | 还能输入1000个字符
©️2020 CSDN 皮肤主题: 鲸 设计师: meimeiellie 返回首页
实付0元
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值