# Complete project: HMM part-of-speech tagging with a Viterbi decoder (NLTK, Brown corpus)


import nltk
import sys
from nltk.corpus import brown

print('--------------------------词性标注--------------------------')

# Build a flat list of (tag, word) pairs over the whole Brown corpus,
# with explicit sentence-boundary markers so the HMM can learn
# start/end transition probabilities.
brown_tags_words = []

for sent in brown.tagged_sents():
    # Prepend a sentence-start marker.
    brown_tags_words.append(("START", "START"))
    # Truncate each tag to its first two characters to shrink the tag set.
    brown_tags_words.extend([(tag[:2], word) for (word, tag) in sent])
    # Append a sentence-end marker.
    brown_tags_words.append(("END", "END"))

# Conditional frequency distribution: count(tag, word).
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# Conditional probability distribution: emission P(word | tag),
# maximum-likelihood estimate.
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

print("The probability of an adjective (JJ) being 'new' is", cpd_tagwords["JJ"].prob("new"))
print("The probability of a verb (VB) being 'duck' is", cpd_tagwords["VB"].prob("duck"))

# The tag sequence alone, used next to estimate transition probabilities.
brown_tags = [tag for (tag, word) in brown_tags_words]

# count(t{i-1} ti)
# A bigram pairs each tag with its predecessor.
cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
# Transition probability P(ti | t{i-1}), maximum-likelihood estimate.
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

print()
print("If we have just seen 'DT', the probability of 'NN' is", cpd_tags["DT"].prob("NN"))
# BUG FIX: the message asks for P(JJ | VB) but the original code computed
# cpd_tags["VB"].prob("DT"); corrected to prob("JJ") to match the message.
print("If we have just seen 'VB', the probability of 'JJ' is", cpd_tags["VB"].prob("JJ"))
print("If we have just seen 'VB', the probability of 'NN' is", cpd_tags["VB"].prob("NN"))

'''

P(START) * P(PP|START) * P(I | PP) *
P(VB | PP) * P(want | VB) *
P(TO | VB) * P(to | TO) *
P(VB | TO) * P(race | VB) *
P(END | VB)

'''
print('-------------------问题1-------------------')

# Joint probability of the tag sequence START PP VB TO VB END together
# with the sentence "I want to race": product of transition and emission
# probabilities along the path.
prob_tagsequence = cpd_tags["START"].prob("PP") * cpd_tagwords["PP"].prob("I") * \
    cpd_tags["PP"].prob("VB") * cpd_tagwords["VB"].prob("want") * \
    cpd_tags["VB"].prob("TO") * cpd_tagwords["TO"].prob("to") * \
    cpd_tags["TO"].prob("VB") * cpd_tagwords["VB"].prob("race") * \
    cpd_tags["VB"].prob("END")

print("The probability of the tag sequence 'START PP VB TO VB END' for 'I want to race' is:", prob_tagsequence)

'''

Viterbi 的实现

'''

print('-------------------问题2-------------------')

# The set of distinct tags (the full tag inventory).
distinct_tags = set(brown_tags)
sentence = ["I", "love", "you"]
sentlen = len(sentence)

# viterbi[i][tag] = probability of the best tag path for words 0..i
# that ends in `tag`.
viterbi = []
# backpointer[i][tag] = the previous tag on that best path.
backpointer = []

# Initialization: for the first word the only predecessor is START,
# so the path probability is P(tag | START) * P(word0 | tag).
first_viterbi = {}
first_backpointer = {}
for tag in distinct_tags:
    # START can never be an interior tag; skip it.
    if tag == 'START': continue
    first_viterbi[tag] = cpd_tags["START"].prob(tag) * cpd_tagwords[tag].prob(sentence[0])
    first_backpointer[tag] = "START"

print('first_viterbi -> ', first_viterbi)
print('first_backpointer -> ', first_backpointer)

viterbi.append(first_viterbi)
backpointer.append(first_backpointer)

# Peek at the currently best tag for the first word.
currbest = max(first_viterbi.keys(), key=lambda tag: first_viterbi[tag])
print("Word", "'" + sentence[0] + "'", "current best two-tag sequence:", first_backpointer[currbest], currbest)

# Forward pass: for each remaining word, extend the best paths by one step.
for wordindex in range(1, sentlen):
    this_viterbi = {}
    this_backpointer = {}
    prev_viterbi = viterbi[-1]

    for tag in distinct_tags:
        # START can never be an interior tag; skip it.
        if tag == "START": continue

        # Choose the predecessor tag that maximizes
        # P(best path to prevtag) * P(tag | prevtag) * P(word | tag).
        best_previous = max(prev_viterbi.keys(),
                            key=lambda prevtag:
                            prev_viterbi[prevtag] * cpd_tags[prevtag].prob(tag) * cpd_tagwords[tag].prob(
                                sentence[wordindex]))

        this_viterbi[tag] = prev_viterbi[best_previous] * \
            cpd_tags[best_previous].prob(tag) * cpd_tagwords[tag].prob(sentence[wordindex])
        this_backpointer[tag] = best_previous

    # After each column is filled, report the current best two-tag ending.
    currbest = max(this_viterbi.keys(), key=lambda tag: this_viterbi[tag])
    print("Word", "'" + sentence[wordindex] + "'", "current best two-tag sequence:", this_backpointer[currbest],
          currbest)

    viterbi.append(this_viterbi)
    backpointer.append(this_backpointer)

# Termination: among all tag sequences, pick the one whose transition
# into END has the highest probability.
prev_viterbi = viterbi[-1]
best_previous = max(prev_viterbi.keys(),
                    key=lambda prevtag: prev_viterbi[prevtag] * cpd_tags[prevtag].prob("END"))

prob_tagsequence = prev_viterbi[best_previous] * cpd_tags[best_previous].prob("END")

# The sequence is recovered back-to-front, so build it reversed...
best_tagsequence = ["END", best_previous]
# ...and walk the backpointers from the last word toward the first.
backpointer.reverse()

current_best_tag = best_previous
for bp in backpointer:
    best_tagsequence.append(bp[current_best_tag])
    current_best_tag = bp[current_best_tag]

# Flip into reading order: START ... END.
best_tagsequence.reverse()

print('-----显示层-----')
print("The sentence was:", end=" ")
for w in sentence: print(w, end=" ")

print()
print('-----隐含层-----')
print("The best tag sequence is:", end=" ")
for t in best_tagsequence: print(t, end=" ")

print()
print('-----准确率-----')
print("The probability of the best tag sequence is:", prob_tagsequence)



04-11 2190

08-10 1万+

06-22 1万+

01-30 6711

04-07 1271

11-24 1万+

05-21 999

03-04 1323

07-06 133

04-16 1278

#### 【NLP】HMM用于词性标注以及hmmlearn框架使用

©️2020 CSDN 皮肤主题: 鲸 设计师: meimeiellie

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。