import numpy as np
# 1. Build the vocabulary and the tag inventory from the training data.
# Each line of traindata.txt is expected to look like "word/tag".
tag2id, id2tag = {}, {}    # tag string -> id, id -> tag string
word2id, id2word = {}, {}  # word string -> id, id -> word string
with open("traindata.txt") as train_file:
    for line in train_file:
        items = line.split("/")
        if len(items) < 2:
            # Blank or malformed line without a "/" separator: nothing to learn.
            continue
        tag, word = items[1].rstrip(), items[0]
        if tag not in tag2id:
            tag2id[tag] = len(tag2id)
            # Reuse the id that was just assigned so both tables agree;
            # taking len(tag2id) again here would be off by one.
            id2tag[tag2id[tag]] = tag
        if word not in word2id:
            word2id[word] = len(word2id)
            id2word[word2id[word]] = word
M = len(word2id)  # number of distinct words in the vocabulary
N = len(tag2id)   # number of distinct tags
# 2. Estimate the HMM parameters A, B and pi by counting over the training data.
# A[i][j]: probability of seeing word j given tag i (N tags x M words).
A = np.zeros((N, M))
# B[i][j]: probability of moving from tag i to tag j (N x N).
B = np.zeros((N, N))
# pi[i]: probability that a sentence starts with tag i.
pi = np.zeros(N)

pre_tag = " "  # sentinel: the next token is the first one of a sentence
with open("traindata.txt") as train_file:
    for line in train_file:
        items = line.split("/")
        if len(items) < 2:
            # Blank or malformed line without a "/" separator.
            continue
        wordID, tagID = word2id[items[0]], tag2id[items[1].rstrip()]
        A[tagID][wordID] += 1
        if pre_tag == " ":
            pi[tagID] += 1
        else:
            B[tag2id[pre_tag]][tagID] += 1
        # A "." token closes the sentence, so the following token starts a new one.
        pre_tag = " " if items[0] == "." else items[1].rstrip()

# Normalize counts into probabilities. A row whose total is zero (for example
# a tag that is never followed by another tag) is left as all zeros instead
# of being divided by zero, which would fill it with NaN.
pi_total = pi.sum()
if pi_total > 0:
    pi = pi / pi_total
for i in range(N):
    a_total = A[i].sum()
    if a_total > 0:
        A[i] = A[i] / a_total
    b_total = B[i].sum()
    if b_total > 0:
        B[i] = B[i] / b_total
def log(v, eps=0.0001):
    """Return the natural logarithm of ``v`` with exact zeros smoothed.

    Any element equal to 0 is replaced by ``eps`` before taking the log, so
    the result is a large negative number rather than ``-inf``.

    :param v: a scalar or array-like of values
    :param eps: value substituted for exact zeros (default 0.0001)
    :return: ``np.log`` of the smoothed input; a NumPy scalar for scalar
        input, an array of the same shape for array input
    """
    v = np.asarray(v)
    # np.where keeps this element-wise, so arrays work as well as scalars.
    return np.log(np.where(v == 0, eps, v))
def viterbi(x, pi, A, B):
    """Find and print the most likely tag sequence for a sentence.

    Uses the module-level ``word2id`` and ``id2tag`` tables and the ``log``
    helper.

    :param x: input sentence as whitespace-separated words,
        e.g. "I like playing basketball"
    :param pi: pi[i] is the probability that a sentence starts with tag i
    :param A: A[i][w] is the probability of word w given tag i
    :param B: B[i][j] is the probability of moving from tag i to tag j
    :return: list with one tag per word (each tag is also printed on its
        own line); an empty list for an empty sentence
    :raises KeyError: if a word is not present in ``word2id``
    """
    # split() instead of split(" ") so repeated, leading or trailing spaces
    # do not produce empty tokens.
    x = [word2id[word] for word in x.split()]
    T = len(x)
    if T == 0:
        return []
    n_tags = len(pi)

    # dp[i][j]: best log-score of any tag path for words 0..j ending in tag i.
    dp = np.full((n_tags, T), -np.inf)
    # ptr[i][j]: tag chosen for word j-1 on that best path.
    ptr = np.zeros((n_tags, T), dtype=int)

    for i in range(n_tags):
        dp[i][0] = log(pi[i]) + log(A[i][x[0]])

    # Time step must be the outer loop: column j-1 has to be complete for
    # every tag before any cell of column j is computed.
    for j in range(1, T):
        for i in range(n_tags):
            emission = log(A[i][x[j]])
            for k in range(n_tags):
                # B is indexed [previous tag][current tag].
                score = dp[k][j - 1] + emission + log(B[k][i])
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k

    # Decoding: pick the best final tag, then follow the back-pointers.
    best_seq = [0] * T
    best_seq[T - 1] = int(np.argmax(dp[:, T - 1]))
    for i in range(T - 2, -1, -1):
        best_seq[i] = int(ptr[best_seq[i + 1]][i + 1])

    tags = [id2tag[tag_id] for tag_id in best_seq]
    for tag in tags:
        print(tag)
    return tags
# Demo: tag a sample sentence with the trained model; viterbi prints one tag per word.
x = "social security number , passport number and details about the services"
viterbi(x,pi,A,B)
# Part-of-speech tagging (pos_tagging)
# Source page note: latest recommended article published 2023-07-25 01:01:22