词性标注pos_tagging

6 篇文章 0 订阅
4 篇文章 0 订阅
import numpy as np

# 1. Build the word dictionary and the tag set from the training corpus.
#    Each line of traindata.txt is "word/tag" (e.g. "dog/NN").

tag2id, id2tag = {}, {}    # tag  <-> integer id
word2id, id2word = {}, {}  # word <-> integer id

for line in open("traindata.txt"):
    items = line.split("/")
    tag, word = items[1].rstrip(), items[0]

    if tag not in tag2id:
        # BUG FIX: the reverse map must use the id assigned to THIS tag.
        # The original did `id2tag[len(tag2id)] = tag` AFTER the dict grew,
        # keying id2tag 1..N while tag2id maps 0..N-1 — decoding then
        # raises KeyError on tag id 0. Same fix applied to id2word below.
        tag2id[tag] = len(tag2id)
        id2tag[tag2id[tag]] = tag
    if word not in word2id:
        word2id[word] = len(word2id)
        id2word[word2id[word]] = word

M = len(word2id)  # vocabulary size (number of distinct words)
N = len(tag2id)   # number of distinct tags
# print(M, N)

# 2. Estimate the HMM parameters A, B, pi by counting.

A = np.zeros((N, M))  # A[i][j]: P(word j | tag i)           (emission)
B = np.zeros((N, N))  # B[i][j]: P(tag j | previous tag i)   (transition)
pi = np.zeros(N)      # pi[i]:   P(a sentence starts with tag i)

pre_tag = " "  # sentinel: " " means the next token starts a sentence
for line in open("traindata.txt"):
    items = line.split("/")
    wordID, tagID = word2id[items[0]], tag2id[items[1].rstrip()]
    if pre_tag == " ":
        # Sentence-initial token: counts toward pi, no transition.
        pi[tagID] += 1
        A[tagID][wordID] += 1
    else:
        A[tagID][wordID] += 1
        B[tag2id[pre_tag]][tagID] += 1

    # A "." token ends the sentence; the next word starts a new one.
    if items[0] == ".":
        pre_tag = " "
    else:
        pre_tag = items[1].rstrip()

# Normalize counts into probabilities (each row sums to 1).
pi = pi / sum(pi)
for i in range(N):
    A[i] = A[i] / sum(A[i])
    B[i] = B[i] / sum(B[i])
# print(pi)
# print(pi)

def log(v, eps=0.0001):
    """Safe logarithm: return np.log(v), substituting eps when v == 0.

    Prevents -inf scores for zero-probability entries during Viterbi
    decoding (a crude smoothing floor).

    :param v: probability value, expected >= 0
    :param eps: floor substituted for an exact zero (default 0.0001,
                preserving the original hard-coded behavior)
    :return: natural log of v, or of eps when v is exactly 0
    """
    return np.log(eps) if v == 0 else np.log(v)

def viterbi(x, pi, A, B):
    """Viterbi decoding: print the most likely tag sequence for sentence x.

    :param x: input sentence, space-separated words, e.g.
              "I like playing basketball". Every word must appear in
              word2id — unknown words raise KeyError.
    :param pi: initial tag probabilities, shape (N,)
    :param A: emission matrix, A[i][j] = P(word j | tag i), shape (N, M)
    :param B: transition matrix, B[i][j] = P(tag j | previous tag i),
              shape (N, N)
    :return: None; the best tag sequence is printed one tag per line
    """
    x = [word2id[word] for word in x.split(" ")]

    T = len(x)
    dp = np.zeros((N, T))               # dp[i][j]: best log-score of any path ending in tag i at word j
    ptr = np.zeros((N, T), dtype=int)   # ptr[i][j]: predecessor tag achieving dp[i][j]

    # Base case: first word scored by initial distribution + emission.
    for i in range(N):
        dp[i][0] = log(pi[i]) + log(A[i][x[0]])

    # BUG FIX: words (j) must be the OUTER loop. The original iterated
    # tags outside and words inside, so dp[k][j-1] was read for tags
    # k > i before that column was computed (still zero).
    for j in range(1, T):
        for i in range(N):
            dp[i][j] = -999999
            for k in range(N):
                # BUG FIX: the transition from previous tag k to current
                # tag i is B[k][i], not B[i][k] — B's rows index the
                # *previous* tag (see how B is counted during training).
                score = dp[k][j - 1] + log(A[i][x[j]]) + log(B[k][i])
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k

    # Decoding: backtrack through ptr to recover the best sequence.
    best_seq = [0] * T
    # step 1: the best tag for the last word.
    best_seq[T - 1] = np.argmax(dp[:, T - 1])
    # step 2: walk backwards (j = T-2, T-3, ..., 1, 0) following pointers.
    for j in range(T - 2, -1, -1):
        best_seq[j] = ptr[best_seq[j + 1]][j + 1]

    # best_seq now holds the tag id for each word of x.
    for tag_id in best_seq:
        print(id2tag[tag_id])

# Demo: decode an example sentence (every word must occur in the training data).
sentence = "social security number , passport number and details about the services"
viterbi(sentence, pi, A, B)



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
import jieba import pynlpir import numpy as np import tensorflow as tf from sklearn.model_selection import train_test_split # 读取文本文件 with open('1.txt', 'r', encoding='utf-8') as f: text = f.read() # 对文本进行分词 word_list = list(jieba.cut(text, cut_all=False)) # 打开pynlpir分词器 pynlpir.open() # 对分词后的词语进行词性标注 pos_list = pynlpir.segment(text, pos_tagging=True) # 将词汇表映射成整数编号 vocab = set(word_list) vocab_size = len(vocab) word_to_int = {word: i for i, word in enumerate(vocab)} int_to_word = {i: word for i, word in enumerate(vocab)} # 将词语和词性标记映射成整数编号 pos_tags = set(pos for word, pos in pos_list) num_tags = len(pos_tags) tag_to_int = {tag: i for i, tag in enumerate(pos_tags)} int_to_tag = {i: tag for i, tag in enumerate(pos_tags)} # 将文本和标签转换成整数序列 X = np.array([word_to_int[word] for word in word_list]) y = np.array([tag_to_int[pos] for word, pos in pos_list]) # 将数据划分成训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 定义模型参数 embedding_size = 128 rnn_size = 256 batch_size = 128 epochs = 10 # 定义RNN模型 model = tf.keras.Sequential([ tf.keras.layers.Embedding(vocab_size, embedding_size), tf.keras.layers.SimpleRNN(rnn_size), tf.keras.layers.Dense(num_tags, activation='softmax') ]) # 编译模型 model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # 训练模型 model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test)) # 对测试集进行预测 y_pred = model.predict(X_test) y_pred = np.argmax(y_pred, axis=1) # 计算模型准确率 accuracy = np.mean(y_pred == y_test) print('Accuracy: {:.2f}%'.format(accuracy * 100)) # 将模型保存到文件中 model.save('model.h5')出现下述问题:ValueError: Found input variables with inconsistent numbers of samples:
06-07

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值