词性标注pos_tagging

6 篇文章 0 订阅
4 篇文章 0 订阅
import numpy as np

# 1. Build the word dictionary and the tag set from the training corpus.
#    Each line of traindata.txt is "word/tag" (e.g. "dog/NN").

tag2id, id2tag = {}, {}    # tag  <-> integer id
word2id, id2word = {}, {}  # word <-> integer id

for line in open("traindata.txt"):
    items = line.split("/")
    tag, word = items[1].rstrip(), items[0]

    if tag not in tag2id:
        # BUG FIX: the reverse map must use the id assigned to THIS tag.
        # The original did `id2tag[len(tag2id)] = tag` AFTER the dict grew,
        # keying id2tag 1..N while tag2id maps 0..N-1 — decoding then
        # raises KeyError on tag id 0. Same fix applied to id2word below.
        tag2id[tag] = len(tag2id)
        id2tag[tag2id[tag]] = tag
    if word not in word2id:
        word2id[word] = len(word2id)
        id2word[word2id[word]] = word

M = len(word2id)  # vocabulary size (number of distinct words)
N = len(tag2id)   # number of distinct tags
# print(M, N)

# 2. Estimate the HMM parameters A, B, pi by counting.

A = np.zeros((N, M))  # A[i][j]: P(word j | tag i)           (emission)
B = np.zeros((N, N))  # B[i][j]: P(tag j | previous tag i)   (transition)
pi = np.zeros(N)      # pi[i]:   P(a sentence starts with tag i)

pre_tag = " "  # sentinel: " " means the next token starts a sentence
for line in open("traindata.txt"):
    items = line.split("/")
    wordID, tagID = word2id[items[0]], tag2id[items[1].rstrip()]
    if pre_tag == " ":
        # Sentence-initial token: counts toward pi, no transition.
        pi[tagID] += 1
        A[tagID][wordID] += 1
    else:
        A[tagID][wordID] += 1
        B[tag2id[pre_tag]][tagID] += 1

    # A "." token ends the sentence; the next word starts a new one.
    if items[0] == ".":
        pre_tag = " "
    else:
        pre_tag = items[1].rstrip()

# Normalize counts into probabilities (each row sums to 1).
pi = pi / sum(pi)
for i in range(N):
    A[i] = A[i] / sum(A[i])
    B[i] = B[i] / sum(B[i])
# print(pi)
# print(pi)

def log(v, eps=0.0001):
    """Safe logarithm: return np.log(v), substituting eps when v == 0.

    Prevents -inf scores for zero-probability entries during Viterbi
    decoding (a crude smoothing floor).

    :param v: probability value, expected >= 0
    :param eps: floor substituted for an exact zero (default 0.0001,
                preserving the original hard-coded behavior)
    :return: natural log of v, or of eps when v is exactly 0
    """
    return np.log(eps) if v == 0 else np.log(v)

def viterbi(x, pi, A, B):
    """Viterbi decoding: print the most likely tag sequence for sentence x.

    :param x: input sentence, space-separated words, e.g.
              "I like playing basketball". Every word must appear in
              word2id — unknown words raise KeyError.
    :param pi: initial tag probabilities, shape (N,)
    :param A: emission matrix, A[i][j] = P(word j | tag i), shape (N, M)
    :param B: transition matrix, B[i][j] = P(tag j | previous tag i),
              shape (N, N)
    :return: None; the best tag sequence is printed one tag per line
    """
    x = [word2id[word] for word in x.split(" ")]

    T = len(x)
    dp = np.zeros((N, T))               # dp[i][j]: best log-score of any path ending in tag i at word j
    ptr = np.zeros((N, T), dtype=int)   # ptr[i][j]: predecessor tag achieving dp[i][j]

    # Base case: first word scored by initial distribution + emission.
    for i in range(N):
        dp[i][0] = log(pi[i]) + log(A[i][x[0]])

    # BUG FIX: words (j) must be the OUTER loop. The original iterated
    # tags outside and words inside, so dp[k][j-1] was read for tags
    # k > i before that column was computed (still zero).
    for j in range(1, T):
        for i in range(N):
            dp[i][j] = -999999
            for k in range(N):
                # BUG FIX: the transition from previous tag k to current
                # tag i is B[k][i], not B[i][k] — B's rows index the
                # *previous* tag (see how B is counted during training).
                score = dp[k][j - 1] + log(A[i][x[j]]) + log(B[k][i])
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k

    # Decoding: backtrack through ptr to recover the best sequence.
    best_seq = [0] * T
    # step 1: the best tag for the last word.
    best_seq[T - 1] = np.argmax(dp[:, T - 1])
    # step 2: walk backwards (j = T-2, T-3, ..., 1, 0) following pointers.
    for j in range(T - 2, -1, -1):
        best_seq[j] = ptr[best_seq[j + 1]][j + 1]

    # best_seq now holds the tag id for each word of x.
    for tag_id in best_seq:
        print(id2tag[tag_id])

# Demo: decode an example sentence (every word must occur in the training data).
sentence = "social security number , passport number and details about the services"
viterbi(sentence, pi, A, B)



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
import jieba import pynlpir import numpy as np import tensorflow as tf from sklearn.model_selection import train_test_split # 读取文本文件 with open('1.txt', 'r', encoding='utf-8') as f: text = f.read() # 对文本进行分词 word_list = list(jieba.cut(text, cut_all=False)) # 打开pynlpir分词器 pynlpir.open() # 对分词后的词语进行词性标注 pos_list = pynlpir.segment(text, pos_tagging=True) # 将词汇表映射成整数编号 vocab = set(word_list) vocab_size = len(vocab) word_to_int = {word: i for i, word in enumerate(vocab)} int_to_word = {i: word for i, word in enumerate(vocab)} # 将词语和词性标记映射成整数编号 pos_tags = set(pos for word, pos in pos_list) num_tags = len(pos_tags) tag_to_int = {tag: i for i, tag in enumerate(pos_tags)} int_to_tag = {i: tag for i, tag in enumerate(pos_tags)} # 将文本和标签转换成整数序列 X = np.array([word_to_int[word] for word in word_list]) y = np.array([tag_to_int[pos] for word, pos in pos_list]) # 将数据划分成训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 定义模型参数 embedding_size = 128 rnn_size = 256 batch_size = 128 epochs = 10 # 定义RNN模型 model = tf.keras.Sequential([ tf.keras.layers.Embedding(vocab_size, embedding_size), tf.keras.layers.SimpleRNN(rnn_size), tf.keras.layers.Dense(num_tags, activation='softmax') ]) # 编译模型 model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # 训练模型 model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test)) # 对测试集进行预测 y_pred = model.predict(X_test) y_pred = np.argmax(y_pred, axis=1) # 计算模型准确率 accuracy = np.mean(y_pred == y_test) print('Accuracy: {:.2f}%'.format(accuracy * 100)) # 将模型保存到文件中 model.save('model.h5')出现下述问题:ValueError: Found input variables with inconsistent numbers of samples:
06-07

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值