基于隐马尔科夫模型的词性标注

import random


def load_data(filepath):
    """Read the corpus file and split its lines 80/20 into train/test sets.

    The lines are shuffled before splitting, so the partition differs
    between runs. On any read error an empty pair ([], []) is returned
    after printing a diagnostic.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as fh:
            raw = fh.read()
    except Exception as exc:
        print(f"Error in reading file: {exc}")
        return [], []

    lines = raw.strip().split("\n")
    random.shuffle(lines)

    # 80% of the (shuffled) lines train the model, the rest evaluate it.
    cut = int(len(lines) * 0.8)
    return lines[:cut], lines[cut:]


def process_line(line):
    """Split one corpus line into a (word, pos_tag) tuple.

    Expected format is "word tag". Splitting uses str.split() with no
    argument so runs of spaces/tabs do not produce empty fields (the
    original split(" ") raised ValueError on double-spaced lines).
    A line with no tag yields the placeholder tag "UNKNOWN"; fields
    beyond the first two are ignored instead of crashing.
    """
    fields = line.strip().split()
    if len(fields) >= 2:
        # Keep only word and tag; silently drop malformed extra columns.
        word, pos_tag = fields[0], fields[1]
    elif len(fields) == 1:
        word, pos_tag = fields[0], "UNKNOWN"
    else:
        # Blank line: return an empty word rather than raising.
        word, pos_tag = "", "UNKNOWN"
    return word, pos_tag


def count_words_and_pos(train_data):
    """Collect the vocabulary, tag inventory, and a word-to-tags map.

    Returns (words, pos_tags, word2pos):
      words    -- set of every word seen in the training lines
      pos_tags -- list of tags in first-seen order
      word2pos -- dict mapping each word to the set of tags it took
    """
    words = set()
    pos_tags = []
    seen_tags = set()  # O(1) membership; pos_tags keeps first-seen order
    word2pos = {}
    for entry in train_data:
        word, tag = process_line(entry)
        words.add(word)
        if tag not in seen_tags:
            seen_tags.add(tag)
            pos_tags.append(tag)
        word2pos.setdefault(word, set()).add(tag)
    return words, pos_tags, word2pos


def build_transition_matrix(pos_tags, train_data):
    """Estimate tag-to-tag transition probabilities from the corpus.

    The whole training set is treated as one continuous tag sequence
    (consecutive lines are consecutive tokens). Every count starts at
    1e-10 as additive smoothing, then each row is normalised so it sums
    to 1. Returns (A, pos_tag2id): the row-stochastic matrix and the
    tag-to-index mapping.
    """
    n = len(pos_tags)
    pos_tag2id = {tag: idx for idx, tag in enumerate(pos_tags)}
    A = [[1e-10 for _ in range(n)] for _ in range(n)]

    prev = None
    for entry in train_data:
        _, tag = process_line(entry)
        if prev is not None:
            A[pos_tag2id[prev]][pos_tag2id[tag]] += 1
        prev = tag

    # Normalise each row into a probability distribution.
    for row in A:
        row_total = sum(row)
        for j in range(n):
            row[j] /= row_total

    return A, pos_tag2id


def build_observation_matrix(words, pos_tags, train_data, pos_tag2id):
    """Estimate emission probabilities P(word | tag) from the corpus.

    Counts start at 1e-10 for smoothing and each tag row is normalised
    to sum to 1. Returns (B, word2id): the emission matrix and the
    word-to-index mapping.
    """
    word2id = {w: idx for idx, w in enumerate(words)}
    vocab_size = len(words)
    B = [[1e-10 for _ in range(vocab_size)] for _ in pos_tags]

    for entry in train_data:
        word, tag = process_line(entry)
        B[pos_tag2id[tag]][word2id[word]] += 1

    # Normalise each tag's row into a probability distribution.
    for row in B:
        row_total = sum(row)
        for col in range(vocab_size):
            row[col] /= row_total

    return B, word2id

def forward(obs_seq, A, B, pos_tag_count):
    """Compute forward probabilities alpha[t][i] for the observation sequence.

    NOTE(review): there is no separate initial-state vector pi — the
    first row of A is used as the initial distribution; confirm this
    matches the intended model.
    """
    steps = len(obs_seq)
    alpha = [[0] * pos_tag_count for _ in range(steps)]

    # Initialisation: A's first row stands in for pi.
    first_obs = obs_seq[0]
    for state in range(pos_tag_count):
        alpha[0][state] = A[0][state] * B[state][first_obs]

    # Induction over the remaining time steps.
    for t in range(1, steps):
        prev_row = alpha[t - 1]
        obs = obs_seq[t]
        for state in range(pos_tag_count):
            alpha[t][state] = sum(
                prev_row[j] * A[j][state] * B[state][obs]
                for j in range(pos_tag_count)
            )
    return alpha

def backward(obs_seq, A, B, pos_tag_count):
    """Compute backward probabilities beta[t][i] for the observation sequence."""
    steps = len(obs_seq)
    beta = [[0] * pos_tag_count for _ in range(steps)]

    # Base case: probability 1 at the final time step for every state.
    for state in range(pos_tag_count):
        beta[steps - 1][state] = 1

    # Induction, walking backwards from the second-to-last step.
    for t in range(steps - 2, -1, -1):
        next_obs = obs_seq[t + 1]
        for state in range(pos_tag_count):
            beta[t][state] = sum(
                beta[t + 1][j] * A[state][j] * B[j][next_obs]
                for j in range(pos_tag_count)
            )
    return beta

def forward_backward(obs_seq, A, B, pos_tag_count):
    """Posterior state probabilities gamma[t][i] via the forward-backward algorithm.

    gamma[t][i] is proportional to alpha[t][i] * beta[t][i], normalised
    per time step so each row sums to 1.
    """
    alpha = forward(obs_seq, A, B, pos_tag_count)
    beta = backward(obs_seq, A, B, pos_tag_count)

    gamma = []
    for t in range(len(obs_seq)):
        row = [alpha[t][i] * beta[t][i] for i in range(pos_tag_count)]
        norm = sum(row)
        gamma.append([value / norm for value in row])
    return gamma




def pos_tagging(A, B, pos_tags, word2id):
    """Interactive tagging loop: read a sentence, print word/tag pairs.

    Each character of the input is treated as one observation symbol.
    Typing "c" exits; sentences containing any out-of-vocabulary
    character are rejected with a message.
    """
    while True:
        sentence = input("请输入一句话(按 c 退出):")
        if sentence == "c":
            break
        obs_seq = [word2id.get(ch, -1) for ch in sentence]
        if -1 in obs_seq:
            print("存在未知单词!")
            continue
        gamma = forward_backward(obs_seq, A, B, len(pos_tags))
        # Pick the highest-posterior tag independently at each position.
        best_tags = [
            pos_tags[max(range(len(pos_tags)), key=lambda s: step_probs[s])]
            for step_probs in gamma
        ]
        print(" ".join([f"{word}/{pos_tag}" for word, pos_tag in zip(sentence, best_tags)]))



def main():
    """Train the HMM on 80% of the corpus, report test accuracy, then
    start the interactive tagger.

    Evaluation tags each test line in isolation as a length-1
    observation sequence; out-of-vocabulary words are skipped.
    """
    filepath = "msr_training.txt"
    train_data, test_data = load_data(filepath)
    words, pos_tags, word2pos = count_words_and_pos(train_data)
    A, pos_tag2id = build_transition_matrix(pos_tags, train_data)
    B, word2id = build_observation_matrix(words, pos_tags, train_data, pos_tag2id)

    correct_count = 0
    total_count = 0
    for line in test_data:
        word, pos_tag = process_line(line)
        obs_id = word2id.get(word, -1)
        if obs_id == -1:
            # Word never seen in training: no emission column, skip it.
            continue
        gamma = forward_backward([obs_id], A, B, len(pos_tags))
        # Single-step sequence, so only gamma[0] matters; take the argmax tag.
        predicted = pos_tags[max(range(len(pos_tags)), key=lambda i: gamma[0][i])]
        total_count += 1
        if predicted == pos_tag:
            correct_count += 1

    # Guard against ZeroDivisionError when every test word was unseen
    # (or the corpus failed to load and both splits are empty).
    accuracy = correct_count / total_count if total_count else 0.0
    print('准确率为:{:.2%}'.format(accuracy))

    pos_tagging(A, B, pos_tags, word2id)

# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

程序的主要步骤如下:

  1. 加载数据,划分训练集和测试集:load_data 函数从文件中读取语料数据,然后将其随机打乱,按照 80% 和 20% 的比例划分为训练集和测试集。
  2. 处理每一行数据:process_line 函数接受一个字符串作为输入,返回包含词和对应词性的元组。
  3. 统计词和词性:count_words_and_pos 函数统计训练集中出现的所有词和词性。
  4. 建立状态转移矩阵:build_transition_matrix 函数根据训练数据计算词性之间的转移概率。
  5. 建立观测矩阵:build_observation_matrix 函数根据训练数据计算给定词性的情况下,观测到某个词的概率。
  6. 实现前向算法:forward 函数用于计算前向概率。
  7. 实现后向算法:backward 函数用于计算后向概率。
  8. 实现前向-后向算法:forward_backward 函数结合前向概率和后向概率,计算每个时间步上的词性概率。
  9. 词性标注:pos_tagging 函数实现了一个简单的交互式词性标注功能,用户可以输入一句话,程序会输出每个词的词性。
  10. 主函数:main 函数负责调用上述所有函数,对测试集进行评估,并提供词性标注功能。

程序的大致思路如下:

  1. 加载和处理数据:从文件中读取数据,并将每一行分割成词和对应的词性。这些数据被随机打乱,然后分割成训练集和测试集。

  2. 计数和建立矩阵:对训练集中出现的词和词性进行统计,并建立状态转移矩阵和观测矩阵。状态转移矩阵表示从一个词性转移到另一个词性的概率,观测矩阵表示给定词性下生成某个词的概率。

  3. 实现前向和后向算法:前向算法用于计算在给定观测序列的前提下,模型到达某个特定状态的所有路径的概率之和。后向算法则用于计算给定模型和某个特定状态,从这个状态到观测序列结束的所有路径的概率之和。结合前向和后向算法,可以计算出每个时间步上的词性概率。

  4. 词性标注:在输入的句子中,对每个词进行词性标注。这是通过查找最大的词性概率来实现的。

  5. 评估:在测试集上对模型进行评估,计算准确率。准确率是指正确标注的词性数量占总词数的比例。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值