1.数据准备
from utils_pos import get_word_tag, preprocess
import pandas as pd
from collections import defaultdict
import math
import numpy as np
with open("WSJ_02-21.pos", 'r') as f:
training_corpus = f.readlines()
with open("hmm_vocab.txt", 'r') as f:
voc_l = f.read().split('\n')
#生成index字典
vocab = {}
# Get the index of the corresponding words.
for i, word in enumerate(sorted(voc_l)):
vocab[word] = i
print("Vocabulary dictionary, key is the word, value is a unique integer")
cnt = 0
for k,v in vocab.items():
print(f"{k}:{v}")
cnt += 1
if cnt > 20:
break
with open("WSJ_24.pos", 'r') as f:
y = f.readlines()
#测试集内容格式:'economy\tNN\n', "'s\tPOS\n", 'temperature\tNN\n'
#corpus without tags, preprocessed
_, prep = preprocess(vocab, "test.words")
2.HMM模型的训练
计算转移矩阵和观测矩阵
def create_dictionaries(training_corpus, vocab):
"""
Input:
training_corpus: a corpus where each line has a word followed by its tag.
vocab: a dictionary where keys are words in vocabulary and value is an index
Output:
emission_counts: a dictionary where the keys are (tag, word) and the values are the counts
transition_counts: a dictionary where the keys are (prev_tag, tag) and the values are the counts
tag_counts: a dictionary where the keys are the tags and the values are the counts
"""
# initialize the dictionaries using defaultdict
emission_counts = defaultdict(int)
transition_counts = defaultdict(int)
tag_counts = defaultdict(int)
# Initialize "prev_tag" (previous tag) with the start state, denoted by '--s--'
prev_tag = '--s--'
# use 'i' to track the line number in the corpus
i = 0
# Each item in the training corpus contains a word and its POS tag
# Go through each word and its tag in the training corpus
for word_tag in training_corpus:
# Increment the word_tag count
i += 1
# Every 50,000 words, print the word count
if i % 50000 == 0:
print(f"word count = {i}")
### START CODE HERE (Replace instances of 'None' with your code) ###
# get the word and tag using the get_word_tag helper function (imported from utils_pos.py)
word, tag = get_word_tag(word_tag,vocab)
# Increment the transition count for the previous word and tag
transition_counts[(prev_tag, tag)] += 1
# Increment the emission count for the tag and word
emission_counts[(tag, word)] += 1
# Increment the tag count
tag_counts[tag] += 1
# Set the previous tag to this tag (for the next iteration of the loop)
prev_tag = tag
### END CODE HERE ###
return emission_counts, transition_counts, tag_counts
def create_transition_matrix(alpha, tag_counts, transition_counts):
'''
Input:
alpha: number used for smoothing
tag_coun