import random


def load_data(filepath):
    # Read in the corpus
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = f.read()
    except Exception as e:
        print(f"Error in reading file: {e}")
        return [], []
    # Shuffle the lines and split them into training and test sets (80/20)
    data_list = data.strip().split("\n")
    random.shuffle(data_list)
    train_size = int(len(data_list) * 0.8)
    train_data = data_list[:train_size]
    test_data = data_list[train_size:]
    return train_data, test_data


def process_line(line):
    # Split a line into a token and its POS tag; fall back to "UNKNOWN" when no tag is present
    fields = line.strip().split(" ")
    if len(fields) == 2:
        word, pos_tag = fields
    else:
        word = fields[0]
        pos_tag = "UNKNOWN"
    return word, pos_tag


def count_words_and_pos(train_data):
    # Collect every word and POS tag that appears in the training set
    words = set()
    pos_tags = []
    word2pos = {}
    for line in train_data:
        word, pos_tag = process_line(line)
        words.add(word)
        if pos_tag not in pos_tags:
            pos_tags.append(pos_tag)
        if word not in word2pos:
            word2pos[word] = set()
        word2pos[word].add(pos_tag)
    return words, pos_tags, word2pos


def build_transition_matrix(pos_tags, train_data):
    # Build the state-transition matrix, starting from a small constant for additive smoothing
    pos_tag2id = {pos_tag: i for i, pos_tag in enumerate(pos_tags)}
    A = [[1e-10] * len(pos_tags) for _ in range(len(pos_tags))]
    prev_pos_tag = None
    for line in train_data:
        word, pos_tag = process_line(line)
        if prev_pos_tag is not None:
            A[pos_tag2id[prev_pos_tag]][pos_tag2id[pos_tag]] += 1
        prev_pos_tag = pos_tag
    # Normalize each row so it sums to 1
    for i in range(len(pos_tags)):
        total_count = sum(A[i])
        for j in range(len(pos_tags)):
            A[i][j] /= total_count
    return A, pos_tag2id


def build_observation_matrix(words, pos_tags, train_data, pos_tag2id):
    # Build the observation (emission) matrix, again with additive smoothing
    word2id = {word: i for i, word in enumerate(words)}
    B = [[1e-10] * len(words) for _ in range(len(pos_tags))]
    for line in train_data:
        word, pos_tag = process_line(line)
        B[pos_tag2id[pos_tag]][word2id[word]] += 1
    # Normalize each row so it sums to 1
    for i in range(len(pos_tags)):
        total_count = sum(B[i])
        for j in range(len(words)):
            B[i][j] /= total_count
    return B, word2id


def forward(obs_seq, A, B, pos_tag_count):
    # Forward pass: alpha[t][i] is the joint probability of the first t+1 observations
    # and being in state i at time t. The first row of A is used in place of a
    # separate initial state distribution.
    T = len(obs_seq)
    alpha = [[0] * pos_tag_count for _ in range(T)]
    for i in range(pos_tag_count):
        alpha[0][i] = A[0][i] * B[i][obs_seq[0]]
    for t in range(1, T):
        for i in range(pos_tag_count):
            alpha[t][i] = sum(alpha[t - 1][j] * A[j][i] * B[i][obs_seq[t]] for j in range(pos_tag_count))
    return alpha


def backward(obs_seq, A, B, pos_tag_count):
    # Backward pass: beta[t][i] is the probability of the observations after time t,
    # given state i at time t
    T = len(obs_seq)
    beta = [[0] * pos_tag_count for _ in range(T)]
    for i in range(pos_tag_count):
        beta[T - 1][i] = 1
    for t in range(T - 2, -1, -1):
        for i in range(pos_tag_count):
            beta[t][i] = sum(beta[t + 1][j] * A[i][j] * B[j][obs_seq[t + 1]] for j in range(pos_tag_count))
    return beta


def forward_backward(obs_seq, A, B, pos_tag_count):
    alpha = forward(obs_seq, A, B, pos_tag_count)
    beta = backward(obs_seq, A, B, pos_tag_count)
    gamma = [[alpha[t][i] * beta[t][i] for i in range(pos_tag_count)] for t in range(len(obs_seq))]
    for t in range(len(obs_seq)):
        total = sum(gamma[t])
        for i in range(pos_tag_count):
            gamma[t][i] /= total
    return gamma


def pos_tagging(A, B, pos_tags, word2id):
    # Interactive tagging: the input sentence is processed character by character
    while True:
        sentence = input("Enter a sentence (press c to quit): ")
        if sentence == "c":
            break
        obs_seq = [word2id.get(word, -1) for word in sentence]
        if -1 in obs_seq:
            print("The sentence contains unknown words!")
            continue
        gamma = forward_backward(obs_seq, A, B, len(pos_tags))
        pos_tags_list = [pos_tags[i] for i in [max(range(len(pos_tags)),
                                                   key=lambda i: gamma[t][i]) for t in range(len(obs_seq))]]
        print(" ".join([f"{word}/{pos_tag}" for word, pos_tag in zip(sentence, pos_tags_list)]))


def main():
    filepath = "msr_training.txt"
    train_data, test_data = load_data(filepath)
    words, pos_tags, word2pos = count_words_and_pos(train_data)
    A, pos_tag2id = build_transition_matrix(pos_tags, train_data)
    B, word2id = build_observation_matrix(words, pos_tags, train_data, pos_tag2id)
    # Evaluate on the test set: each test line is treated as a one-word observation sequence
    correct_count = 0
    total_count = 0
    for line in test_data:
        word, pos_tag = process_line(line)
        obs_seq = [word2id.get(word, -1)]
        if obs_seq[0] == -1:
            continue  # skip words never seen in training
        gamma = forward_backward(obs_seq, A, B, len(pos_tags))
        predict_pos_tags = [pos_tags[i] for i in [max(range(len(pos_tags)),
                                                      key=lambda i: gamma[t][i]) for t in range(len(obs_seq))]]
        correct_pos_tags = [pos_tag]
        while len(predict_pos_tags) < len(correct_pos_tags):
            predict_pos_tags.append('UNKNOWN')
        total_count += len(correct_pos_tags)
        correct_count += sum([1 for p, c in zip(predict_pos_tags, correct_pos_tags) if p == c])
    accuracy = correct_count / total_count if total_count else 0.0
    print('Accuracy: {:.2%}'.format(accuracy))
    pos_tagging(A, B, pos_tags, word2id)


if __name__ == "__main__":
    main()
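Note on the input format: process_line assumes that each line of the training file contains a single token and its POS tag separated by one space; lines with a different number of fields fall back to the tag UNKNOWN. Two illustrative (made-up) lines in that format:

我 r
北京 ns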
The main steps of the program are as follows:
- Load the data and split it into training and test sets: the load_data function reads the corpus from a file, shuffles the lines randomly, and splits them 80/20 into a training set and a test set.
- Process each line: the process_line function takes a line of text and returns a tuple of the word and its POS tag.
- Count words and POS tags: the count_words_and_pos function collects every word and POS tag that appears in the training set.
- Build the state-transition matrix: the build_transition_matrix function estimates, from the training data, the probability of moving from one POS tag to the next.
- Build the observation matrix: the build_observation_matrix function estimates, from the training data, the probability of observing a given word conditioned on a POS tag.
- Implement the forward algorithm: the forward function computes the forward probabilities.
- Implement the backward algorithm: the backward function computes the backward probabilities.
- Implement the forward-backward algorithm: the forward_backward function combines the forward and backward probabilities to obtain the POS-tag probabilities at each time step (the corresponding formulas are given right after this list).
- POS tagging: the pos_tagging function provides a simple interactive tagger; the user enters a sentence and the program prints the POS tag of each word.
- Main function: the main function calls all of the functions above, evaluates the model on the test set, and then starts the interactive tagger.
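For reference, the recursions implemented by forward, backward and forward_backward are the standard HMM forward-backward equations, with the first row of the transition matrix A standing in for a separate initial distribution, exactly as in the code above:

$$\alpha_1(i) = A_{0i}\, b_i(o_1), \qquad \alpha_t(i) = \Big(\sum_j \alpha_{t-1}(j)\, a_{ji}\Big)\, b_i(o_t)$$

$$\beta_T(i) = 1, \qquad \beta_t(i) = \sum_j a_{ij}\, b_j(o_{t+1})\, \beta_{t+1}(j)$$

$$\gamma_t(i) = \frac{\alpha_t(i)\, \beta_t(i)}{\sum_k \alpha_t(k)\, \beta_t(k)}$$

Here $a_{ij}$ and $b_i(o_t)$ are the entries of A and B, T is the length of the observation sequence, and $\gamma_t(i)$ is the posterior probability that position t carries POS tag i; the tagger simply picks the tag with the largest $\gamma_t(i)$ at each position.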
The overall approach of the program is as follows:
- Load and process the data: read the corpus from the file and split each line into a word and its POS tag. The lines are shuffled randomly and then divided into a training set and a test set.
- Count and build the matrices: count the words and POS tags that occur in the training set and build the state-transition matrix and the observation matrix. The transition matrix gives the probability of moving from one POS tag to another, and the observation matrix gives the probability of generating a particular word given a POS tag.
- Implement the forward and backward algorithms: the forward algorithm computes the total probability, summed over all paths, of reaching a given state after the observations seen so far; the backward algorithm computes the total probability of the remaining observations from a given state to the end of the sequence. Combining the two yields the POS-tag probabilities at every time step.
- POS tagging: each word in the input sentence is assigned the POS tag with the highest probability at that position.
- Evaluation: the model is evaluated on the test set by computing accuracy, i.e. the number of correctly tagged words divided by the total number of words (a small usage sketch follows this list).
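As a minimal usage sketch (assuming it is run below the function definitions above, with a tiny hypothetical in-memory corpus standing in for msr_training.txt), training and tagging look roughly like this:

# A tiny hypothetical corpus, one "word TAG" pair per line; the content is illustrative only.
toy_train = ["我 r", "爱 v", "北京 ns", "我 r", "看 v", "书 n"]

words, pos_tags, word2pos = count_words_and_pos(toy_train)
A, pos_tag2id = build_transition_matrix(pos_tags, toy_train)
B, word2id = build_observation_matrix(words, pos_tags, toy_train, pos_tag2id)

# Tag a short observation sequence made only of known words.
obs_seq = [word2id["我"], word2id["爱"], word2id["北京"]]
gamma = forward_backward(obs_seq, A, B, len(pos_tags))
predicted = [pos_tags[max(range(len(pos_tags)), key=lambda i: gamma[t][i])]
             for t in range(len(obs_seq))]
print(predicted)  # expected on this toy corpus: ['r', 'v', 'ns']

The same pattern is what main follows on the real corpus, except that it evaluates one word at a time and then hands control to the interactive pos_tagging loop.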