句法分析是自然语言处理中的关键技术之一,其基本任务是确定句子的句法结构或者句子中词汇之间的依存关系。
句法分析主要包括两方面的内容:一方面是确定语言的语法体系,即对语言中合法句子的语法结构给予形式化的定义;另一方面是句法分析技术,即根据给定的语法体系,自动推导出句子的句法结构,分析句子所包含的句法单位以及这些句法单位之间的关系。
实战:通过清华大学的句法标注语料库,实现基于 CRF 的中文句法依存分析模型
1.数据预处理:
给定的语料不能直接用于模型训练,需要先通过 Python 脚本 get_parser_train_test_input.py 生成所需的训练集和测试集(在 Linux 或 Git Bash 下执行):
cat train.conll | python get_parser_train_test_input.py > train.data
cat dev.conll | python get_parser_train_test_input.py > dev.data
get_parser_train_test_input.py:
#coding=utf-8
'''
词A依赖词B,A就是孩子,B就是父亲
'''
import sys
sentence = ["Root"]
def do_parse(sentence):
    """Print one labelled row per token of ``sentence``, then a blank line.

    ``sentence`` is a list whose first element is the placeholder "Root" and
    whose remaining elements are tab-separated token lines.  Column 0 is read
    as the token id and column 6 as the id of its head (the token it depends
    on); columns 2-4 are echoed unchanged, followed by a label column.

    The label is ``0_Root`` when the head id is 0.  Otherwise it is
    ``<offset>_<pos>``: ``pos`` is the head's coarse POS (column 3), or its
    detailed POS (column 4) when the coarse POS is "n"; ``offset`` counts the
    tokens carrying that POS on the way from the child to the head
    (head included), negative when the head lies to the left.

    A sentence holding only the "Root" placeholder prints nothing.
    """
    if len(sentence) == 1:
        return

    # Split every row once up front instead of re-splitting on each lookup.
    rows = [raw.strip().split("\t") for raw in sentence]

    for fields in rows[1:]:
        child_id = int(fields[0])
        head_id = int(fields[6])
        echoed = "\t".join(fields[2:5]) + "\t"

        if head_id == 0:
            print(echoed + "0_Root")
            continue

        head_coarse, head_detail = rows[head_id][3:5]
        # Nouns are distinguished by their detailed POS, everything else by
        # the coarse POS; the same column is used for the comparison below.
        if head_coarse == "n":
            label_pos, column = head_detail, 4
        else:
            label_pos, column = head_coarse, 3

        # Walk from the token next to the child up to and including the head.
        direction = 1 if head_id > child_id else -1
        path = range(child_id + direction, head_id + direction, direction)
        hits = sum(1 for idx in path if rows[idx][column] == label_pos)

        print(echoed + "%d_%s" % (hits * direction, label_pos))

    print("")
# Read token lines from stdin and hand each completed sentence to do_parse.
for line in sys.stdin:
    line = line.strip()
    line_arr = line.split("\t")
    # A blank line, or a token whose id is "1", marks a sentence boundary.
    if line == "" or line_arr[0] == "1":
        do_parse(sentence)
        sentence = ["Root"]
    if line == "":
        continue
    sentence.append(line)

# Flush the last sentence: without this it is lost whenever the input does
# not end with a blank line.  do_parse prints nothing for a bare ["Root"],
# so input that does end with a blank line produces the same output as before.
do_parse(sentence)
sentence = ["Root"]
2.CRF模型训练和预测代码:
# -*- coding: utf-8 -*-
# 使用模型 sklearn_crfsuite .CRF,metrics 用来进行模型性能测试,joblib 用来保存和加载训练好的模型
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib
dir = ".//"
# 包含特征处理方法的类
class CorpusProcess(object):
    """Load the preprocessed corpus and turn it into CRF features and tags."""

    def __init__(self):
        """Record the paths of the preprocessed training and test files.

        The paths are built from the module-level ``dir`` prefix.
        """
        self.train_process_path = dir + "data//train2.data"  # preprocessed training set
        self.test_process_path = dir + "data//dev2.data"  # preprocessed test set

    def read_corpus_from_file(self, file_path):
        """Return all lines of the UTF-8 text file at ``file_path``."""
        # The context manager closes the file even if reading fails.
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.readlines()

    def write_corpus_to_file(self, data, file_path):
        """Write ``str(data)`` to ``file_path`` as UTF-8.

        The encoding is explicit so that files written here can be read back
        by ``read_corpus_from_file`` regardless of the platform default.
        """
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(data))

    def process_sentence(self, lines):
        """Group raw corpus lines into sentences.

        Each non-blank line is stripped and split on tabs into a list of
        columns; a blank line ends the current sentence.  Yields one list of
        column lists per sentence.  Empty sentences (leading or repeated
        blank lines) are skipped, and a final sentence that is not followed
        by a blank line is still yielded.
        """
        sentence = []
        for line in lines:
            stripped = line.strip()
            if stripped:
                sentence.append(stripped.split(u'\t'))
            elif sentence:
                yield sentence
                sentence = []
        # The input may end without a trailing blank line.
        if sentence:
            yield sentence

    def initialize(self):
        """Read both corpus files and store them as lists of sentences.

        Sets ``self.train_sentences`` and ``self.test_sentences`` and prints
        the training sentence count together with the first two sentences.
        """
        train_lines = self.read_corpus_from_file(self.train_process_path)
        test_lines = self.read_corpus_from_file(self.test_process_path)
        self.train_sentences = list(self.process_sentence(train_lines))
        self.test_sentences = list(self.process_sentence(test_lines))
        print("self.train_sentences:", len(self.train_sentences), self.train_sentences[:2])

    def generator(self, train=True):
        """Return ``(features, tags)`` for the training set or the test set.

        ``train=True`` selects ``self.train_sentences``; ``train=False``
        selects ``self.test_sentences``.
        """
        sentences = self.train_sentences if train else self.test_sentences
        return self.extract_feature(sentences)

    def extract_feature(self, sentences):
        """Build per-token feature dicts and tag lists for ``sentences``.

        For every token, column 0 is used as the word, column 1 as the POS
        and the last column as the tag.  Features cover the token itself, its
        left and right neighbours ("BOS"/"EOS" for missing words, "un" for
        missing POS) and pairwise concatenations of those values.

        Returns ``(features, tags)``: two lists with one entry per sentence,
        each entry holding one item per token.
        """
        features, tags = [], []
        for sentence in sentences:
            last = len(sentence) - 1
            feature_list, tag_list = [], []
            for i, token in enumerate(sentence):
                is_first = i == 0
                is_last = i == last
                feature = {
                    "w0": token[0],
                    "p0": token[1],
                    "w-1": "BOS" if is_first else sentence[i - 1][0],
                    "w+1": "EOS" if is_last else sentence[i + 1][0],
                    "p-1": "un" if is_first else sentence[i - 1][1],
                    "p+1": "un" if is_last else sentence[i + 1][1],
                }
                # Bigram-style combinations of neighbouring words and POS tags.
                feature["w-1:w0"] = feature["w-1"] + feature["w0"]
                feature["w0:w+1"] = feature["w0"] + feature["w+1"]
                feature["p-1:p0"] = feature["p-1"] + feature["p0"]
                feature["p0:p+1"] = feature["p0"] + feature["p+1"]
                feature["p-1:w0"] = feature["p-1"] + feature["w0"]
                feature["w0:p+1"] = feature["w0"] + feature["p+1"]
                feature_list.append(feature)
                tag_list.append(token[-1])
            features.append(feature_list)
            tags.append(tag_list)
        return features, tags
class ModelParser(object):
    """Train, evaluate, save and apply a CRF model over the parsed corpus."""

    def __init__(self):
        """Set the CRF hyper-parameters and load the corpus.

        Instantiates ``CorpusProcess`` and calls its ``initialize`` method,
        so constructing this object reads the corpus files.
        """
        self.algorithm = "lbfgs"
        self.c1 = 0.1
        self.c2 = 0.1
        self.max_iterations = 100
        self.model_path = "model.pkl"
        self.corpus = CorpusProcess()  # corpus loader / feature extractor
        self.corpus.initialize()  # read the training and test sentences
        self.model = None

    def initialize_model(self):
        """Create a fresh ``sklearn_crfsuite.CRF`` from the stored parameters."""
        algorithm = self.algorithm
        c1 = float(self.c1)
        c2 = float(self.c2)
        max_iterations = int(self.max_iterations)
        self.model = sklearn_crfsuite.CRF(algorithm=algorithm, c1=c1, c2=c2,
                                          max_iterations=max_iterations,
                                          all_possible_transitions=True)

    def train(self):
        """Fit the model on the training set, report test metrics, save it.

        Prints samples of the training data, the learned labels, sample
        predictions, the weighted F1 score and a per-label classification
        report, then writes the model to ``self.model_path``.
        """
        self.initialize_model()
        x_train, y_train = self.corpus.generator()
        print("x_train:\n", x_train[:2])
        print("y_train:\n", y_train[:2])
        self.model.fit(x_train, y_train)
        labels = list(self.model.classes_)
        print("labels:", labels)
        x_test, y_test = self.corpus.generator(train=False)
        y_predict = self.model.predict(x_test)
        print("y_test:\n", y_test[:2])
        print("y_predict:\n", y_predict[:2])
        # The score used to be computed and thrown away; report it.
        f1_score = metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels)
        print("weighted F1:", f1_score)
        # Order the report rows by the label text after its first character,
        # then by that first character.
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print("sorted_labels:", sorted_labels)
        print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))
        self.save_model()

    def predict(self, sentences):
        """Return the predicted tag sequences for ``sentences``.

        The model is reloaded from ``self.model_path`` on every call.  The
        tags contained in ``sentences`` are ignored; only features are used.
        """
        self.load_model()
        features, _ = self.corpus.extract_feature(sentences)
        return self.model.predict(features)

    def load_model(self, name='model'):
        """Load the model from ``self.model_path`` into ``self.model``.

        ``name`` is unused; it is kept so existing callers keep working.
        """
        # NOTE: joblib.load unpickles the file, so only load trusted models.
        self.model = joblib.load(self.model_path)

    def save_model(self, name='model'):
        """Dump ``self.model`` to ``self.model_path``.

        ``name`` is unused; it is kept so existing callers keep working.
        """
        joblib.dump(self.model, self.model_path)
# Build the parser (this loads the corpus) and train / evaluate / save it.
model = ModelParser()
model.train()

# A single hand-written sentence for a quick prediction check.  Each token is
# a list of columns; predict() uses column 0 as the word and column 1 as the
# POS, and ignores the label in the last column.
sen = [
    [
        ["坚决", "a", "ad", "1_v"],
        ["惩治", "v", "v", "0_Root"],
        ["贪污", "v", "v", "1_v"],
        ["贿赂", "n", "n", "-1_v"],
        ["等", "u", "udeng", "-1_v"],
        ["经济", "n", "n", "1_v"],
        ["犯罪", "v", "vn", "-2_v"],
    ],
]
print(model.predict(sen))
原文:
https://soyoger.blog.csdn.net/article/details/108729395
数据:
https://codechina.csdn.net/mirrors/sujeek/chinese_nlp?utm_source=csdn_github_accelerator