Chinese NLP: Implementing a CRF-Based Chinese Dependency Parsing Model

Syntactic parsing is one of the key techniques in natural language processing. Its basic task is to determine the syntactic structure of a sentence, or the dependency relations between the words in it.

It covers two main aspects. One is the grammar system of a language, i.e., a formal definition of the grammatical structure of the language's well-formed sentences; the other is parsing technology, i.e., automatically deriving a sentence's syntactic structure under a given grammar system, identifying the syntactic units the sentence contains and the relations between those units.

Hands-on: build a CRF-based Chinese dependency parsing model on Tsinghua University's syntactically annotated corpus.
1. Data preprocessing:
The given corpus cannot be fed into the model directly. A Python script, get_parser_train_test_input.py, generates the training and test sets we need (run under Linux / Git Bash):

cat train.conll | python get_parser_train_test_input.py > data/train.data
cat dev.conll | python get_parser_train_test_input.py > data/dev.data

get_parser_train_test_input.py:

#coding=utf-8
'''
If word A depends on word B, A is the child and B is the parent (head).
'''
import sys

sentence = ["Root"]

def do_parse(sentence):
    if len(sentence) == 1: return
    for line in sentence[1:]:
        line_arr = line.strip().split("\t")

        c_id = int(line_arr[0])  # child (dependent) token id
        f_id = int(line_arr[6])  # parent (head) token id

        if f_id == 0:
            print("\t".join(line_arr[2:5]) + "\t" + "0_Root")
            continue

        # coarse and fine-grained POS of the head token
        f_post, f_detail_post = sentence[f_id].strip().split("\t")[3:5]
        c_edge_post = f_post  # default: the head's coarse POS; noun heads use the fine-grained POS instead
        if f_post == "n":
            c_edge_post = f_detail_post
        # count which occurrence of this POS the head is, seen from the child
        diff = f_id - c_id  # how many steps to walk
        step = 1 if f_id > c_id else -1  # direction of each step
        same_post_num = 0  # signed count of tokens with the same POS along the way
        cmp_idx = 4 if f_post == "n" else 3  # compare fine-grained POS for noun heads, coarse otherwise
        for i in range(0, abs(diff)):
            idx = c_id + (i + 1) * step
            if sentence[idx].strip().split("\t")[cmp_idx] == c_edge_post:
                same_post_num += step
        print("\t".join(line_arr[2:5]) + "\t" + "%d_%s" % (same_post_num, c_edge_post))
    print("")

for line in sys.stdin:
    line = line.strip()
    line_arr = line.split("\t")

    # a blank line or a token id of 1 marks the start of a new sentence
    if line == "" or line_arr[0] == "1":
        do_parse(sentence)
        sentence = ["Root"]
    if line == "": continue
    sentence.append(line)

do_parse(sentence)  # handle the last sentence if the input does not end with a blank line
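
Each output line is the word, its coarse POS, its fine POS, and a tag of the form <signed count>_<head POS>: the count says which token with that POS, walking from the current word toward its head, the head is (positive = to the right, negative = to the left); 0_Root marks the root, and noun heads are encoded with their fine-grained POS. As a sanity check, the following sketch pipes a hand-built toy sentence through the script (the CoNLL lines are reconstructed by hand to reproduce the example sentence used for prediction at the end of this post; the feats and deprel columns are placeholders):

# label_demo.py -- hypothetical sanity check, not part of the original post
import subprocess

# assumed CoNLL column order: id, form, lemma, coarse POS, fine POS, feats, head, deprel
rows = [
    "1\t坚决\t坚决\ta\tad\t_\t2\t_",
    "2\t惩治\t惩治\tv\tv\t_\t0\t_",
    "3\t贪污\t贪污\tv\tv\t_\t7\t_",
    "4\t贿赂\t贿赂\tn\tn\t_\t3\t_",
    "5\t等\t等\tu\tudeng\t_\t3\t_",
    "6\t经济\t经济\tn\tn\t_\t7\t_",
    "7\t犯罪\t犯罪\tv\tvn\t_\t2\t_",
]
conll = "\n".join(rows) + "\n\n"  # trailing blank line ends the sentence
out = subprocess.run(["python", "get_parser_train_test_input.py"],
                     input=conll, capture_output=True, text=True).stdout
print(out)
# expected tags: 1_v, 0_Root, 1_v, -1_v, -1_v, 1_v, -2_v -- the `sen` example below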

2. CRF model training and prediction code:

# -*- coding: utf-8 -*-
# sklearn_crfsuite.CRF is the model; metrics is used to evaluate model
# performance; joblib saves and loads the trained model
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib

base_dir = "./"
# Class containing the feature-processing methods
class CorpusProcess(object):

    def __init__(self):
        """
        Initialization
        Set the paths of the preprocessed corpora
        """
        self.train_process_path = base_dir + "data/train.data"  # preprocessed training set (output of step 1)
        self.test_process_path = base_dir + "data/dev.data"  # preprocessed test set (output of step 1)

    def read_corpus_from_file(self, file_path):
        """Read the corpus"""
        # a with-block closes the file automatically
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.readlines()

    def write_corpus_to_file(self, data, file_path):
        """Write the corpus"""
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(data))

    def process_sentence(self, lines):
        """
        Group lines into sentences
        A blank line ends a sentence; every other line is split into
        [word, coarse POS, fine POS, tag]
        """
        sentence = []
        for line in lines:
            if not line.strip():
                yield sentence
                sentence = []
            else:
                sentence.append(line.strip().split('\t'))
        if sentence:  # last sentence, in case the file does not end with a blank line
            yield sentence

    def initialize(self):
        """
        语料初始化
        通过 initialize 函数调用上面 read_corpus_from_file 方法读取语料,分别加载训练集和测试集
        """
        train_lines = self.read_corpus_from_file(self.train_process_path)
        test_lines = self.read_corpus_from_file(self.test_process_path)
        self.train_sentences = [sentence for sentence in self.process_sentence(train_lines)]
        self.test_sentences = [sentence for sentence in self.process_sentence(test_lines)]
        print("self.train_sentences:", len(self.train_sentences), self.train_sentences[:2])

    def generator(self, train=True):
        """
        Feature generator
        Build the feature set for either the training set or the test set
        """
        # train=True processes the training set, train=False the test set
        if train:
            sentences = self.train_sentences
        else:
            sentences = self.test_sentences
        return self.extract_feature(sentences)

    def extract_feature(self, sentences):
        """
        Extract features
        A simple 3-gram window: pair each word with its POS tag and with its
        neighbors; return the feature list and the tag list
        """
        features, tags = [], []
        for index in range(len(sentences)):
            feature_list, tag_list = [], []
            for i in range(len(sentences[index])):
                feature = {"w0": sentences[index][i][0],
                           "p0": sentences[index][i][1],
                           "w-1": sentences[index][i - 1][0] if i != 0 else "BOS",
                           "w+1": sentences[index][i + 1][0] if i != len(sentences[index]) - 1 else "EOS",
                           "p-1": sentences[index][i - 1][1] if i != 0 else "un",
                           "p+1": sentences[index][i + 1][1] if i != len(sentences[index]) - 1 else "un"}
                feature["w-1:w0"] = feature["w-1"] + feature["w0"]
                feature["w0:w+1"] = feature["w0"] + feature["w+1"]
                feature["p-1:p0"] = feature["p-1"] + feature["p0"]
                feature["p0:p+1"] = feature["p0"] + feature["p+1"]
                feature["p-1:w0"] = feature["p-1"] + feature["w0"]
                feature["w0:p+1"] = feature["w0"] + feature["p+1"]
                feature_list.append(feature)
                tag_list.append(sentences[index][i][-1])
            features.append(feature_list)
            tags.append(tag_list)
        return features, tags
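
    # For illustration (hand-worked, not from the original post): given the token
    # ['惩治', 'v', 'v', '0_Root'] in the example sentence used for prediction at
    # the bottom of this post, extract_feature builds roughly
    #   {"w0": "惩治", "p0": "v", "w-1": "坚决", "w+1": "贪污",
    #    "p-1": "a", "p+1": "v", "w-1:w0": "坚决惩治", "p0:p+1": "vv", ...}
    # and the corresponding tag "0_Root".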

class ModelParser(object):

    def __init__(self):
        """
        Initialize parameters
        Set the model hyperparameters, then instantiate and initialize the
        CorpusProcess preprocessing class
        """
        self.algorithm = "lbfgs"
        self.c1 = 0.1
        self.c2 = 0.1
        self.max_iterations = 100
        self.model_path = "model.pkl"
        self.corpus = CorpusProcess()  # instantiate CorpusProcess
        self.corpus.initialize()  # corpus preprocessing
        self.model = None

    def initialize_model(self):
        """Model initialization"""
        algorithm = self.algorithm
        c1 = float(self.c1)
        c2 = float(self.c2)
        max_iterations = int(self.max_iterations)
        # initialize the sklearn_crfsuite.CRF model
        self.model = sklearn_crfsuite.CRF(algorithm=algorithm, c1=c1, c2=c2,
                                          max_iterations=max_iterations, all_possible_transitions=True)
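        # Notes (added, not in the original post): "lbfgs" selects L-BFGS
        # optimization; c1 and c2 are the L1 and L2 regularization weights;
        # all_possible_transitions=True also learns transition weights for
        # label pairs that never co-occur in the training data.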

    def train(self):
        """Training"""
        self.initialize_model()
        x_train, y_train = self.corpus.generator()
        print("x_train:\n", x_train[:2])
        print("y_train:\n", y_train[:2])
        # fit trains the model
        self.model.fit(x_train, y_train)
        labels = list(self.model.classes_)
        print("labels:", labels)
        x_test, y_test = self.corpus.generator(train=False)
        y_predict = self.model.predict(x_test)
        print("y_test:\n", y_test[:2])
        print("y_predict:\n", y_predict[:2])
        # metrics.flat_f1_score measures F1 on the test set
        f1 = metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels)
        print("weighted F1:", f1)
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))  # sort tags by their POS part
        print("sorted_labels:", sorted_labels)
        print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))
        # save the model
        self.save_model()

    def predict(self, sentences):
        """Model prediction"""
        self.load_model()
        features, _ = self.corpus.extract_feature(sentences)
        return self.model.predict(features)

    def load_model(self):
        """Load the model"""
        self.model = joblib.load(self.model_path)

    def save_model(self):
        """Save the model"""
        joblib.dump(self.model, self.model_path)


model = ModelParser()
model.train()

sen = [[['坚决', 'a', 'ad', '1_v'],
        ['惩治', 'v', 'v', '0_Root'],
        ['贪污', 'v', 'v', '1_v'],
        ['贿赂', 'n', 'n', '-1_v'],
        ['等', 'u', 'udeng', '-1_v'],
        ['经济', 'n', 'n', '1_v'],
        ['犯罪', 'v', 'vn', '-2_v']]]
print(model.predict(sen))
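
The predicted tags can be turned back into head indices by inverting the encoding from step 1. A minimal sketch follows (decode_heads is a hypothetical helper, not part of the original post; it assumes the fine-grained POS was used only for noun heads, i.e., for tags whose POS part starts with "n"):

def decode_heads(tokens, labels):
    """tokens: rows like those in `sen` ([word, coarse POS, fine POS, ...]);
    labels: tags such as "1_v". Returns 1-based head ids, 0 for the root."""
    heads = []
    for i, label in enumerate(labels):
        count_str, pos = label.split("_", 1)
        count = int(count_str)
        if count == 0:                    # "0_Root": this token is the root
            heads.append(0)
            continue
        step = 1 if count > 0 else -1     # walk right for positive counts, left for negative
        use_fine = pos.startswith("n")    # assumption: noun heads were encoded with fine POS
        seen, j = 0, i
        while seen != count and 0 <= j + step < len(tokens):
            j += step
            tok_pos = tokens[j][2] if use_fine else tokens[j][1]
            if tok_pos == pos:
                seen += step
        heads.append(j + 1 if seen == count else -1)  # -1 marks an undecodable tag
    return heads

tags = model.predict(sen)[0]
print(decode_heads(sen[0], tags))
# e.g. [2, 0, 7, 3, 3, 7, 2] if the predictions match the gold tags in `sen`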

Original article:
https://soyoger.blog.csdn.net/article/details/108729395
Data:
https://codechina.csdn.net/mirrors/sujeek/chinese_nlp?utm_source=csdn_github_accelerator
