DuEE 信息抽取:篇章级数据预处理

import os
import sys
import json

def read_by_lines(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line has leading and trailing whitespace (including the newline)
    stripped. Blank lines are kept as empty strings.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(path, "r", encoding="utf8") as infile:
        return [raw_line.strip() for raw_line in infile]

def write_by_lines(path, data):
    """Write each string in ``data`` to ``path`` as one UTF-8 line.

    The file is created or truncated. A newline is appended to every item,
    so an empty ``data`` produces an empty file.

    Args:
        path: Path of the file to write.
        data: Iterable of strings, one per output line.
    """
    with open(path, "w", encoding="utf8") as outfile:
        # A generator feeds writelines directly; no throwaway list of
        # write() return values is built just for the side effect.
        outfile.writelines(d + "\n" for d in data)

def text_to_sents(text):
    """Split ``text`` into sentences at sentence-ending punctuation.

    The text is first split on newlines; paragraphs that are empty or
    whitespace-only are skipped. Within a paragraph, a sentence ends right
    after a delimiter, and the delimiter stays attached to its sentence.
    Any text after the last delimiter becomes a final sentence. Sentences
    are not stripped of surrounding whitespace.

    Args:
        text: The document text to split.

    Returns:
        A list of sentence strings in document order.
    """
    # Sentence enders: the ideographic full stop plus both the ASCII and the
    # full-width forms of the question and exclamation marks. The full-width
    # forms are written as escapes so they cannot be confused with, or
    # normalised into, their ASCII look-alikes.
    delimiter_symbols = frozenset(
        (u"\u3002", u"?", u"!", u"\uff1f", u"\uff01")
    )
    ret = []
    for para in text.split("\n"):
        if not para.strip():
            continue
        start = 0  # index where the sentence currently being read begins
        for pos, char in enumerate(para):
            if char in delimiter_symbols:
                ret.append(para[start:pos + 1])
                start = pos + 1
        # Keep trailing text that is not closed by a delimiter.
        if start < len(para):
            ret.append(para[start:])
    return ret

# Quick check of the splitter: the sample holds one sentence ending in "。"
# followed by the unterminated tail "mmm".
a = text_to_sents("这些分隔符用于在中文文本中识别和分割句子。mmm")

import hashlib

def calculate_md5(input_str):
    """Return the MD5 digest of ``input_str`` as a hexadecimal string.

    The string is encoded as UTF-8 before hashing.

    Args:
        input_str: The text to hash.

    Returns:
        The 32-character hexadecimal digest.
    """
    hasher = hashlib.md5()
    hasher.update(input_str.encode("utf-8"))
    return hasher.hexdigest()
# Demo: print the MD5 hex digest of a fixed sample string.
input_string = "hello world"
print(calculate_md5(input_string))

# Directory that receives the generated tag dictionaries.
conf_dir = "./conf/DuEE-Fin"

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(conf_dir, exist_ok=True)

# Event schema file; it is read line by line and each line is parsed as JSON.
# NOTE(review): this path spells the dataset folder "DuEE-fin" while data_dir
# further down uses "DuEE-Fin". On a case-sensitive filesystem these are two
# different directories -- confirm which spelling is intended.
schema_path = "./datasets/DuEE-fin/duee_fin_event_schema.json"

tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir)  # trigger tag dictionary
tags_role_path = "{}/role_tag.dict".format(conf_dir)  # role tag dictionary
tags_enum_path = "{}/enum_tag.dict".format(conf_dir)  # enum tag dictionary

def label_add(labels, _type):
    """Append the ``B-``/``I-`` tag pair for ``_type`` unless already present.

    Presence is decided by the ``B-<_type>`` tag alone. The list is modified
    in place and the same list object is returned.

    Args:
        labels: List of tag strings collected so far.
        _type: Name to build the ``B-``/``I-`` tags from.

    Returns:
        ``labels``, with the two tags appended when they were missing.
    """
    begin_tag = "B-{}".format(_type)
    if begin_tag in labels:
        return labels
    labels.append(begin_tag)
    labels.append("I-{}".format(_type))
    return labels

# Trigger tags: one B-/I- pair per distinct event type in the schema,
# followed by the "O" tag.
schema_l = read_by_lines(schema_path)  # schema lines, one JSON object each
labels = []
for line in schema_l:
    d_json = json.loads(line.strip())
    labels = label_add(labels, d_json["event_type"])
labels.append("O")

# Rows are "<index>\t<label>" strings so they can be written out directly.
tags_trigger = [
    "{}\t{}".format(index, label) for index, label in enumerate(labels)
]

write_by_lines(tags_trigger_path, tags_trigger)

# Role tags: one B-/I- pair per distinct role name across all event types,
# followed by the "O" tag. The role named by enum_role is left out here.
enum_role = "环节"
labels = []
for line in schema_l:
    d_json = json.loads(line.strip())
    for role in d_json["role_list"]:
        if role["role"] != enum_role:
            labels = label_add(labels, role["role"])
labels.append("O")

# Rows are "<index>\t<label>" strings so they can be written out directly.
tags_roles = [
    "{}\t{}".format(index, label) for index, label in enumerate(labels)
]

write_by_lines(tags_role_path, tags_roles)

# Enum tags: the "enum_items" list of the role named by enum_role. If more
# than one schema entry carries that role, the last one seen replaces the
# earlier lists rather than extending them.
enum_role = "环节"
labels = []
for line in schema_l:
    d_json = json.loads(line.strip())
    for role in d_json["role_list"]:
        if role["role"] != enum_role:
            continue
        labels = role["enum_items"]

# Rows are "<index>\t<label>" strings so they can be written out directly.
tags_enums = [
    "{}\t{}".format(index, label) for index, label in enumerate(labels)
]

write_by_lines(tags_enum_path, tags_enums)

# data process
# NOTE(review): data_dir spells the dataset folder "DuEE-Fin", while the
# input files in this script are read from "./datasets/DuEE-fin/". On a
# case-sensitive filesystem these are two different directories -- confirm
# which spelling is intended.
data_dir = "./datasets/DuEE-Fin"

# Sub-directories under data_dir. Only sentence_dir is created at this point;
# the other three are just path strings here.
sentence_dir = "{}/sentence".format(data_dir)
trigger_save_dir = "{}/trigger".format(data_dir)
role_save_dir = "{}/role".format(data_dir)
enum_save_dir = "{}/enum".format(data_dir)

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(sentence_dir, exist_ok=True)

# Training file lines, stripped; the schema file in the same folder is
# parsed one JSON object per line.
x_train = read_by_lines("./datasets/DuEE-fin/duee_fin_train.json")

def argument_in_sent(sent, argument_list, trigger):
        trigger_start = sent.find(

  • 8
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值