Information extraction aims to extract structured knowledge, such as entities, relations, and events, from unstructured natural-language text. Event extraction, given a natural-language sentence and a pre-specified set of event types and argument roles, identifies every event of the target types in the sentence and extracts the arguments that fill the corresponding roles. The target event type (event_type) and the argument roles (role) delimit the scope of extraction, e.g. (event_type: 胜负, role: 时间, 胜者, 败者, 赛事名称) or (event_type: 夺冠, role: 夺冠事件, 夺冠赛事, 冠军).
For example, an annotated sample from the DuEE training data looks like this:
{"text": "雀巢裁员4000人:时代抛弃你时,连招呼都不会打!", "id": "409389c96efe78d6af1c86e0450fd2d7", "event_list": [{"event_type": "组织关系-裁员", "trigger": "裁员", "trigger_start_index": 2, "arguments": [{"argument_start_index": 0, "role": "裁员方", "argument": "雀巢", "alias": []}, {"argument_start_index": 4, "role": "裁员人数", "argument": "4000人", "alias": []}], "class": "组织关系"}]}
The trigger is an instance, and the event type (event_type) is the class of that trigger instance.
Likewise, an argument is an instance, and the argument role (role) is the class of that argument instance.
So event extraction is just a combination of basic tasks, a bundle of classification (sequence-labeling) subtasks under a different name; the sketch below makes the decomposition concrete.
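As a minimal illustration (not code from the repo), here is how one annotated sample decomposes into the two sequence-labeling subtasks: BIO tags over characters, with the event type tagging the trigger span and each role tagging its argument span. The bio_labels helper is hypothetical.

def bio_labels(text, spans):
    """spans: (start, length, tag) triples -> one BIO label per character."""
    labels = ["O"] * len(text)
    for start, length, tag in spans:
        labels[start] = "B-" + tag
        for i in range(start + 1, start + length):
            labels[i] = "I-" + tag
    return labels

sample = {
    "text": "雀巢裁员4000人",
    "event_list": [{
        "event_type": "组织关系-裁员", "trigger": "裁员", "trigger_start_index": 2,
        "arguments": [
            {"argument_start_index": 0, "role": "裁员方", "argument": "雀巢"},
            {"argument_start_index": 4, "role": "裁员人数", "argument": "4000人"},
        ],
    }],
}

event = sample["event_list"][0]
# Trigger task: the event type labels the trigger span.
print(bio_labels(sample["text"],
                 [(event["trigger_start_index"], len(event["trigger"]),
                   event["event_type"])]))
# Role task: each role labels its argument span.
print(bio_labels(sample["text"],
                 [(a["argument_start_index"], len(a["argument"]), a["role"])
                  for a in event["arguments"]]))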
In event_schema.json, the event_type field supplies the types that go into the trigger label file (trigger_tag.dict), and the class field is the coarse-grained category that groups related event_type values. For example:
{"role_list": [{"role": "上市公司"}, {"role": "证券代码"}, {"enum_items": ["筹备上市", "暂停上市", "正式上市", "终止上市"], "role": "环节"}, {"role": "披露时间"}, {"role": "发行价格"}, {"role": "事件时间"}, {"role": "市值"}, {"role": "募资金额"}], "event_type": "公司上市", "id": "0bb90bf676836936f8d687513114b454"}
enum_items is a field nested inside a role entry: when it is present (here for the 环节 role of the 公司上市 event type), that role is not extracted as a text span but classified over the enumerated values.
1. Generating the label lists
From the overall schema JSON file, generate a separate label file for each task. For the sequence-labeling tasks (trigger and role), each type produces a B- and an I- label on top of the type name; for the enum task the enumerated values are kept as plain classification labels.
import json

# "环节" is the only role whose values are enumerated rather than extracted
# as spans; the DuEE-Fin data-prepare script defines it once as enum_role.
enum_role = "环节"

def read_by_lines(path):
    """Read a file into a list of stripped lines (helper from the example's utils.py)."""
    with open(path, "r", encoding="utf-8") as infile:
        return [line.strip() for line in infile]

def schema_process(path, model="trigger"):
    """Build the label list for one task from the schema file."""

    def label_add(labels, _type):
        """Append the B-/I- pair for a type if it is not present yet."""
        if "B-{}".format(_type) not in labels:
            labels.extend(["B-{}".format(_type), "I-{}".format(_type)])
        return labels

    labels = []
    for line in read_by_lines(path):
        d_json = json.loads(line.strip())
        if model == "trigger":
            # one B-/I- pair per event type
            labels = label_add(labels, d_json["event_type"])
        elif model == "role":
            # one B-/I- pair per role, skipping the enumerable role
            for role in d_json["role_list"]:
                if role["role"] == enum_role:
                    continue
                labels = label_add(labels, role["role"])
        elif model == "enum":
            # the enum task is plain classification over the listed values
            for role in d_json["role_list"]:
                if role["role"] == enum_role:
                    labels = role["enum_items"]
    labels.append("O")
    tags = []
    for index, label in enumerate(labels):
        tags.append("{}\t{}".format(index, label))
    if model == "enum":
        # classification labels need no "O" tag, so drop it again
        tags = tags[:-1]
    return tags
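A usage sketch for the function above. The conf/DuEE-Fin output paths follow the layout of the PaddleNLP DuEE-Fin example, and write_by_lines mirrors read_by_lines; treat both as assumptions if your checkout differs.

def write_by_lines(path, data):
    """Write a list of lines to a file (counterpart of read_by_lines)."""
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.writelines(d + "\n" for d in data)

schema_path = "./data/DuEE-Fin/duee_fin_event_schema.json"  # assumed location
write_by_lines("./conf/DuEE-Fin/trigger_tag.dict", schema_process(schema_path, "trigger"))
write_by_lines("./conf/DuEE-Fin/role_tag.dict", schema_process(schema_path, "role"))
write_by_lines("./conf/DuEE-Fin/enum_tag.dict", schema_process(schema_path, "enum"))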
2. Generating the data lists for each task
First sample at the sentence level: convert documents and paragraphs into sentence samples, then build the samples for the other tasks on top of those sentence samples. A rough sketch of the splitting idea comes first, then the driver script.
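The driver below relies on docs_data_process for the document-to-sentence conversion. Its real implementation also carries over ids and labels; split_doc here is a hypothetical sketch of just the splitting step.

import re

def split_doc(text, max_len=400):
    """Naive sketch: cut a document into sentence-like pieces on Chinese
    end-of-sentence punctuation, further cutting anything over max_len."""
    parts = [p for p in re.split(r"(?<=[。!?!?])", text) if p.strip()]
    return [p[i:i + max_len] for p in parts for i in range(0, len(p), max_len)]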
import os

# Data process: first split the long documents into sentence-level samples,
# then build the per-task samples (trigger, role, enum) from those sentences.
# docs_data_process, data_process and enum_data_process are the converters
# defined in the same DuEE-Fin data-prepare script.
data_dir = "./data/DuEE-Fin"
sentence_dir = "{}/sentence".format(data_dir)
trigger_save_dir = "{}/trigger".format(data_dir)
role_save_dir = "{}/role".format(data_dir)
enum_save_dir = "{}/enum".format(data_dir)

print("\n=================start data process==============")
print("\n********** start document process **********")
if not os.path.exists(sentence_dir):
    os.makedirs(sentence_dir)
train_sent = docs_data_process("{}/duee_fin_train.json".format(data_dir))
write_by_lines("{}/train.json".format(sentence_dir), train_sent)
dev_sent = docs_data_process("{}/duee_fin_dev.json".format(data_dir))
write_by_lines("{}/dev.json".format(sentence_dir), dev_sent)
test_sent = docs_data_process("{}/duee_fin_test1.json".format(data_dir))
write_by_lines("{}/test.json".format(sentence_dir), test_sent)
print("train {} dev {} test {}".format(
    len(train_sent), len(dev_sent), len(test_sent)))
print("********** end document process **********")

print("\n********** start sentence process **********")
print("\n----trigger------for dir {} to {}".format(sentence_dir,
                                                   trigger_save_dir))
if not os.path.exists(trigger_save_dir):
    os.makedirs(trigger_save_dir)
train_tri = data_process("{}/train.json".format(sentence_dir), "trigger")
write_by_lines("{}/train.tsv".format(trigger_save_dir), train_tri)
dev_tri = data_process("{}/dev.json".format(sentence_dir), "trigger")
write_by_lines("{}/dev.tsv".format(trigger_save_dir), dev_tri)
test_tri = data_process("{}/test.json".format(sentence_dir), "trigger")
write_by_lines("{}/test.tsv".format(trigger_save_dir), test_tri)
print("train {} dev {} test {}".format(
    len(train_tri), len(dev_tri), len(test_tri)))

print("\n----role------for dir {} to {}".format(sentence_dir, role_save_dir))
if not os.path.exists(role_save_dir):
    os.makedirs(role_save_dir)
train_role = data_process("{}/train.json".format(sentence_dir), "role")
write_by_lines("{}/train.tsv".format(role_save_dir), train_role)
dev_role = data_process("{}/dev.json".format(sentence_dir), "role")
write_by_lines("{}/dev.tsv".format(role_save_dir), dev_role)
test_role = data_process("{}/test.json".format(sentence_dir), "role")
write_by_lines("{}/test.tsv".format(role_save_dir), test_role)
print("train {} dev {} test {}".format(
    len(train_role), len(dev_role), len(test_role)))

print("\n----enum------for dir {} to {}".format(sentence_dir, enum_save_dir))
if not os.path.exists(enum_save_dir):
    os.makedirs(enum_save_dir)
train_enum = enum_data_process("{}/train.json".format(sentence_dir))
write_by_lines("{}/train.tsv".format(enum_save_dir), train_enum)
dev_enum = enum_data_process("{}/dev.json".format(sentence_dir))
write_by_lines("{}/dev.tsv".format(enum_save_dir), dev_enum)
test_enum = enum_data_process("{}/test.json".format(sentence_dir))
write_by_lines("{}/test.tsv".format(enum_save_dir), test_enum)
print("train {} dev {} test {}".format(
    len(train_enum), len(dev_enum), len(test_enum)))
print("********** end sentence process **********")
print("=================end data process==============")
3. Training each task
Each task is trained separately on its own generated samples, and prediction runs on the shared sentence-level samples.
Checkpoints and predictions are saved under each model's own directory.
At submission time, the per-task predictions are merged into the final result file; a minimal span-decoding sketch of that merging step follows.
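Merging has to turn the models' BIO label sequences back into (role, argument) pairs. The repo has its own postprocess script; extract_spans below is a hypothetical minimal sketch of the span-decoding part only.

def extract_spans(chars, labels):
    """Decode one BIO sequence into (tag, span_text) pairs."""
    spans, buf, cur = [], [], None
    for ch, lab in zip(chars, labels):
        if lab.startswith("B-"):
            if buf:
                spans.append((cur, "".join(buf)))
            buf, cur = [ch], lab[2:]
        elif lab.startswith("I-") and cur == lab[2:]:
            buf.append(ch)
        else:
            if buf:
                spans.append((cur, "".join(buf)))
            buf, cur = [], None
    if buf:
        spans.append((cur, "".join(buf)))
    return spans

print(extract_spans(list("雀巢裁员4000人"),
                    ["B-裁员方", "I-裁员方", "O", "O",
                     "B-裁员人数", "I-裁员人数", "I-裁员人数", "I-裁员人数", "I-裁员人数"]))
# [('裁员方', '雀巢'), ('裁员人数', '4000人')]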
Custom data loading
Two variants of the dataset class appear in the example, both named DuEventExtraction: the first reads a generic headed TSV (the classification-style reader), the second reads the '\002'-separated sequence-labeling files.
import csv
import traceback
from collections import namedtuple

import paddle

class DuEventExtraction(paddle.io.Dataset):
    """Dataset over a generic headed TSV (classification-style variant)."""

    def __init__(self, data_path, tag_path):
        self.label_vocab = load_dict(tag_path)
        self.examples = self._read_tsv(data_path)

    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            headers = next(reader)
            text_indices = [
                index for index, h in enumerate(headers) if h != "label"
            ]
            Example = namedtuple('Example', headers)
            examples = []
            for line in reader:
                for index, text in enumerate(line):
                    if index in text_indices:
                        # no-op in this version; kept as the hook where the
                        # text columns could be preprocessed
                        line[index] = text
                try:
                    example = Example(*line)
                except Exception as e:
                    traceback.print_exc()
                    raise Exception(e)
                examples.append(example)
            return examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return self.examples[index]
class DuEventExtraction(paddle.io.Dataset):
    """Dataset over '\002'-separated sequence-labeling files."""

    def __init__(self, data_path, tag_path):
        self.label_vocab = load_dict(tag_path)
        self.word_ids = []
        self.label_ids = []
        with open(data_path, 'r', encoding='utf-8') as fp:
            # skip the header line
            next(fp)
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                self.word_ids.append(words)
                self.label_ids.append(labels)
        self.label_num = max(self.label_vocab.values()) + 1

    def __len__(self):
        return len(self.word_ids)

    def __getitem__(self, index):
        return self.word_ids[index], self.label_ids[index]
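Both classes rely on load_dict to map the tag files from section 1 back into a label-to-index vocabulary. A sketch of that helper (it lives in the example's utils.py; the exact body here is an assumption), plus a usage example for the sequence-labeling variant with hypothetical paths following the layout produced above:

def load_dict(dict_path):
    """Map "index\tlabel" lines back to a {label: index} vocabulary."""
    vocab = {}
    for line in open(dict_path, 'r', encoding='utf-8'):
        value, key = line.strip('\n').split('\t')
        vocab[key] = int(value)
    return vocab

train_ds = DuEventExtraction("./data/DuEE-Fin/trigger/train.tsv",
                             "./conf/DuEE-Fin/trigger_tag.dict")
words, labels = train_ds[0]  # per-character tokens and their BIO tags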