def _to_bio_tag(tag):
    """Map one BIO / BIESO / BMEO tag onto the BIO scheme.

    ``E*`` and ``M*`` become ``I*``, ``S*`` becomes ``B*``; every other tag
    (``B-*``, ``I-*``, ``O``, ...) is returned unchanged.
    """
    if tag.startswith(("E", "M")):
        return "I" + tag[1:]
    if tag.startswith("S"):
        return "B" + tag[1:]
    return tag


def _read_bio_samples(input_file):
    """Parse ``input_file`` into a list of ``(text, bio_tags)`` sentence pairs."""
    samples = []
    chars, tags = [], []
    with open(input_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # A blank line ends the current sentence. Repeated blank
                # lines must not emit empty samples.
                if chars:
                    samples.append(("".join(chars), tags))
                    chars, tags = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{input_file}:{line_no}: expected '<char> <tag>', "
                    f"got {line.rstrip()!r}"
                )
            token, tag = fields
            if len(token) != 1:
                # Entity offsets index into the concatenated text, so each
                # label has to correspond to exactly one character.
                raise ValueError(
                    f"{input_file}:{line_no}: token {token!r} is not a single "
                    "character, text and labels would be misaligned"
                )
            chars.append(token)
            tags.append(_to_bio_tag(tag))
    # The file may not end with a blank line; keep the last sentence anyway.
    if chars:
        samples.append(("".join(chars), tags))
    return samples


def _make_entity(text, start_index, end_index, ent_type):
    """Build the output record for the inclusive span ``start_index..end_index``."""
    return {
        "start_idx": start_index,
        "end_idx": end_index,
        "type": ent_type,
        "entity": text[start_index:end_index + 1],
    }


def _extract_entities(text, bio_tags):
    """Decode BIO tags into a list of entity dicts with inclusive indices."""
    entities = []
    start_index, end_index = -1, -1
    ent_type = None
    last_index = len(bio_tags) - 1
    for indx, tag in enumerate(bio_tags):
        if tag.startswith("B-"):
            # A new entity starts here; close the one that was still open.
            if end_index != -1:
                entities.append(
                    _make_entity(text, start_index, end_index, ent_type))
            start_index = end_index = indx
            # Everything after the "B-" prefix is the type, so types that
            # themselves contain '-' are kept intact.
            ent_type = tag[2:]
            if indx == last_index:
                entities.append(
                    _make_entity(text, start_index, end_index, ent_type))
        elif tag.startswith("I-") and start_index != -1:
            # An I- tag of a different type does not extend the open entity,
            # but it does not close it either (original decoding behaviour).
            if tag[2:] == ent_type:
                end_index = indx
            if indx == last_index:
                entities.append(
                    _make_entity(text, start_index, end_index, ent_type))
        else:
            # "O", or an I- tag with no open entity: close and reset.
            if end_index != -1:
                entities.append(
                    _make_entity(text, start_index, end_index, ent_type))
            start_index, end_index = -1, -1
            ent_type = None
    return entities


def convert_biotext_to_json(input_file, save_file, format="json"):
    """Convert a character-per-line tagging file into JSON entity spans.

    For the three CRF tagging schemes (bio, bieso, bmeo), all tags are first
    converted to the BIO tagging system.

    The input is read as UTF-8. Each non-blank line holds one character and
    its tag separated by whitespace; blank lines separate sentences. Every
    sentence becomes ``{"text": ..., "entities": [...]}``, where each entity
    has ``start_idx``, ``end_idx`` (inclusive), ``type`` and ``entity``.

    Args:
        input_file: Path of the tagged text file to read.
        save_file: Path of the output file, written as UTF-8.
        format: ``"json"`` writes a single ``{"data": [...]}`` document
            indented by 4 spaces; any other value writes one JSON object
            per line.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields,
            or if its token is not a single character.
    """
    data = [
        {"text": text, "entities": _extract_entities(text, bio_tags)}
        for text, bio_tags in _read_bio_samples(input_file)
    ]

    # Explicit UTF-8: the payload is written with ensure_ascii=False, so the
    # locale default encoding may be unable to represent it.
    with open(save_file, "w", encoding="utf-8") as out:
        if format == "json":
            json.dump({"data": data}, out, ensure_ascii=False, indent=4)
        else:
            for sample in data:
                out.write(json.dumps(sample, ensure_ascii=False) + "\n")

    logger.info(f"*** 转化后的json数据保存在:{save_file} ***")
    print(f"*** 转化后的json数据保存在:{save_file} ***")
实体抽取-将bio标签转为json文件的代码
最新推荐文章于 2024-04-02 15:05:03 发布
该代码段定义了一个函数convert_biotext_to_json,用于将不同类型的CRF标注(如BIO,BIESO,BMEO)转换为BIO格式,并保存为JSON文件。它读取输入文件,处理每一行的文本和标签,然后构造出包含实体开始和结束索引的JSON结构。最终,转换后的数据被写入到指定的输出文件中。
摘要由CSDN通过智能技术生成