# 知识图谱-命名实体-关系-免费标注工具-快速打标签-Python3 (https://blog.csdn.net/AdamCY888/article/details/127613010)
# 文章中的 recode_2.py 文件,更新代码!
# -*- coding: utf-8 -*-
"""
2023/5/18 更新
CHEN | YNU
"""
# ----- Only edit the file path and the number in this section ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
# Path of the annotation (.anns) file to convert.
filea = r'C:\Users\DELL\Desktop\测试数据\原始文本-noneuser-03-14-17-03-50_分段_03-14-17-09-27.anns'
# Export the raw 5-tuples          : keytime = 1
# Export one-to-one labels         : keytime = 2
# Export 7-tuples (CSV)            : keytime = 3
# Export the JSON-format file      : keytime = 4
keytime = 1
# ----- Only edit the file path and the number in this section ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
import re
from datetime import datetime
def sanyuanzu1(entuty_list):  # 5-element tuples
    """Expand annotated rows into relation records, pair them and export them.

    A row is treated as an entity row with relations when it has no ``"O"``
    field and more than two fields. Every such row gets a running number
    (starting at 1). Rows with four or more fields are expanded into one
    record per relation, ``[number, row[0], row[1], <two trailing fields>]``,
    where the trailing fields are taken two at a time from ``row[2:]``.
    The records are paired by ``sanyuanzu2`` and handed to ``wordtoch``.

    The input rows are left untouched; the numbered copies are new lists.

    :param entuty_list: list of rows, each a list of string fields.
    :return: the value returned by ``wordtoch``.
    """
    entity_lists = []
    i = 0
    for entii in entuty_list:
        if "O" in entii or len(entii) <= 2:
            continue
        i += 1
        if len(entii) < 4:
            # Three-field rows consume a number but carry no usable relation.
            continue
        numbered = [i, *entii]
        if len(entii) == 4:  # exactly one relation
            entity_lists.append(numbered)
            continue
        # Several relations: emit one record per pair of trailing fields.
        # An odd leftover field still yields a (shorter) record.
        for start in range(3, len(numbered), 2):
            entity_lists.append(numbered[:3] + numbered[start:start + 2])
    return wordtoch(sanyuanzu2(entity_lists))
def sanyuanzu1ooo(entuty_list):  # 5-element tuples
    """Expand annotated rows into relation records and pair them (no export).

    A row is treated as an entity row with relations when it has no ``"O"``
    field and more than two fields. Every such row gets a running number
    (starting at 1). Rows with four or more fields are expanded into one
    record per relation, ``[number, row[0], row[1], <two trailing fields>]``,
    where the trailing fields are taken two at a time from ``row[2:]``.
    The records are paired by ``sanyuanzu2`` and handed to ``wordtochooo``.

    The input rows are left untouched; the numbered copies are new lists.

    :param entuty_list: list of rows, each a list of string fields.
    :return: the value returned by ``wordtochooo``.
    """
    entity_lists = []
    i = 0
    for entii in entuty_list:
        if "O" in entii or len(entii) <= 2:
            continue
        i += 1
        if len(entii) < 4:
            # Three-field rows consume a number but carry no usable relation.
            continue
        numbered = [i, *entii]
        if len(entii) == 4:  # exactly one relation
            entity_lists.append(numbered)
            continue
        # Several relations: emit one record per pair of trailing fields.
        # An odd leftover field still yields a (shorter) record.
        for start in range(3, len(numbered), 2):
            entity_lists.append(numbered[:3] + numbered[start:start + 2])
    return wordtochooo(sanyuanzu2(entity_lists))
def sanyuanzu2(entity_lists):
    """Pair relation records that share the same value at index 3.

    Each record is matched with the nearest later record whose field 3 is
    equal; only that first match is used, which lets an annotation be made
    in several separate passes. The record placed first in the output is the
    one whose last field contains ``"1"`` when that is the earlier record,
    otherwise the later record comes first.

    :param entity_lists: records shaped like ``[number, text, type, id, role]``.
    :return: list of ``[text, type, role, other_type, other_text]`` lists.
    """
    pairs = []
    for pos, first in enumerate(entity_lists):
        partner = next(
            (cand for cand in entity_lists[pos + 1:] if first[3] == cand[3]),
            None,
        )
        if partner is None:
            continue
        if "1" in first[-1]:
            head, tail = first, partner
        else:
            head, tail = partner, first
        pairs.append([head[1], head[2], head[-1], tail[2], tail[1]])
    return pairs
def wordtoch(words):
    """Translate label codes to their Chinese names and export the 5-tuples.

    For every record, fields 1 and 3 and the first character of field 2 are
    looked up in the code table. A record whose lookup fails is left as it
    is at the point of failure (fields translated before the failure stay
    translated). The records are then written, one per line, to a CSV file
    named after the module-level ``file_name`` with its last five characters
    removed (NOTE(review): this assumes a ``.anns`` suffix).

    :param words: list of 5-element lists; modified in place.
    :return: the same ``words`` list.
    """
    zh = ['指标','程度','动作','场设线','故障影响','调控中心' ,"取消标注",'对应指标','对应程度','对应动作','对应编号','故障影响位置','具体影响为','对应调控中心','控制目标','稳定要求限制目标','具体要求为',"故障编号","预想故障","稳定要求"]
    ti = ["A","B","C","D","E","F", "Q", "I","T","K","U","M","N","L","J","Y","G","Z", "X", "W"]
    dic1 = dict(zip(ti, zh))
    for word in words:
        try:
            word[1] = dic1[word[1]]
            word[2] = dic1[word[2][0]]
            word[3] = dic1[word[3]]
        except (KeyError, IndexError, TypeError):
            # Unknown code or malformed record: keep it untranslated.
            continue
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_五元组_' + now_time + '.csv'
    # The context manager closes the file even if a write fails.
    with open(new_filename, 'w', encoding="utf-8") as filew:
        for word in words:
            filew.write(str(word).strip('[').strip(']') + '\n')
    print('5元组文件已导出!')
    return words
def wordtochooo(words):
    """Translate label codes in 5-tuples to their Chinese names, in place.

    For every record, fields 1 and 3 and the first character of field 2 are
    looked up in the code table. A record whose lookup fails is left as it
    is at the point of failure (fields translated before the failure stay
    translated).

    :param words: list of 5-element lists; modified in place.
    :return: the same ``words`` list.
    """
    zh = ['指标','程度','动作','场设线','故障影响','调控中心' ,"取消标注",'对应指标','对应程度','对应动作','对应编号','故障影响位置','具体影响为','对应调控中心','控制目标','稳定要求限制目标','具体要求为',"故障编号","预想故障","稳定要求"]
    ti = ["A","B","C","D","E","F", "Q", "I","T","K","U","M","N","L","J","Y","G","Z", "X", "W"]
    dic1 = dict(zip(ti, zh))
    for word in words:
        try:
            word[1] = dic1[word[1]]
            word[2] = dic1[word[2][0]]
            word[3] = dic1[word[3]]
        except (KeyError, IndexError, TypeError):
            # Unknown code or malformed record: keep it untranslated.
            continue
    return words
def readfile(file):
    """Read a UTF-8 annotation file and split every line into fields.

    The trailing newline is removed from each line, then the line is split
    on a space, ``@`` or ``_``. An empty line therefore becomes ``['']``.

    :param file: path of the file to read.
    :return: list with one list of string fields per line.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(file, "r", encoding='utf-8') as handle:
        return [re.split(" |@|_", line.strip('\n')) for line in handle]
#打标签
def tag_entity(word_list, label, schema='BIEO' ):
    """Tag every character of an entity with ``label`` using ``schema``.

    :param word_list: the entity split into a list of single characters
    :param label: label assigned to the entity
    :param schema: tagging scheme; ``'BIEO'`` marks the last character with
        ``E-``, any other value marks it with ``I-``
    :return: a list of ``"<char> <tag>-<label>\\n"`` strings; for a
        one-character entity a single such string (always tagged ``B-``)
        is returned instead of a list
    """
    count = len(word_list)
    if count == 1:
        # A lone character is tagged B- under either scheme.
        return word_list[0] + ' ' + 'B-' + label + '\n'
    closing = 'E-' if schema == 'BIEO' else 'I-'
    tagged = []
    for pos, ch in enumerate(word_list):
        if pos == 0:
            prefix = 'B-'
        elif pos == count - 1:
            prefix = closing
        else:
            prefix = 'I-'
        tagged.append(ch + ' ' + prefix + label + '\n')
    return tagged
def biaoqian(file_list):
    """Turn parsed rows into per-character label lines, stored in ``ms_list``.

    Rows containing an ``"O"`` field, or having exactly two fields, get every
    character of their first field tagged ``O``. Four-field rows are tagged
    through ``tag_entity`` with a label built from the last field (its first
    character mapped to an English tag, plus its last character). Rows with
    more than four fields are tagged ``main-1``. All other rows are skipped.
    Results are appended to the module-level ``ms_list``.

    :param file_list: list of rows, each a list of string fields.
    :return: None
    """
    en = ['dis','hyp','oth','med','dia','cur', "none", 'Incl','Trea','Risk','Auxi','Char','Conc','Alia','Acti','Cond','Diag',"zzz", "xxx", "www"]
    ti = ["A","B","C","D","E","F", "Q", "I","T","K","U","M","N","L","J","Y","G","Z", "X", "W"]
    code_to_tag = dict(zip(ti, en))
    for row in file_list:
        field_count = len(row)
        if "O" in row or field_count == 2:
            # Plain text: one "<char> O" line per character.
            ms_list.append([ch + ' ' + 'O\n' for ch in row[0]])
            continue
        if field_count == 4:
            label = code_to_tag[row[-1][0]] + '-' + row[-1][-1]
        elif field_count > 4:
            label = "main-1"
        else:
            continue
        ms_list.append(tag_entity(list(row[0]), label))
def writefile(ms_list):
    """Write the collected label lines to a new ``.anns`` file.

    The output name is the module-level ``file_name`` with its last five
    characters removed (NOTE(review): this assumes a ``.anns`` suffix), plus
    a tag and a timestamp. A line containing ``。`` is followed by an extra
    newline.

    :param ms_list: list of entries; each entry is a list of label lines, or
        a single label line given as a plain string.
    :return: None
    """
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_一对一_' + now_time + '.anns'
    # The context manager closes the file even if a write fails.
    with open(new_filename, 'w', encoding='utf-8') as f:
        for i in ms_list:
            if isinstance(i, str):
                # A bare string must be handled as ONE line; iterating it
                # would write it character by character and misplace the
                # extra newline after '。'.
                i = [i]
            for j in i:
                if '。' in j:
                    f.write(j + '\n')
                else:
                    f.write(j)
    print("已经输出ann文件!")
def Soooooda(entuty_list):  # split into sentences, then process each one
    """Group rows into sentences and extract the relations of each sentence.

    A row equal to ``['']`` ends the current sentence. Rows with more than
    one field extend the current sentence with their first field. The last
    row always closes the sentence it belongs to. Each finished sentence is
    passed to ``sanyuanzu1ooo``.

    :param entuty_list: list of rows, each a list of string fields.
    :return: list of ``[sentence_id, sentence_text, relations]`` lists, with
        ids counting up from 1.
    """
    total = len(entuty_list)
    results = []
    sentence_rows = []  # rows belonging to the sentence being built
    sentence_id = 0  # keeps identical sentences apart
    sentence_text = ''
    for pos, row in enumerate(entuty_list, start=1):
        is_last = pos == total
        if len(row) == 1 and len(row[0]) == 0:
            # Separator row: flush the sentence collected so far.
            sentence_id += 1
            relations = sanyuanzu1ooo(sentence_rows)
            results.append([sentence_id, sentence_text, relations])
            sentence_text = ''
            sentence_rows = []
        elif len(row) > 1 and not is_last:
            sentence_text += row[0]
            sentence_rows.append(row)
        elif is_last:
            # Final row: add it, then flush without waiting for a separator.
            sentence_id += 1
            sentence_text += row[0]
            sentence_rows.append(row)
            relations = sanyuanzu1ooo(sentence_rows)
            results.append([sentence_id, sentence_text, relations])
    return results
def write_Soooooda(result_list):
    """Write one CSV line per relation: sentence id, sentence text, 5-tuple.

    The output name is the module-level ``file_name`` with its last five
    characters removed (NOTE(review): this assumes a ``.anns`` suffix), plus
    a tag and a timestamp. Sentences without relations are skipped.

    :param result_list: list of ``[sentence_id, sentence_text, relations]``.
    :return: None
    """
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_7元组_' + now_time + '.csv'
    # The context manager closes the file even if a write fails.
    with open(new_filename, 'w', encoding='utf-8') as f:
        for i_list in result_list:
            if len(i_list[-1]) < 1:
                continue
            # ASCII commas inside the sentence would split it across CSV
            # columns, so they are replaced by the fullwidth comma (U+FF0C).
            # The previous call replaced "," with "," and so had no effect.
            sentence = i_list[1].replace(",", "\uff0c")
            for wordlist in i_list[-1]:
                f.write(str(i_list[0]) + "," + sentence + "," + str(wordlist).strip('[').strip(']') + '\n')
    print("已输出7元组文件")
    return
# '''
# s = [
# {"text": "查尔斯·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部",
# "new_spo_list": [
# {"s": {"entity": "查尔斯·阿兰基斯","type": "people"},
# "p": {"entity": "出生地","type": "_rel"},
# "o": {"entity": "圣地亚哥","type": "property"}},
# {"s": {"entity": "查尔斯·阿兰基斯","type": "people"},
# "p": {"entity": "出生地","type": "_rel"},
# "o": {"entity": "圣地亚哥","type": "property"}}
# ]
# },
# {"text": "查尔斯1·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部",
# "new_spo_list":
# }
# ]
# '''
import json
def spo(alis):
    """Convert one 5-element relation record into a subject/predicate/object dict.

    Input example::

        ['保质期长', '故障编号', '故障影响位置', '稳定要求', '经济价值']

    :param alis: ``[subject, subject_type, relation, object_type, object]``.
    :return: dict with keys ``"s"``, ``"p"`` and ``"o"``, each holding an
        ``{"entity": ..., "type": ...}`` dict; the predicate type is ``"_rel"``.
    """
    subject = {"entity": alis[0], "type": alis[1]}
    predicate = {"entity": alis[2], "type": "_rel"}
    obj = {"entity": alis[4], "type": alis[3]}
    return {"s": subject, "p": predicate, "o": obj}
def new_spo_list(i_lista):
    """Convert a list of 5-element relation records with ``spo``.

    Input example::

        [['云南牛干巴', '指标', '具体要求为', '程度', '云南省回族人民'],
         ['传统干腌肉制品', '动作', '对应指标', '指标', '牛干巴']]

    :param i_lista: iterable of 5-element relation records.
    :return: list with the ``spo`` result of every record, in input order.
    """
    return [spo(record) for record in i_lista]
def Soooooda_json(Soooooda_list):
    """Export the sentences that have relations as a JSON text file.

    Every sentence with at least one relation becomes
    ``{"text": <sentence>, "new_spo_list": [<spo dicts>]}``. The list is
    serialised with ``json.dumps`` so that quotes, backslashes and other
    special characters in the text are escaped correctly; Chinese characters
    are written as they are. The output name is the module-level
    ``file_name`` with its last five characters removed (NOTE(review): this
    assumes a ``.anns`` suffix), plus a tag and a timestamp.

    :param Soooooda_list: list of ``[sentence_id, sentence_text, relations]``.
    :return: the JSON text that was written.
    """
    list_wai = [
        {"text": i_lista[1], "new_spo_list": new_spo_list(i_lista[-1])}
        for i_lista in Soooooda_list
        if len(i_lista[-1]) >= 1
    ]
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_json_' + now_time + '.txt'
    # json.dumps uses the same ", " and ": " separators the old
    # str()-based output had, so ordinary text serialises identically.
    wri = json.dumps(list_wai, ensure_ascii=False)
    # The context manager closes the file even if the write fails.
    with open(new_filename, 'w', encoding='utf-8') as f:
        f.write(wri)
    print("json-txt写入完成")
    return wri
if __name__ == "__main__":
    # Earlier usage, kept for reference:
    #     file_list = readfile(file_name)
    #     sanyuanzu1(file_list)  # export the triple / 5-tuple file

    # Module-level names; the export helpers read file_name and ms_list.
    schema = "BIEO"
    rep = r'\[<.*?\⊙'
    file_name = filea
    ms_list = []

    # The file is parsed once per export mode, giving each mode its own rows.
    file_list = readfile(file_name)
    file_wlist = readfile(file_name)
    Soooooda_list = readfile(file_name)

    if keytime not in (1, 2, 3, 4):
        print(f"请修正keytime={keytime}为1、2、3 或 4, 并重新运行程序")
    elif keytime == 1:
        # Export the triple / 5-tuple file.
        sanyuanzu1(file_list)
    elif keytime == 2:
        # Export the per-character label file.
        biaoqian(file_wlist)
        writefile(ms_list)
    elif keytime == 3:
        # Added feature 1: 7-tuple CSV.
        write_Soooooda(Soooooda(Soooooda_list))
    else:
        # Added feature 2 (keytime == 4): JSON export.
        Soooooda_json(Soooooda(Soooooda_list))