recode_2.py文件更新代码！

高山莫衣
已于 2023-05-18 15:06:22 修改
阅读量297
点赞数 4
文章标签：知识图谱 python 人工智能
于 2023-05-18 14:50:57 首次发布
原创作品，共同进步！
本文链接：https://blog.csdn.net/AdamCY888/article/details/130747036
版权
该文章介绍了一个用于知识图谱构建的Python工具，能对原始文本进行命名实体和关系的快速标注，支持生成五元组、一对一标签、七元组格式的数据，并可导出为CSV或JSON文件。工具适用于处理包含多个关系的实体行，且提供了转换标签的字典映射功能。
摘要由CSDN通过智能技术生成
知识图谱-命名实体-关系-免费标注工具-快速打标签-Python3 (https://blog.csdn.net/AdamCY888/article/details/127613010)
文章中的recode_2.py文件，更新代码！
# -*- coding: utf-8 -*-
"""
2023/5/18 更新
CHEN | YNU
"""

# ----- Only edit the file path and the number in this section ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓

# Path of the annotation file to convert. Output file names are derived from
# this path by dropping its last 5 characters (i.e. a ".anns"-style suffix).
filea = r'C:\Users\DELL\Desktop\测试数据\原始文本-noneuser-03-14-17-03-50_分段_03-14-17-09-27.anns'

# Export the raw 5-tuples (written as a .csv file)          : keytime = 1
# Export one-to-one character labels (written as .anns)     : keytime = 2
# Export 7-tuples: sentence id + sentence + 5-tuple (.csv)  : keytime = 3
# Export the JSON-style text/spo list (written as .txt)     : keytime = 4

keytime = 1

# ----- Only edit the file path and the number in this section ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑



import re
from datetime import datetime



def sanyuanzu1(entuty_list):
    """Build relation 5-tuples from annotated rows and export them to CSV.

    Every row is a token list as produced by ``readfile``. Rows that contain
    the token ``"O"`` or have two tokens or fewer carry no relation and are
    skipped. Each remaining row is numbered, and every two-token group found
    after its first two tokens becomes one record
    ``[row_no, token0, token1, group_token0, group_token1]``.
    (Presumably the layout is ``[text, tag, pair_id, relation, ...]`` --
    confirm against the annotation tool's output.)

    The records are paired by ``sanyuanzu2`` and then translated and written
    to disk by ``wordtoch``.

    Unlike the earlier implementation, the input rows are no longer modified
    in place, so the same parsed list can safely be reused by the caller.

    :param entuty_list: list of token lists, one per annotated line
    :return: whatever ``wordtoch`` returns (the translated 5-tuples)
    """
    entity_lists = []
    row_no = 0

    for entii in entuty_list:
        if "O" in entii or len(entii) <= 2:
            continue  # not a relation-bearing row

        # Every relation-bearing row consumes a number, including 3-token
        # rows that produce no record below.
        row_no += 1
        size = len(entii)

        if size > 4:  # several relations on one row
            numbered = [row_no, *entii]
            head = numbered[:3]
            # Walk the relation groups two tokens at a time; a trailing
            # single token still yields a (shorter) record, as before.
            for start in range(3, len(numbered), 2):
                entity_lists.append(head + numbered[start:start + 2])
        elif size == 4:  # exactly one relation
            entity_lists.append([row_no, *entii])

    return wordtoch(sanyuanzu2(entity_lists))

def sanyuanzu1ooo(entuty_list):
    """Build relation 5-tuples from annotated rows without writing a file.

    Every row is a token list as produced by ``readfile``. Rows that contain
    the token ``"O"`` or have two tokens or fewer carry no relation and are
    skipped. Each remaining row is numbered, and every two-token group found
    after its first two tokens becomes one record
    ``[row_no, token0, token1, group_token0, group_token1]``.
    (Presumably the layout is ``[text, tag, pair_id, relation, ...]`` --
    confirm against the annotation tool's output.)

    The records are paired by ``sanyuanzu2`` and translated by
    ``wordtochooo``.

    Unlike the earlier implementation, the input rows are no longer modified
    in place, so the same parsed list can safely be reused by the caller.

    :param entuty_list: list of token lists, one per annotated line
    :return: whatever ``wordtochooo`` returns (the translated 5-tuples)
    """
    entity_lists = []
    row_no = 0

    for entii in entuty_list:
        if "O" in entii or len(entii) <= 2:
            continue  # not a relation-bearing row

        # Every relation-bearing row consumes a number, including 3-token
        # rows that produce no record below.
        row_no += 1
        size = len(entii)

        if size > 4:  # several relations on one row
            numbered = [row_no, *entii]
            head = numbered[:3]
            # Walk the relation groups two tokens at a time; a trailing
            # single token still yields a (shorter) record, as before.
            for start in range(3, len(numbered), 2):
                entity_lists.append(head + numbered[start:start + 2])
        elif size == 4:  # exactly one relation
            entity_lists.append([row_no, *entii])

    return wordtochooo(sanyuanzu2(entity_lists))

def sanyuanzu2(entity_lists):
    """Pair up records that share the same value at index 3.

    For each record, the nearest later record with an equal element at
    index 3 is its partner; only that first match is used, which allows the
    same identifier to be reused for several separately annotated pairs.
    If the current record's last element contains ``"1"`` it is placed
    first, otherwise the partner is placed first.

    :param entity_lists: records shaped like ``[no, text, tag, pair_id, relation]``
    :return: list of ``[text_a, tag_a, relation_a, tag_b, text_b]``
    """
    paired = []
    for pos, current in enumerate(entity_lists):
        partner = next(
            (later for later in entity_lists[pos + 1:] if current[3] == later[3]),
            None,
        )
        if partner is None:
            continue  # no later record shares this identifier

        if "1" in current[-1]:
            lead, follow = current, partner
        else:
            lead, follow = partner, current
        paired.append([lead[1], lead[2], lead[-1], follow[2], follow[1]])
    return paired


def wordtoch(words):
    """Translate label letters in 5-tuples to Chinese names and export a CSV.

    For every row, elements 1 and 3 (tag letters) and the first character of
    element 2 (relation label) are replaced in place by their Chinese names.
    Translation is best effort: it stops at the first field of a row that
    cannot be looked up and leaves the rest of that row unchanged.

    The rows are then written, one per line, to
    ``<file_name minus its last 5 characters>_五元组_<timestamp>.csv``.
    ``file_name`` is a module-level global set by the ``__main__`` block.

    :param words: list of ``[text_a, tag_a, relation, tag_b, text_b]`` lists
    :return: the same ``words`` list, translated in place
    """
    zh = ['指标','程度','动作','场设线','故障影响','调控中心' ,"取消标注",'对应指标','对应程度','对应动作','对应编号','故障影响位置','具体影响为','对应调控中心','控制目标','稳定要求限制目标','具体要求为',"故障编号","预想故障","稳定要求"]
    ti = ["A","B","C","D","E","F", "Q",  "I","T","K","U","M","N","L","J","Y","G","Z", "X", "W"]
    dic1 = dict(zip(ti, zh))

    for word in words:
        try:
            word[1] = dic1[word[1]]
            word[2] = dic1[word[2][0]]
            word[3] = dic1[word[3]]
        except (KeyError, IndexError, TypeError):
            # Unknown letter or malformed row: keep what was translated so
            # far and move on, without hiding unrelated errors.
            continue

    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_五元组_' + now_time + '.csv'
    # The context manager closes the file even if a write fails.
    with open(new_filename, 'w', encoding="utf-8") as filew:
        for word in words:
            # Rows are written as the list repr without the outer brackets.
            filew.write(str(word).strip('[').strip(']') + '\n')
    print('5元组文件已导出！')
    return words

def wordtochooo(words):
    """Translate label letters in 5-tuples to Chinese names (no file output).

    For every row, elements 1 and 3 (tag letters) and the first character of
    element 2 (relation label) are replaced in place by their Chinese names.
    Translation is best effort: it stops at the first field of a row that
    cannot be looked up and leaves the rest of that row unchanged.

    :param words: list of ``[text_a, tag_a, relation, tag_b, text_b]`` lists
    :return: the same ``words`` list, translated in place
    """
    zh = ['指标','程度','动作','场设线','故障影响','调控中心' ,"取消标注",'对应指标','对应程度','对应动作','对应编号','故障影响位置','具体影响为','对应调控中心','控制目标','稳定要求限制目标','具体要求为',"故障编号","预想故障","稳定要求"]
    ti = ["A","B","C","D","E","F", "Q",  "I","T","K","U","M","N","L","J","Y","G","Z", "X", "W"]
    dic1 = dict(zip(ti, zh))

    for word in words:
        try:
            word[1] = dic1[word[1]]
            word[2] = dic1[word[2][0]]
            word[3] = dic1[word[3]]
        except (KeyError, IndexError, TypeError):
            # Unknown letter or malformed row: keep what was translated so
            # far and move on, without hiding unrelated errors.
            continue
    return words


def readfile(file):
    """Read an annotation file and split every line into tokens.

    Newline characters are stripped from both ends of each line, and the
    line is then split on a space, ``@`` or ``_``. An empty line therefore
    becomes ``['']``.

    :param file: path of the UTF-8 encoded annotation file
    :return: list of token lists, one per line of the file
    """
    entuty_list = []
    # The context manager closes the handle, which the previous
    # ``open(...).readlines()`` call never did.
    with open(file, "r", encoding='utf-8') as handle:
        for line in handle:
            entuty_list.append(re.split(" |@|_", line.strip('\n')))
    return entuty_list




#打标签
def tag_entity(word_list, label, schema='BIEO' ):
    """Tag every character of an entity with a position-prefixed label.

    Each output piece has the form ``"<char> <prefix><label>\\n"``. The first
    character gets ``B-``; under the ``'BIEO'`` schema the last character
    gets ``E-``, under any other schema it gets ``I-``; characters in
    between get ``I-``.

    Note that a single-character entity is returned as one plain string
    (tagged ``B-`` under either schema), not as a list.

    :param word_list: the entity split into single characters
    :param label: label attached to the entity
    :param schema: tagging scheme, ``'BIEO'`` or anything else for ``'BI'``
    :return: a string for a one-element input, otherwise a list of strings
    """
    total = len(word_list)

    if total == 1:
        return word_list[0] + ' ' + 'B-' + label + '\n'

    # Only the prefix of the final character differs between the schemas.
    closing_prefix = 'E-' if schema == 'BIEO' else 'I-'

    tagged = []
    for pos, char in enumerate(word_list):
        if pos == 0:
            prefix = 'B-'
        elif pos == total - 1:
            prefix = closing_prefix
        else:
            prefix = 'I-'
        tagged.append(char + ' ' + prefix + label + '\n')
    return tagged

def biaoqian(file_list):
    """Convert annotated rows into per-character labels.

    Results are appended to the module-level ``ms_list`` (created by the
    ``__main__`` block); nothing is returned.

    * Rows containing the token ``"O"``, or having exactly two tokens, have
      every character of their first token labelled ``O``.
    * Four-token rows are tagged through ``tag_entity`` with the label
      ``<code>-<last char of last token>``, where ``<code>`` is looked up
      from the first character of the last token.
    * Rows with more than four tokens are tagged with the fixed label
      ``main-1``.
    * All other rows are ignored.

    :param file_list: list of token lists as produced by ``readfile``
    """
    # Label letter -> English short code used in the exported tag.
    code_of = {
        "A": 'dis', "B": 'hyp', "C": 'oth', "D": 'med', "E": 'dia', "F": 'cur',
        "Q": "none",
        "I": 'Incl', "T": 'Trea', "K": 'Risk', "U": 'Auxi', "M": 'Char',
        "N": 'Conc', "L": 'Alia', "J": 'Acti', "Y": 'Cond', "G": 'Diag',
        "Z": "zzz", "X": "xxx", "W": "www",
    }

    for row in file_list:
        size = len(row)
        if "O" in row or size == 2:
            # Plain text: one "<char> O" line per character.
            ms_list.append([char + ' ' + 'O\n' for char in row[0]])
        elif size == 4:
            chars = list(row[0])
            tail = row[-1]
            tag = code_of[tail[0]] + '-' + tail[-1]
            ms_list.append(tag_entity(chars, tag))
        elif size > 4:
            chars = list(row[0])
            ms_list.append(tag_entity(chars, "main-1"))
    

def writefile(ms_list):
    """Write the per-character labels to a new ``.anns`` file.

    The output path is ``<file_name minus its last 5 characters>_一对一_
    <timestamp>.anns``; ``file_name`` is a module-level global set by the
    ``__main__`` block. Every piece yielded by iterating an entry of
    ``ms_list`` is written as is, and a piece containing ``'。'`` is
    followed by an extra newline.

    :param ms_list: iterable of label groups as collected by ``biaoqian``
    """
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_一对一_' + now_time + '.anns'
    # The context manager closes the file even if a write fails.
    with open(new_filename, 'w', encoding='utf-8') as f:
        for group in ms_list:
            for piece in group:
                if '。' in piece:
                    f.write(piece + '\n')
                else:
                    f.write(piece)
    print("已经输出ann文件！")

    
def Soooooda(entuty_list):
    """Split the rows into sentences and extract the tuples of each one.

    A blank row (a single empty token) closes the current sentence. The
    last row of the input also closes it, after being added to the sentence,
    unless that last row is itself blank. Rows with more than one token are
    accumulated; a non-final row with a single non-empty token is ignored.

    Each closed sentence is passed to ``sanyuanzu1ooo`` and recorded as
    ``[sentence_id, sentence_text, tuples]``, where ``sentence_text`` is the
    concatenation of the first tokens of its rows.

    :param entuty_list: list of token lists as produced by ``readfile``
    :return: list of ``[sentence_id, sentence_text, tuples]`` entries
    """
    results = []
    pending_rows = []    # rows belonging to the sentence being assembled
    pending_text = ''
    sentence_id = 0      # running id, keeps identical sentences apart
    last_pos = len(entuty_list) - 1

    for pos, row in enumerate(entuty_list):
        is_blank = len(row) == 1 and len(row[0]) == 0

        if is_blank:
            sentence_id += 1
            tuples = sanyuanzu1ooo(pending_rows)
            results.append([sentence_id, pending_text, tuples])
            pending_text = ''
            pending_rows = []
        elif len(row) > 1 and pos < last_pos:
            pending_text += row[0]
            pending_rows.append(row)
        elif pos == last_pos:
            sentence_id += 1
            pending_text += row[0]
            pending_rows.append(row)
            tuples = sanyuanzu1ooo(pending_rows)
            results.append([sentence_id, pending_text, tuples])

    return results
            
def write_Soooooda(result_list):
    """Write the per-sentence tuples as 7-column rows to a CSV file.

    The output path is ``<file_name minus its last 5 characters>_7元组_
    <timestamp>.csv``; ``file_name`` is a module-level global set by the
    ``__main__`` block. Each written line is the sentence id, the sentence
    text (ASCII commas replaced by full-width ones so they do not split the
    column) and the tuple's list repr without the outer brackets. Sentences
    without tuples are skipped.

    :param result_list: ``[sentence_id, sentence_text, tuples]`` entries
    :return: None
    """
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_7元组_' + now_time + '.csv'
    # The context manager closes the file even if a write fails.
    with open(new_filename, 'w', encoding='utf-8') as f:
        for i_list in result_list:
            if len(i_list[-1]) < 1:
                continue  # sentence without any extracted tuple
            for wordlist in i_list[-1]:
                f.write(str(i_list[0]) + "," + i_list[1].replace(",", "，") + "," + str(wordlist).strip('[').strip(']') + '\n')
    print("已输出7元组文件")
    return
        
# '''
# s = [
#     {"text": "查尔斯·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥，智利职业足球运动员，司职中场，效力于德国足球甲级联赛勒沃库森足球俱乐部",
#         "new_spo_list": [
            
#             {"s": {"entity": "查尔斯·阿兰基斯","type": "people"},
#             "p": {"entity": "出生地","type": "_rel"},
#             "o": {"entity": "圣地亚哥","type": "property"}},

#              {"s": {"entity": "查尔斯·阿兰基斯","type": "people"},
#             "p": {"entity": "出生地","type": "_rel"},
#             "o": {"entity": "圣地亚哥","type": "property"}}
#         ]
#     },
#     {"text": "查尔斯1·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥，智利职业足球运动员，司职中场，效力于德国足球甲级联赛勒沃库森足球俱乐部",
#         "new_spo_list": 
#     }
# ]
# ''' 





import json

def spo(alis):
    """Convert one 5-tuple into a subject/predicate/object dictionary.

    Example input: ``['保质期长', '故障编号', '故障影响位置', '稳定要求', '经济价值']``.

    :param alis: ``[subject_text, subject_type, relation, object_type, object_text]``
    :return: ``{"s": {...}, "p": {...}, "o": {...}}`` where each value holds
        an ``"entity"`` and a ``"type"``; the predicate type is always
        ``"_rel"``
    """
    subject = {"entity": alis[0], "type": alis[1]}
    predicate = {"entity": alis[2], "type": "_rel"}
    obj = {"entity": alis[4], "type": alis[3]}
    return {"s": subject, "p": predicate, "o": obj}


def new_spo_list(i_lista):
    """Convert a list of 5-tuples into a list of s/p/o dictionaries.

    Example input::

        [['云南牛干巴', '指标', '具体要求为', '程度', '云南省回族人民'],
         ['传统干腌肉制品', '动作', '对应指标', '指标', '牛干巴']]

    Every element is converted with ``spo``; the order is preserved.

    :param i_lista: iterable of 5-element lists
    :return: list with one ``spo`` dictionary per input element
    """
    return [spo(five_tuple) for five_tuple in i_lista]


def Soooooda_json(Soooooda_list):
    """Export the sentences and their relations as a JSON text file.

    Sentences without any tuple are dropped. Every remaining sentence
    becomes ``{"text": <sentence>, "new_spo_list": [<spo dict>, ...]}``, and
    the list of these objects is written to
    ``<file_name minus its last 5 characters>_json_<timestamp>.txt``.
    ``file_name`` is a module-level global set by the ``__main__`` block.

    The text is produced with ``json.dumps`` instead of replacing single
    quotes in the Python repr, so sentences containing quotes, apostrophes
    or backslashes still yield valid JSON. For text without such characters
    the output is the same as before.

    :param Soooooda_list: ``[sentence_id, sentence_text, tuples]`` entries
    :return: the JSON string that was written
    """
    list_wai = [
        {"text": entry[1], "new_spo_list": new_spo_list(entry[-1])}
        for entry in Soooooda_list
        if len(entry[-1]) >= 1
    ]

    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_json_' + now_time + '.txt'

    # ensure_ascii=False keeps the Chinese text readable in the file.
    wri = json.dumps(list_wai, ensure_ascii=False)
    with open(new_filename, 'w', encoding='utf-8') as f:
        f.write(wri)
    print("json-txt写入完成")
    return wri


if __name__ == "__main__":
    # Script entry point: run the export selected by ``keytime`` on ``filea``.

    # Module-level globals read by the helpers: the writer functions build
    # their output names from ``file_name`` and ``biaoqian`` appends its
    # results to ``ms_list``.
    file_name = filea
    ms_list = []

    if keytime not in (1, 2, 3, 4):
        # Report a bad mode before touching the input file.
        print(f"请修正keytime={keytime}为1、2、3 或 4, 并重新运行程序")
    else:
        # Only one export runs per invocation, so the file is parsed once.
        entries = readfile(file_name)

        if keytime == 1:
            sanyuanzu1(entries)  # 5-tuple CSV file
        elif keytime == 2:
            biaoqian(entries)  # per-character label file
            writefile(ms_list)
        elif keytime == 3:
            write_Soooooda(Soooooda(entries))  # 7-tuple CSV file
        else:
            Soooooda_json(Soooooda(entries))  # JSON text file