python dataset[trans_TestDatasetTF.py

该博客介绍了如何使用 Python(pandas)加载和处理知识图谱数据,包括从 entity2id.txt 和 relation2id.txt 加载实体字典与关系字典,以及从 train.txt、valid.txt、test.txt 构建训练、验证和测试三元组。通过读取 txt 文件并将实体名称和关系名称映射为对应的 ID,为后续的知识图谱任务做准备。
摘要由CSDN通过智能技术生成

import os

import pandas as pd

class KnowledgeGraph:
    """Load a knowledge-graph dataset from a directory of text files.

    The directory is expected to contain five files, each read with
    ``pandas.read_table(..., header=None)``:

    * ``entity2id.txt``   -- column 0 is the entity key, column 1 its value
      (the id, going by the file name).
    * ``relation2id.txt`` -- column 0 is the relation key, column 1 its value.
    * ``train.txt``, ``valid.txt``, ``test.txt`` -- column 0 is the head
      entity, column 1 the tail entity, column 2 the relation.

    All triples are stored as ``(h, t, r)`` tuples of mapped ids rather than
    the original strings.  The original author's rationale: with the
    TensorFlow APIs a Tensor cannot be converted directly to a Python string,
    but it can be converted to a numpy value, so ids are easier to work with.

    Attributes:
        data_dir: directory the files are read from.
        entity_dict: mapping entity key -> id.
        entities: list of all entity ids (the values of ``entity_dict``).
        relation_dict: mapping relation key -> id.
        n_entity / n_relation: sizes of the two dictionaries.
        training_triples / validation_triples / test_triples: lists of
            ``(h, t, r)`` id triples.
        n_training_triple / n_validation_triple / n_test_triple: their lengths.
        training_triple_pool: set of the training triples.
        golden_triple_pool: union of training, validation and test triples.
    """

    def __init__(self, data_dir):
        """Read the dictionaries and triples under *data_dir*.

        Args:
            data_dir: path of the directory holding the five dataset files.

        Raises:
            KeyError: if a triple file mentions an entity or relation that is
                missing from the corresponding dictionary file.
        """
        self.data_dir = data_dir

        self.entity_dict = {}
        self.entities = []
        self.relation_dict = {}
        self.n_entity = 0
        self.n_relation = 0

        # Lists of triples in the form (h, t, r).
        self.training_triples = []
        self.validation_triples = []
        self.test_triples = []
        self.n_training_triple = 0
        self.n_validation_triple = 0
        self.n_test_triple = 0

        # Dictionaries must be loaded first: the triples are mapped through
        # them while being read.
        self.load_dicts()
        self.load_triples()

        # Construct the lookup pools only after everything is loaded.
        self.training_triple_pool = set(self.training_triples)
        self.golden_triple_pool = (
            set(self.training_triples)
            | set(self.validation_triples)
            | set(self.test_triples)
        )

    def _read_table(self, file_name):
        """Return *file_name* (relative to ``data_dir``) as a header-less DataFrame."""
        return pd.read_table(os.path.join(self.data_dir, file_name), header=None)

    def _read_triples(self, file_name):
        """Return the rows of *file_name* as a list of ``(h, t, r)`` id triples."""
        triple_df = self._read_table(file_name)
        entity_dict = self.entity_dict
        relation_dict = self.relation_dict
        return [
            (entity_dict[h], entity_dict[t], relation_dict[r])
            for h, t, r in zip(triple_df[0], triple_df[1], triple_df[2])
        ]

    def load_dicts(self):
        """Populate the entity and relation dictionaries and their counts."""
        entity_dict_file = 'entity2id.txt'
        relation_dict_file = 'relation2id.txt'

        print('-----Loading entity dict-----')
        entity_df = self._read_table(entity_dict_file)
        self.entity_dict = dict(zip(entity_df[0], entity_df[1]))
        self.n_entity = len(self.entity_dict)
        self.entities = list(self.entity_dict.values())
        print('#entity: {}'.format(self.n_entity))

        print('-----Loading relation dict-----')
        relation_df = self._read_table(relation_dict_file)
        self.relation_dict = dict(zip(relation_df[0], relation_df[1]))
        self.n_relation = len(self.relation_dict)
        print('#relation: {}'.format(self.n_relation))

    def load_triples(self):
        """Populate the training, validation and test triples and their counts.

        Requires ``load_dicts`` to have run, because every head, tail and
        relation is looked up in the dictionaries.
        """
        training_file = 'train.txt'
        validation_file = 'valid.txt'
        test_file = 'test.txt'

        print('-----Loading training triples-----')
        self.training_triples = self._read_triples(training_file)
        self.n_training_triple = len(self.training_triples)
        print('#training triple: {}'.format(self.n_training_triple))

        print('-----Loading validation triples-----')
        self.validation_triples = self._read_triples(validation_file)
        self.n_validation_triple = len(self.validation_triples)
        print('#validation triple: {}'.format(self.n_validation_triple))

        print('-----Loading test triples------')
        self.test_triples = self._read_triples(test_file)
        self.n_test_triple = len(self.test_triples)
        print('#test triple: {}'.format(self.n_test_triple))

一键复制

编辑

Web IDE

原始数据

按行查看

历史

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值