#好多的数据集比如ccks2017,ccks2018等数据集只给出了整个的数据集并没有进行分割。而对于DiaKG数据,官方给出的既包含实体信息也包含关系抽取的信息,显得比较冗余,并且分割成了几十个json文件,导致我们在使用的时候比较麻烦。这几天刚好想用这个数据集做一下医学实体识别的实验,就对该数据集进行了整理和分割。有什么不足之处,或者实现错误的地方,还请多多指教!!!
import os
import json
from sklearn.model_selection import train_test_split
data_filename = "D:/Axial_Attention/dataset/0521_new_format"
all_data = []
all_label = []
#遍历文件夹下所有的文件是使用os.listdir()
def data_process(file_name):
    """Extract NER samples from every DiaKG-style JSON file in a directory.

    Walks ``file_name``, parses each ``*.json`` file, and collects one
    sample per sentence, keeping only the entity annotations (the relation
    information in the raw files is dropped).

    Args:
        file_name: path to the directory containing the raw JSON files.

    Returns:
        A ``(sentences, labels)`` pair where ``sentences[i]`` is the raw
        sentence text and ``labels[i]`` is a list of
        ``[start, end, entity_type]`` triples.  ``end`` is converted from
        the exclusive ``end_idx`` of the raw data to an inclusive index.
    """
    # Local accumulators (fix: the original appended to module-level
    # globals, so calling the function twice duplicated every sample).
    sentences = []
    labels = []
    for filename in os.listdir(file_name):
        # Only the JSON annotation files are relevant.
        if not filename.endswith('.json'):
            continue
        # Fix: join with the function argument, not the module-level
        # constant, so the function works for any directory it is given.
        file_path = os.path.join(file_name, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for paragraph in data['paragraphs']:
            for s in paragraph['sentences']:
                sentences.append(s['sentence'])
                # end_idx is exclusive in the raw annotation; store an
                # inclusive end index for the NER training format.
                spans = [[e['start_idx'], e['end_idx'] - 1, e['entity_type']]
                         for e in s['entities']]
                labels.append(spans)
    return sentences, labels
if __name__ == '__main__':
    # Flatten the raw dataset and persist it as a JSON-lines file:
    # one {'sentence': ..., 'label': ...} object per line.
    texts, spans = data_process(data_filename)
    with open('../alldata.json', 'w', encoding='utf-8') as out:
        for text, span in zip(texts, spans):
            record = {'sentence': text, 'label': span}
            out.write(json.dumps(record, ensure_ascii=False) + '\n')
上面的代码是进行数据的整理,从原始的数据集中抽取做实体识别的数据并进行保存。
import json
from sklearn.model_selection import train_test_split
# 假设data是一个包含所有数据的列表,每个元素是一个字典
# 这里我们使用文档中的内容作为示例
# Reload the consolidated JSON-lines file produced by the first script.
sentences, entity_labels = [], []
with open('../alldata.json', encoding='utf-8') as f:
    for raw in f:
        record = json.loads(raw)
        sentences.append(record["sentence"])
        entity_labels.append(record['label'])
# Split 70/15/15: first carve off 30% as a temporary pool with a fixed
# seed for reproducibility, then halve that pool into dev and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    sentences, entity_labels, test_size=0.3, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)
# Pair each text with its entity spans in the output record format.
train_data = [{'text': t, 'labels': spans} for t, spans in zip(X_train, y_train)]
dev_data = [{'text': t, 'labels': spans} for t, spans in zip(X_dev, y_dev)]
test_data = [{'text': t, 'labels': spans} for t, spans in zip(X_test, y_test)]
# Persist each split as a JSON-lines file, one record per line.
# (Fix: the original repeated the same write loop three times and kept a
# block of commented-out dead code; a single loop over (path, split)
# pairs produces byte-identical files.)
for out_path, split in (('../train.json', train_data),
                        ('../dev.json', dev_data),
                        ('../test.json', test_data)):
    with open(out_path, "w", encoding="utf-8") as fw:
        for record in split:
            json.dump(record, fw, ensure_ascii=False)
            fw.write('\n')
上面的代码是使用sklearn库将数据集按 70/15/15 分成了训练集、验证集和测试集。