1.将json数据按行读取到列表中
def loaddata(path):
    """Read a JSON-lines file and return each record as a normalized JSON string.

    Every non-blank line is parsed with ``json.loads`` and serialized again
    with ``json.dumps(..., ensure_ascii=False)``, so each returned string is
    valid JSON in a uniform layout with non-ASCII characters (e.g. Chinese)
    kept readable instead of being escaped as ``\\uXXXX``.

    Args:
        path: Path of a UTF-8 encoded text file holding one JSON document
            per line.

    Returns:
        A list of JSON strings, one per non-blank input line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    jsonlist = []
    with open(path, encoding='utf-8') as file:
        # Iterate the handle directly rather than materializing readlines().
        for line in file:
            # Blank lines (commonly a trailing one) carry no record; skipping
            # them avoids a JSONDecodeError on otherwise valid files.
            if not line.strip():
                continue
            record = json.loads(line)
            jsonlist.append(json.dumps(record, ensure_ascii=False))
    return jsonlist
2.分割列表中的数据为训练集和验证集（dev）
def datasplit(data, num_train):
    """Split ``data`` at index ``num_train`` into two consecutive parts.

    Args:
        data: A sliceable sequence.
        num_train: Slice index; elements before it form the first part and
            the remaining elements form the second part.

    Returns:
        A ``(train, dev)`` tuple equal to
        ``(data[:num_train], data[num_train:])``.
    """
    head = slice(None, num_train)
    tail = slice(num_train, None)
    return data[head], data[tail]
3.json数据集格式转换
def json_conversion(file_path):
    """Convert a JSON-lines file of SPO records into relation-mention JSON strings.

    Each non-blank input line is parsed as a JSON object from which the
    ``text`` and ``spo_list`` keys are read. Every entry of ``spo_list``
    contributes its ``subject``, ``object['@value']`` and ``predicate`` as one
    ``{"em1Text", "em2Text", "label"}`` mention. The result for each line is
    ``{"sentText": text, "relationMentions": [...]}`` serialized with
    ``ensure_ascii=False`` so non-ASCII characters stay unescaped.

    Args:
        file_path: Path of a UTF-8 encoded file with one JSON record per line.

    Returns:
        A list of JSON strings, one per non-blank input line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record or triple lacks one of the keys read above.
    """
    json_sentences = []
    with open(file=file_path, encoding='utf-8') as f:
        # Lines are materialized so tqdm can be given a total for its bar.
        data = f.readlines()
    for raw_line in tqdm(data, total=len(data)):
        raw_line = raw_line.strip()
        # Blank lines (e.g. a trailing newline) hold no record; skip them
        # instead of letting json.loads fail.
        if not raw_line:
            continue
        record = json.loads(raw_line)
        text = record['text']
        spo_list = record['spo_list']
        entity_relation_list = []
        for spo in spo_list:
            predicate = spo['predicate']
            # Named object_value so the builtin ``object`` is not shadowed.
            object_value = spo['object']['@value']
            subject = spo['subject']
            entity_relation_list.append(
                {"em1Text": subject, "em2Text": object_value, "label": predicate}
            )
        json_text = {"sentText": text, "relationMentions": entity_relation_list}
        json_sentences.append(json.dumps(json_text, ensure_ascii=False))
    return json_sentences
4.将列表中的数据写入.txt文件中
def write_data(path, data):
    """Write every element of ``data`` to ``path``, one element per line.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Each element is converted with ``str`` and followed by a
    newline; a tqdm progress bar tracks the elements as they are written.

    Args:
        path: Destination file path.
        data: A sized iterable of elements to write.
    """
    with open(path, "w", encoding="utf-8") as fw:
        progress = tqdm(data, total=len(data))
        fw.writelines(str(item) + "\n" for item in progress)