'''处理数据集'''
# with open('train_data.txt', 'r') as f:
#     train_x = f.readlines()
# with open('train_label.txt', 'r') as f:
#     train_y = f.readlines()
#
# with open('data.txt', 'w') as f:
#     for index_x, x in enumerate(train_x):
#         for index_word, word in enumerate(x.split()):
#             word_label = train_y[index_x].split()[index_word]
#             f.write('{}\t{}\n'.format(word, word_label))
#         f.write('\n')
'''读取数据1'''
'''数据集格式如下
牙 B-ORG
买 I-ORG
加 I-ORG
队 I-ORG
教 O
练 O
西 B-PER
蒙 I-PER
斯 I-PER
'''
# with open("dataset/data.txt", "rb") as f:
#     data = f.read().decode("utf-8")
# train_data = data.split("\n\n")
# train_data = [token.split("\n") for token in train_data]
# train_data = [[j.split() for j in i] for i in train_data]
# train_data.pop()
#
# train_x = [[token[0] for token in sen] for sen in train_data]
# train_y = [[token[1] for token in sen] for sen in train_data]
# Dataset reader 2: two parallel text files. Line i of train_data.txt is split
# on whitespace into tokens, and line i of train_label.txt is split on
# whitespace into the tags for that same line.
with open("train_data.txt", "r", encoding='utf-8') as f:
    data_lines = f.readlines()
with open("train_label.txt", "r", encoding='utf-8') as f:
    label_lines = f.readlines()

# The two files are paired purely by line number, so a differing line count
# means they are misaligned. Fail with an explicit message instead of silently
# ignoring surplus label lines or dying with a bare IndexError.
if len(data_lines) != len(label_lines):
    raise ValueError(
        "train_data.txt has {} lines but train_label.txt has {} lines".format(
            len(data_lines), len(label_lines)
        )
    )

# str.split() with no argument splits on any run of whitespace and discards
# the trailing newline; a blank line becomes an empty list.
data = [sen.split() for sen in data_lines]
label = [tags.split() for tags in label_lines]
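# Source article title: "两种NER数据集读取" (two ways of reading an NER dataset).
# Web-page residue: "最新推荐文章于 2023-04-24 20:52:30 发布" (latest recommended article published 2023-04-24 20:52:30).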