Using Python to process the Sohu dataset's XML files and convert them into the input format for an NER task:
import xml.dom.minidom as dom
from sklearn import model_selection
def statistics():
    """Report the max/min article length in the flattened sohu.txt file."""
    data_path = '../data/sohu/sohu.txt'
    length = []
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            content, keywords = line.split('\t')
            length.append(len(content))
    print(max(length), min(length))
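# Each line of sohu.txt is expected to hold one article in the form
#   <article text with all whitespace removed>\t<space-separated keywords>
# (hypothetical example: '某市发生抢劫案……\t抢劫 案件'); both statistics() above and
# deal_dataset_for_nlp() below rely on this single-tab layout.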
def convert_xml2txt():
    """Flatten the 1000 XML files into one sohu.txt, one article per line."""
    xmlPath = '../data/sohu-dataset/'
    txtPath = '../data/sohu/'
    with open(txtPath + 'sohu.txt', 'w', encoding='utf-8') as fp:
        for i in range(1, 1001):
            # File names are zero-padded to five digits: 00001.xml ... 01000.xml.
            name = xmlPath + str(i).zfill(5) + '.xml'
            domTree = dom.parse(name)
            data = domTree.documentElement
            keywords = data.getElementsByTagName('keywords')[0].firstChild.data
            # Strip all whitespace from the body so each article fits on one line.
            content = data.getElementsByTagName('content')[0].firstChild.data \
                .replace('\n', '').replace(' ', '').replace('\t', '')
            fp.write(content + '\t' + keywords + '\n')
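# A minimal sketch of the per-file layout that convert_xml2txt() assumes -- only
# the <keywords> and <content> tags appear in the extraction code, so any other
# fields in the real files would simply be ignored:
#   <doc>
#       <content>正文文本……</content>
#       <keywords>关键词1 关键词2</keywords>
#   </doc>
# Quick in-memory check of the same minidom calls on a made-up document:
_sample = dom.parseString(
    '<doc><keywords>测试 示例</keywords><content>这是 一段 正文</content></doc>'
).documentElement
assert _sample.getElementsByTagName('keywords')[0].firstChild.data == '测试 示例'
assert (_sample.getElementsByTagName('content')[0].firstChild.data
        .replace(' ', '') == '这是一段正文')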
def deal_dataset_for_nlp():
    data_path = '../data/sohu/sohu.txt'
    total_data = []
    output_path = 'sohu/'

    def findAll(substr, text):
        """Return the start index of every occurrence of substr in text."""
        result = []
        index = 0
        while True:
            temIndex = text.find(substr, index)
            if temIndex == -1:
                break
            result.append(temIndex)
            index = temIndex + 1
        return result
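    # Note that matches may overlap: findAll('aa', 'aaaa') returns [0, 1, 2],
    # because the scan advances one character past each hit rather than jumping
    # past the whole match.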
    with open(data_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            content, keywords = line.split('\t')
            tag_list = ['O'] * len(content)
            keywords = keywords.split(' ')
            # For nested keywords, e.g. (强奸, 强奸犯), the longer keyword takes
            # priority: sorting by length tags longer keywords last, so they
            # overwrite the tags written by any shorter keyword they contain.
            keywords.sort(key=len)
            for keyword in keywords:
                keyword = keyword.strip()
                if keyword != '':
                    length = len(keyword)
                    begin_indexs = findAll(keyword, content)
                    for begin_index in begin_indexs:
                        if length == 1:
                            tag_list[begin_index] = 'S-Sensitive'
                        else:
                            # BMES scheme: B = begin, M = middle, E = end, S = single.
                            tag_list[begin_index] = 'B-Sensitive'
                            for i in range(1, length - 1):
                                tag_list[begin_index + i] = 'M-Sensitive'
                            tag_list[begin_index + length - 1] = 'E-Sensitive'
            total_data.append({"content": content, "tag": tag_list})
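    # Worked example of the BMES tagging above (hypothetical line): for content
    # '他在某市抢劫' with keyword '抢劫', tag_list becomes
    # ['O', 'O', 'O', 'O', 'B-Sensitive', 'E-Sensitive']; a single-character
    # keyword would get 'S-Sensitive' instead.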
    # Split 8:1:1 into train/dev/test: first carve off 1/10 as the test set,
    # then 1/9 of the remaining 9/10 (i.e. another 1/10 of the total) as dev.
    train_dev, test = model_selection.train_test_split(total_data, test_size=1 / 10, random_state=0)
    train, dev = model_selection.train_test_split(train_dev, test_size=1 / 9, random_state=0)
    print(f'total size: {len(total_data)} train size: {len(train)} '
          f'dev size: {len(dev)} test size: {len(test)}')
    def write_data_for_nlp(path, data):
        """Write one character and its tag per line, blank line between articles."""
        with open(path, 'w', encoding='utf-8') as writer:
            for line in data:
                content = line['content']
                tag_list = line['tag']
                for i in range(len(content)):
                    writer.write(content[i] + ' ' + tag_list[i] + '\n')
                writer.write('\n')

    write_data_for_nlp(output_path + 'train.txt', train)
    write_data_for_nlp(output_path + 'dev.txt', dev)
    write_data_for_nlp(output_path + 'test.txt', test)
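    # The emitted files are in the usual two-column character/tag format, one
    # character per line with a blank line between articles, e.g. (hypothetical):
    #   抢 B-Sensitive
    #   劫 E-Sensitive
    #   案 O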
# convert_xml2txt()
# deal_dataset_for_nlp()
statistics()
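The three steps are meant to run in order: convert_xml2txt() first, to flatten the 1000 XML files into sohu.txt; then deal_dataset_for_nlp(), to emit train.txt/dev.txt/test.txt (note that open(..., 'w') does not create the sohu/ output directory, so it has to exist beforehand); statistics() only reports the longest and shortest article and can run any time after the first step.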