# 一. 这个实例是利用哈工大的自然语言处理平台（LTP）里的一些工具，加上 Python 对字符串的处理来对数据进行预处理；本人用作笔记之用，如果也能对你有用，那当然是乐意之至。
import sys
import os
import re
from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
# Project root (machine-specific absolute path — adjust per deployment).
ROOTDIR = 'D:/Users/liang/PycharmProjects/analysisCase/'
# Raw input corpus directory.
DATA = os.path.join(ROOTDIR, 'data/')
# Pretrained LTP v3.4.0 model directory (contains cws.model, pos.model, ...).
MODEL = os.path.join(ROOTDIR, 'model/ltp_data_v3.4.0/')
# Output directory for intermediate and final results.
RESULT = os.path.join(ROOTDIR, 'result/')
# Filename converters: map a raw data filename to its derived output filenames.
def get_seg(filename):
    """Build the segmentation-result filename from the numeric id in *filename*.

    Example: '1.txtoriginal.txt' -> 'seg-1.txt'.
    Raises IndexError if *filename* contains no digits.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons).
    return 'seg-{}.txt'.format(re.findall(r'\d+', filename)[0])
def get_postag(filename):
    """Build the POS-tag-result filename from the numeric id in *filename*.

    Example: '1.txtoriginal.txt' -> 'postag-1.txt'.
    Raises IndexError if *filename* contains no digits.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons).
    return 'postag-{}.txt'.format(re.findall(r'\d+', filename)[0])
def get_txt(filename):
    """Drop every '.txtoriginal' marker, yielding the annotation filename.

    Example: '1.txtoriginal.txt' -> '1.txt'.
    """
    pieces = filename.split('.txtoriginal')
    return ''.join(pieces)
def get_train(filename):
    """Map an original-text filename to its training-data filename.

    Example: '1.txtoriginal.txt' -> '1_train_data.txt'.
    """
    pieces = filename.split('.txtoriginal')
    return '_train_data'.join(pieces)
# Args: data directory, seg/postag result directory, training-data result directory, filename.
def process_pre(data_dir, result_dir, result_dir2, filename):
    """Segment, POS-tag and NER-merge one raw medical-record file.

    Reads ``data_dir/filename`` (raw text), then writes:
      - ``result_dir/seg-N.txt``    — tab-prefixed word segmentation, one line per input line
      - ``result_dir/postag-N.txt`` — tab-prefixed POS tags, aligned with the segmentation
      - ``result_dir2/<base>_train_data`` — 'word/pos' tokens each followed by an
        entity tag ('#S-Nss ' etc., or '#O ') looked up in the hand-annotated
        span file ``data_dir/<filename without .txtoriginal>``.

    NOTE(review): the original source had lost its indentation; the nesting
    below was reconstructed from statement order — verify against the
    original script before relying on edge-case behaviour.
    """
    with open(os.path.join(data_dir, filename), 'r', encoding='UTF-8') as f_original:
        data_original = f_original.readlines()

    # Load the LTP models ONCE instead of once per input line (pure
    # speed-up; the produced output is unchanged).
    # NOTE(review): the original passes *filename* (the data file itself)
    # as the user lexicon — kept as-is, but this looks like it was meant
    # to be a dictionary file; confirm.
    segmentor = Segmentor()
    segmentor.load_with_lexicon(os.path.join(MODEL, 'cws.model'), filename)
    postagger = Postagger()
    postagger.load(os.path.join(MODEL, 'pos.model'))

    seg_chunks = []   # per input line: '\tw1\tw2...\n'
    pos_chunks = []   # per input line: '\tp1\tp2...\n' (aligned with seg_chunks)
    try:
        for line in data_original:
            words = list(segmentor.segment(line))
            postags = list(postagger.postag(words))
            seg_chunks.append(''.join('\t' + w for w in words) + '\n')
            pos_chunks.append(''.join('\t' + p for p in postags) + '\n')
    finally:
        # Release native resources even if tagging raises.
        segmentor.release()
        postagger.release()
    str1 = ''.join(seg_chunks)
    str2 = ''.join(pos_chunks)

    # Persist the segmentation / POS results.
    with open(os.path.join(result_dir, get_seg(filename)), 'w+', encoding='UTF-8') as f1:
        f1.write(str1)
    with open(os.path.join(result_dir, get_postag(filename)), 'w', encoding='UTF-8') as f2:
        f2.write(str2)

    # ---- Merge with the hand-annotated entity spans ----
    # Annotation file: one span per line, e.g. '头晕\t39\t40\t症状和体征\n'.
    with open(os.path.join(data_dir, get_txt(filename)), 'r', encoding='UTF-8') as f_ann:
        data1 = f_ann.readlines()
    # Raw text, used to resolve the [start, end] character offsets above.
    with open(os.path.join(data_dir, filename), 'r', encoding='UTF-8') as f_raw:
        data2 = f_raw.read()

    # Annotation category -> training tag (dispatch table replaces the
    # original if/elif chain; an unknown category yields no tag, as before).
    tag_by_label = {
        '症状和体征': '#S-Nss ',
        '检查和检验': '#S-Nii ',
        '疾病和诊断': '#S-Ndd ',
        '治疗': '#S-Nt ',
        '身体部位': '#S-Npb ',
    }

    data_segment = str1.split('\t')
    data_postag = str2.split('\t')
    l = len(data_segment)
    data_combine = ''
    for i, data_s in enumerate(data_segment):
        if i == 0 or i == l - 1:
            # Index 0 is the empty string before the leading '\t'; the last
            # token carries the trailing '\n' and is skipped, matching the
            # original behaviour.
            continue
        # BUG FIX: the original looked the tag up via
        # data_segment.index(data_s), which returns the FIRST occurrence and
        # therefore attached the wrong POS tag to every repeated word.
        # seg and postag streams are split identically, so index i aligns.
        data_combine += data_s + '/' + data_postag[i]
        flag = 0
        for ner in data1:
            data_split = ner.split('\t')  # ['头晕', '39', '40', '症状和体征\n']
            if data_s == data2[int(data_split[1]):int(data_split[2]) + 1]:
                flag = 1
                label = data_split[3].rstrip('\n')
                if label in tag_by_label:
                    data_combine += tag_by_label[label]
                break  # first matching span wins, tagged or not
        if flag == 0:
            data_combine += '#O '

    with open(os.path.join(result_dir2, get_train(filename)), 'w+', encoding='UTF-8') as f_train:
        f_train.write(data_combine)
if __name__ == '__main__':
    # Per-folder path templates; '{}' is replaced by the folder name.
    filedir = DATA + '{}/'
    resultdir = RESULT + "one/{}/"
    resultdir2 = RESULT + "two/{}/"
    for root, dirs, files in os.walk(DATA):
        print(root)
        # BUG FIX: the original used root.split('/')[-1], which fails on
        # Windows where os.walk joins deeper sub-paths with '\\';
        # os.path.basename handles both separators.  Hoisted out of the
        # inner loop — it is invariant per directory.
        folder = os.path.basename(root)
        one = filedir.format(folder)      # raw data path for this folder
        two = resultdir.format(folder)    # seg/postag result path
        three = resultdir2.format(folder) # merged training-data path
        # Files come in annotation/original pairs; sorting makes the 1::2
        # slice deterministically pick the '*.txtoriginal.txt' members
        # (os.walk yields names in arbitrary OS order).
        for item in sorted(files)[1::2]:
            # Pass data path, both result paths, and the filename.
            process_pre(one, two, three, item)