An example of batch file processing with Python

1. This example uses some of the tools from HIT's (Harbin Institute of Technology) LTP natural language processing platform, together with Python string handling, to process a set of data files. I wrote it up as a personal note; if it happens to be useful to you as well, so much the better.
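
Before the full script, here is what the core segmentation and POS-tagging calls look like on their own. This is a minimal sketch assuming the old-style pyltp API used below (Segmentor()/load()/release()) and the LTP 3.4.0 model files; the sentence is just a placeholder.

from pyltp import Segmentor, Postagger

MODEL = 'D:/Users/liang/PycharmProjects/analysisCase/model/ltp_data_v3.4.0/'  # same model directory as in the script

segmentor = Segmentor()
segmentor.load(MODEL + 'cws.model')               # word segmentation model
words = segmentor.segment('患者三天前出现头晕。')  # placeholder sentence
print('\t'.join(words))

postagger = Postagger()
postagger.load(MODEL + 'pos.model')               # POS tagging model
postags = postagger.postag(words)                 # one tag per segmented word
print('\t'.join(postags))

segmentor.release()
postagger.release()

The full batch-processing script follows.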

import os
import re
from pyltp import Segmentor, Postagger

ROOTDIR = 'D:/Users/liang/PycharmProjects/analysisCase/'

DATA = os.path.join(ROOTDIR, 'data/')
MODEL = os.path.join(ROOTDIR, 'model/ltp_data_v3.4.0/')
RESULT = os.path.join(ROOTDIR, 'result/')
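
# Assumed directory layout (inferred from the paths and loops below; adjust to your data):
#   data/<subfolder>/        pairs of annotated .txt and raw .txtoriginal.txt files
#   model/ltp_data_v3.4.0/   the LTP models (cws.model, pos.model, ...)
#   result/one/<subfolder>/  segmentation and POS-tag output (seg-*.txt, postag-*.txt)
#   result/two/<subfolder>/  merged training data (*_train_data files)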


# Filename helpers: derive output file names from an input file name
def get_seg(filename):
    pattern = r'\d+'
    return 'seg-{}.txt'.format(re.findall(pattern, filename)[0])


def get_postag(filename):
    pattern = r'\d+'
    return 'postag-{}.txt'.format(re.findall(pattern, filename)[0])


def get_txt(filename):
    return filename.replace('.txtoriginal', '')


def get_train(filename):
    return filename.replace('.txtoriginal', '_train_data')
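
# Examples of the naming conventions above (the sample file name is hypothetical):
#   get_seg('病史特点-1.txtoriginal.txt')    -> 'seg-1.txt'
#   get_postag('病史特点-1.txtoriginal.txt') -> 'postag-1.txt'
#   get_txt('病史特点-1.txtoriginal.txt')    -> '病史特点-1.txt'
#   get_train('病史特点-1.txtoriginal.txt')  -> '病史特点-1_train_data.txt'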


# Arguments: data directory, result directory for seg/postag output, result directory for training data, file name
def process_pre(data_dir, result_dir, result_dir2, filename):
    f_original = open(os.path.join(data_dir, filename), 'r', encoding='UTF-8')
    data_original = f_original.readlines()
    str1 = ''
    str2 = ''
    # Load the segmentation and POS-tagging models once instead of reloading them for
    # every line.  To use a custom dictionary, call
    # segmentor.load_with_lexicon(model_path, lexicon_path) with the path to a lexicon file.
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODEL, 'cws.model'))
    postagger = Postagger()
    postagger.load(os.path.join(MODEL, 'pos.model'))
    for line in data_original:
        # Word segmentation
        words = segmentor.segment(line)
        for word in words:
            str1 += '\t' + word
        str1 += '\n'
        # POS tagging
        postags = postagger.postag(words)
        for tag in postags:
            str2 += '\t' + tag
        str2 += '\n'
    segmentor.release()
    postagger.release()
    segfilename = get_seg(filename)
    postagfilename = get_postag(filename)
    f1 = open(os.path.join(result_dir, segfilename), 'w+', encoding='UTF-8')
    f2 = open(os.path.join(result_dir, postagfilename), 'w', encoding='UTF-8')
    f1.write(str1)
    f2.write(str2)
    f1.close()
    f2.close()
    f_original.close()
    # Merge the entity annotations into the segmented / POS-tagged data
    # name of the annotated file
    filename2 = get_txt(filename)
    # name of the training-data output file
    filetrain = get_train(filename)
    # open the annotated file
    f2 = open(os.path.join(data_dir, filename2), 'r', encoding='UTF-8')
    # open the merged output (training data) file
    filetrain = open(os.path.join(result_dir2, filetrain), 'w+', encoding='UTF-8')
    # annotated entity lines
    data1 = f2.readlines()
    # merged training data
    data_combine = ""
    # raw original text
    f_original = open(os.path.join(data_dir, filename), 'r', encoding='UTF-8')
    data2 = f_original.read()
    # segmented words and their POS tags produced above
    data_segment = str1.split('\t')
    data_postag = str2.split('\t')
    l = len(data_segment)
    for i, data_s in enumerate(data_segment):
        flag = 0
        if i == 0:
            # the first element of the split is an empty leading string; skip it
            continue
        if i != l - 1:
            # index by position so that repeated words get their own POS tag
            data_combine += data_s + '/' + data_postag[i]
        for ner in data1:
            # each annotation line has the form: entity \t start \t end \t type,
            # e.g. data_split == ['头晕', '39', '40', '症状和体征\n']
            data_split = ner.split('\t')
            if data_s == data2[int(data_split[1]):int(data_split[2]) + 1]:
                flag = 1
                label = data_split[3].strip()
                if label == '症状和体征':      # symptoms and signs
                    data_combine += '#S-Nss '
                    break
                elif label == '检查和检验':    # tests and examinations
                    data_combine += '#S-Nii '
                    break
                elif label == '疾病和诊断':    # diseases and diagnoses
                    data_combine += '#S-Ndd '
                    break
                elif label == '治疗':          # treatments
                    data_combine += '#S-Nt '
                    break
                elif label == '身体部位':      # body parts
                    data_combine += '#S-Npb '
                    break
                else:
                    break
        # words that matched no annotated entity are tagged as outside (#O)
        if flag == 0 and i != l - 1:
            data_combine += '#O '
    filetrain.write(data_combine)
    filetrain.close()
    f_original.close()
    f2.close()


if __name__ == '__main__':
    filedir = DATA + '{}/'
    resultdir = RESULT + "one/{}/"
    resultdir2 = RESULT + "two/{}/"
    # process_pre(filedir, resultdir, resultdir2,filename)

    for root, dirs, files in os.walk(DATA):
        print(root)
        # take every second file, assuming the listing alternates annotated (.txt)
        # and raw (.txtoriginal.txt) files, so that only the raw originals are processed
        for item in files[1::2]:
            # print(root.split('/')[-1], item)
            rootsplit = root.split('/')[-1]
            one = filedir.format(rootsplit)      # data path for this subfolder
            two = resultdir.format(rootsplit)    # output path for the seg/postag results
            three = resultdir2.format(rootsplit) # output path for the merged training data
            process_pre(one, two, three, item)   # data path, both result paths, file name
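
To make the merge step concrete, the toy snippet below shows, without any LTP models, how an annotation line is matched against the raw text by character offsets. The sentence, offsets and entity type are made-up values in the format process_pre expects; real offsets come from the annotated .txt files.

raw_text = '患者三天前出现头晕。'              # stands in for data2 (the raw .txtoriginal text)
annotation_line = '头晕\t7\t8\t症状和体征\n'   # stands in for one line of the annotated file

entity, start, end, etype = annotation_line.split('\t')
span = raw_text[int(start):int(end) + 1]       # the end offset is inclusive, hence the +1
print(span)            # 头晕
print(span == entity)  # True
# a segmented word equal to this span would be written out as, e.g., '头晕/n#S-Nss '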

 
