裁判文书关键信息提取

 项目初期的部分代码,比较粗糙,随便看看就行了,效果不太好,且我后面又改了很多。

2021年11月8日

"""单个文件等级地处理"""
# 程序实现:1.传入一个路径(doc或docx文件路径,系统会自动判断是doc文件还是docx文件,doc文件会转换为docx文件),在本路径下生成同文件名json文件。2.传入一个路径,返回一个列表(内嵌套固定类型字典)。

import os
import docx
import json
from process_doc import doc2docx_1


# 通过文件路径解析信息(txt文件编码类型为ANSI),获取案件名(文件名)、案由、案件类型、文书类型
def _last_term_in_name(term_file, casename):
    """Return the last non-blank line of *term_file* contained in *casename*.

    The file is opened with the platform default encoding and read one term
    per line. Returns '' when no term matches.
    """
    matched = ''
    with open(term_file, "r") as fh:
        for line in fh:
            term = line.strip()
            # A blank line would otherwise become '' which is "in" every
            # string and would wipe out an earlier, genuine match.
            if term and term in casename:
                matched = term
    return matched


def case_name(filepath, anyoutxt, leixingtxt, papertypetxt, casetypetxt):
    """Derive case metadata from the file name of a judgment document.

    The file name without directory and extension is used as the case name.
    Each keyword file lists one term per line; a term is selected when it is
    a substring of the case name, and when several terms match the one
    listed last in the file wins.

    Args:
        filepath: Path of the document whose name is analysed.
        anyoutxt: Keyword file for the cause of action.
        leixingtxt: Keyword file for the trial procedure.
        papertypetxt: Keyword file for the document type.
        casetypetxt: Keyword file for the case type.

    Returns:
        Tuple ``(casename, anyou, chengxu, papertype, casetype)``. A field
        that could not be determined is the string ``'void'``; the last two
        fields are only meant to be passed on to case_base_information.
    """
    casename = os.path.splitext(os.path.basename(filepath))[0]

    anyou = _last_term_in_name(anyoutxt, casename)
    chengxu = _last_term_in_name(leixingtxt, casename)
    papertype = _last_term_in_name(papertypetxt, casename)
    casetype = _last_term_in_name(casetypetxt, casename)

    # Report every missing field and replace it with the 'void' sentinel.
    if casename == '':
        print('未找到案件名')
        casename = 'void'
    if anyou == '':
        print("未找到案由!")
        anyou = 'void'
    if chengxu == '':
        print("匹配审理程序失败!")
        chengxu = 'void'
    if papertype == '':
        print("解析文书类型失败!")
        papertype = 'void'
    if casetype == '':
        print("解析案件类型失败!")
        casetype = 'void'
    return (casename, anyou, chengxu, papertype, casetype)


# 读取docx文档,并将每段存入一个列表
def read_docx(path):
    """Read a .docx file and return its paragraphs as a list of strings.

    Every space character is removed from each paragraph's text. Empty
    paragraphs are kept, so list positions match the document's paragraphs.

    Args:
        path: Path of the .docx file to read.

    Returns:
        List with one string per paragraph of the document.

    Raises:
        SystemExit: With status 0 when *path* does not exist (the message is
            printed first). The status is kept at 0 to preserve the original
            behaviour of ending the program "without error".
    """
    if not os.path.exists(path):
        print('发生错误:\n'+path+'文件不存在!')
        # Raise SystemExit directly: the exit()/quit() helpers are injected by
        # the site module for interactive use and may be absent in programs.
        raise SystemExit(0)
    document = docx.Document(path)
    paras_temp = [para.text.replace(" ", "") for para in document.paragraphs]
    print(path + ":  文件读取成功")
    return paras_temp


# 传入读取的文件段落列表,获取案件基本信息:法院,文书性质,案号,判决日期
def _term_found_in_text(term_file, text, fallback):
    """Return the last non-blank line of *term_file* contained in *text*.

    The file is opened with the platform default encoding. *fallback* is
    returned when no term matches.
    """
    found = fallback
    with open(term_file, "r") as fh:
        for line in fh:
            term = line.strip()
            # Skip blank lines: '' is a substring of every string.
            if term and term in text:
                found = term
    return found


def case_base_information(paras, papertype, casetype, papertypetxt, casetypetxt):
    """Extract basic case information from the paragraphs of a judgment.

    The first three paragraphs are taken as court, document title and case
    number. The remaining fields are found by scanning all paragraphs for
    marker substrings.

    Args:
        paras: List of paragraph strings (as returned by read_docx).
        papertype: Document type already derived from the file name, or
            ``'void'`` to look it up in the document title instead.
        casetype: Case type already derived from the file name, or ``'void'``
            to look it up in the document title instead.
        papertypetxt: Keyword file for document types (read only when
            *papertype* is ``'void'``).
        casetypetxt: Keyword file for case types (read only when *casetype*
            is ``'void'``).

    Returns:
        Tuple ``(papertype, casetype, case_court, case_number, judge_day,
        defendants, prosecution_organ, law_relate)``. Missing string fields
        are ``'void'``; missing list fields are ``['void']``.
    """
    # Pad the header so documents with fewer than three paragraphs yield the
    # 'void' sentinel instead of raising IndexError.
    header = list(paras[:3])
    header += ['void'] * (3 - len(header))
    case_court, doc_title, case_number = header

    # Fall back to the document title when the file name gave no answer.
    if papertype == 'void':
        papertype = _term_found_in_text(papertypetxt, doc_title, 'void')
    if casetype == 'void':
        casetype = _term_found_in_text(casetypetxt, doc_title, 'void')

    prosecution_organ = ''  # name of the public prosecution organ
    people_start = 0        # index of the first participant paragraph
    people_end = 0          # index of the last participant paragraph
    law_start = 0           # index of the first "related law" paragraph
    judge_day = 'void'

    # enumerate() gives the real position of each paragraph; list.index()
    # would return the first *equal* paragraph and break on duplicates.
    for idx, p in enumerate(paras):
        # Only the first paragraph naming the prosecution organ is used.
        if '公诉机关' in p and not prosecution_organ:
            prosecution_organ = p.replace('公诉机关', '').replace('。', '')
            people_start = idx + 1
        # A short paragraph containing all of year/month/day markers is the
        # judgment date; the last such paragraph wins.
        if all(mark in p for mark in '年月日') and len(p) <= 12:
            judge_day = p
        # Start of the appended law articles; the last marker wins.
        if '法律条文' in p:
            law_start = idx
        # Each paragraph mentioning the organ's name moves the end of the
        # participant section to the paragraph before it.
        # NOTE(review): the last mention wins, as in the original code; the
        # first mention after the header may have been intended — confirm.
        if prosecution_organ and prosecution_organ in p:
            people_end = idx - 1

    # Participants (defendants and defenders) sit between the organ header
    # and the paragraph before the last mention of the organ. Without an
    # organ there is no section to slice (both indexes would be 0 and the
    # court paragraph would be returned by mistake).
    defendants = []
    if prosecution_organ and people_start <= people_end:
        defendants = list(paras[people_start:people_end + 1])

    # Everything from the law marker to the end of the document.
    law_relate = list(paras[law_start:]) if law_start > 0 else []

    if prosecution_organ == '':
        print('未提取到公诉机关')
        prosecution_organ = 'void'
    if not defendants:
        print('未提取到被告人信息')
        defendants.append('void')
    if not law_relate:
        print('文书未附相关法律条文')
        law_relate.append('void')
    if judge_day == 'void':
        print('提取判决日期失败')
    return (papertype, casetype, case_court, case_number, judge_day, defendants, prosecution_organ, law_relate)


# 将信息存入字典(在此设置字典)(暂时不用)
def information_dict(casename, anyou, chengxu, papertype, casetype, fayuan, wenshuleixing, anhao, panjueriqi, beigaoren, gongsuren):
    """Bundle the extracted fields into a single dictionary.

    Args:
        casename: Case name.
        anyou: Cause of action.
        chengxu: Trial procedure.
        papertype: Document type matched from the keyword list.
        casetype: Case type.
        fayuan: Court.
        wenshuleixing: Document title/nature as read from the document.
        anhao: Case number.
        panjueriqi: Judgment date.
        beigaoren: Defendant information.
        gongsuren: Prosecutor.

    Returns:
        Dictionary mapping the Chinese field labels to the given values.
    """
    # The original literal used the key '文书类型' twice, so papertype was
    # silently overwritten by wenshuleixing. The document-derived value now
    # has its own key so both values are kept.
    info_dict_temp = {
        '案件名': casename,
        '案号': anhao,
        '案由': anyou,
        '程序': chengxu,
        '文书类型': papertype,
        '案件类型': casetype,
        '法院': fayuan,
        '文书性质': wenshuleixing,
        '判决日期': panjueriqi,
        '被告人': beigaoren,
        '公诉人': gongsuren,
    }
    return info_dict_temp


# 将信息存入列表(内嵌套字典)
def information_list(casename, anyou, chengxu, papertype, anjianleixing,
                     fayuan, anhao, panjueriqi, beigaoren, gongsuren, law_ralated):
    """Return the extracted fields as a list of single-entry dictionaries.

    Each element maps one Chinese field label to the corresponding argument;
    the order of the elements is fixed.
    """
    labelled_values = (
        ('案件名', casename),
        ('案由', anyou),
        ('程序', chengxu),
        ('文书类型', papertype),
        ('案件类型', anjianleixing),
        ('法院', fayuan),
        ('案号', anhao),
        ('判决日期', panjueriqi),
        ('被告人', beigaoren),
        ('公诉人', gongsuren),
        ('相关法律', law_ralated),
    )
    return [{label: value} for label, value in labelled_values]


# 字典转json文件(savefile_load是文件夹路径)
def dict2json(info_dict, savefile_load, casename):
    """Serialise *info_dict* to ``<savefile_load>/<casename>.json``.

    Args:
        info_dict: JSON-serialisable object to write.
        savefile_load: Directory in which the file is created.
        casename: File name without the ``.json`` extension.
    """
    target = os.path.join(savefile_load, casename + '.json')
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly; the locale default encoding may be unable
    # to encode them and would not produce standard JSON.
    with open(target, 'w', encoding='utf-8') as fh:
        fh.write(json.dumps(info_dict, ensure_ascii=False))


def list_output(file_path, data_dir='D:/projects_pycharm/lawdocs2excel/data_access/data'):
    """Extract the key information of one judgment document.

    A ``.doc`` file is first converted with doc2docx_1 and the resulting
    ``.docx`` (same path with an added ``x``) is processed instead.

    Args:
        file_path: Path of the .doc or .docx judgment document.
        data_dir: Directory holding the keyword files ``anyou.txt``,
            ``chengxu.txt``, ``paper_type.txt`` and ``case_type.txt``.
            Defaults to the location used by the original project.

    Returns:
        List of single-entry dictionaries as built by information_list.
    """
    # Compare the extension case-insensitively so '.DOC' is converted too.
    if os.path.splitext(file_path)[-1].lower() == ".doc":
        print('传入文件格式为doc,将转换为docx格式。')
        doc2docx_1(file_path)
        file_path = file_path + 'x'

    # The keyword files are kept external so they can be updated later
    # without touching the code.
    anyoutxt = data_dir + '/anyou.txt'
    leixingtxt = data_dir + '/chengxu.txt'
    papertypetxt = data_dir + '/paper_type.txt'
    casetypetxt = data_dir + '/case_type.txt'

    # Step 1: metadata that can be read off the file name.
    (casename, anyou, chengxu, papertype0, casetype0) = \
        case_name(file_path, anyoutxt, leixingtxt, papertypetxt, casetypetxt)

    # Step 2: paragraphs of the document body.
    paras = read_docx(file_path)

    # Step 3: metadata found in the body; also fills in document/case type
    # when the file name did not provide them.
    (papertype, casetype, fayuan, anhao, panjueriqi, beigaoren, gongsuren, law_ralated) \
        = case_base_information(paras, papertype0, casetype0, papertypetxt, casetypetxt)

    info_list = information_list(casename, anyou, chengxu, papertype, casetype, fayuan, anhao,
                                 panjueriqi, beigaoren, gongsuren, law_ralated)
    return info_list


if __name__ == '__main__':
    # Manual smoke run: process one sample document and print every
    # extracted field on its own line.
    sample_document = 'E:/NLP/doc2docx_test/王云清应先勇等强迫交易罪非法占用农用地罪夏君兵王某29强迫交易罪一审刑事判决书.doc'
    for entry in list_output(sample_document):
        print(entry)

以下为文件内容举例。

anyou.txt

重大环境污染事故罪
污染环境罪
非法处置进口的固体废物罪
擅自进口固体废物罪
非法捕捞水产品罪
非法猎捕、杀害珍贵、濒危野生动物罪
非法收购、运输、出售珍贵、濒危野生动物、珍贵、濒危野生动物制品罪
非法狩猎罪
非法占用耕地罪
非法占用农用地罪
非法采矿罪
破坏性采矿罪
非法采伐、毁坏国家重点保护植物罪
非法收购、运输、加工、出售国家重点保护植物、国家重点保护植物制品罪
非法采伐、毁坏珍贵树木罪
盗伐林木罪
非法收购、运输盗伐、滥伐的林木罪
非法收购盗伐、滥伐的林木罪

paper_type.txt

判决书
裁定书
调解书
决定书
通知书

代码补充:

# 实现单个doc文件转换为docx文件
def doc2docx_1(doc_path):
    """Convert a single .doc file to .docx through a Word application object.

    The converted file is saved next to the original under the same path
    with an ``x`` appended.

    Args:
        doc_path: Path of the .doc file to convert.
    """
    # NOTE(review): `wc` is not defined in the visible code; presumably it is
    # `win32com.client` imported in the enclosing module — confirm.
    word = wc.Dispatch("Word.Application")
    try:
        word.Visible = 0  # 1 shows the Word window, 0 keeps it hidden
        print(doc_path)
        doc = word.Documents.Open(doc_path)
        try:
            docx_save_path = doc_path + 'x'
            # 12 selects the docx output format.
            doc.SaveAs(docx_save_path, 12, False, "", True, "", False, False, False, False)
        finally:
            # Close the document even when saving fails.
            doc.Close()
    finally:
        # Always quit Word so a failed conversion does not leave a hidden
        # Word process running.
        word.Quit()

  • 4
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 10
    评论
评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值