项目初期的部分代码,比较粗糙,随便看看就行了,效果不太好,且我后面又改了很多。
2021年11月8日
"""单个文件等级地处理"""
# 程序实现:1.传入一个路径(doc或docx文件路径,系统会自动判断是doc文件还是docx文件,doc文件会转换为docx文件),在本路径下生成同文件名json文件。2.传入一个路径,返回一个列表(内嵌套固定类型字典)。
import os
import docx
import json
from process_doc import doc2docx_1
# 通过文件路径解析信息(txt文件编码类型为ANSI),获取案件名(文件名)、案由、案件类型、文书类型
def case_name(filepath, anyoutxt, leixingtxt, papertypetxt, casetypetxt):
# 获取案件名
(path, filename) = os.path.split(filepath)
(casename, ext) = os.path.splitext(filename)
# 获取案由
anyou = ''
f01 = open(anyoutxt, "r") # 设置文件对象,其中txt文件编码类型为ANSI
data01 = f01.readlines() # 直接将文件中按行读到list里,效果与方法2一样
f01.close() # 关闭文件
for ay in data01:
ay = ay.replace('\n', '') # 删除回车
if ay in casename:
anyou = ay
# 获取审理程序
chengxu = ''
f02 = open(leixingtxt, "r") # 设置文件对象
data02 = f02.readlines() # 直接将文件中按行读到list里,效果与方法2一样
f02.close() # 关闭文件
for ay in data02:
ay = ay.replace('\n', '') # 删除回车
if ay in casename:
chengxu = ay
# 获取文书类型
papertype = ''
f03 = open(papertypetxt, "r") # 设置文件对象
data03 = f03.readlines() # 直接将文件中按行读到list里,效果与方法2一样
f03.close() # 关闭文件
for ay in data03:
ay = ay.replace('\n', '') # 删除回车
if ay in casename:
papertype = ay
# 获取诉讼程序
casetype = ''
f04 = open(casetypetxt, "r") # 设置文件对象
data04 = f04.readlines() # 直接将文件中按行读到list里,效果与方法2一样
f04.close() # 关闭文件
for ay in data04:
ay = ay.replace('\n', '') # 删除回车
if ay in casename:
casetype = ay
if casename =='':
print('未找到案件名')
casename = 'void'
if anyou == '':
print("未找到案由!")
anyou = 'void'
if chengxu == '':
print("匹配审理程序失败!")
chengxu = 'void'
if papertype == '':
print("解析文书类型失败!")
papertype = 'void'
if casetype == '':
print("解析案件类型失败!")
casetype = 'void'
return (casename, anyou, chengxu, papertype, casetype) # 后两项仅用于传入case_base_information函数
# 读取docx文档,并将每段存入一个列表
def read_docx(path):
if not os.path.exists(path):
print('发生错误:\n'+path+'文件不存在!')
exit(0) # 无错误结束程序
doc = docx.Document(path)
paras_temp = [] # 用来存储段落
for p in doc.paragraphs:
p1 = p.text.replace(" ", "") # 去除空格,并以字符串的形式存储在列表中
paras_temp.append(p1)
print(path + ": 文件读取成功")
return paras_temp
# 传入读取的文件段落列表,获取案件基本信息:法院,文书性质,案号,判决日期
def case_base_information(paras, papertype, casetype, papertypetxt, casetypetxt):
case_court = paras[0]
type = paras[1]
case_number = paras[2]
# 变量名分配
prosecution_organ = '' # 公诉机关
part_people_index0 = 0 # 参与人第一段索引(初始化
part_people_index1 = 0 # 参与人最后一段索引(初始化
law_index0 = 0 # 相关法律初始段(初始化
defendants = [] # 被告人
law_relate = []
# 获取文书类型
if papertype == 'void':
f03 = open(papertypetxt, "r") # 设置文件对象
data03 = f03.readlines() # 直接将文件中按行读到list里,效果与方法2一样
f03.close() # 关闭文件
for ay in data03:
ay = ay.replace('\n', '') # 删除回车
if ay in type:
papertype = ay
# 获取诉讼程序
if casetype == 'void':
f04 = open(casetypetxt, "r") # 设置文件对象
data04 = f04.readlines() # 直接将文件中按行读到list里,效果与方法2一样
f04.close() # 关闭文件
for ay in data04:
ay = ay.replace('\n', '') # 删除回车
if ay in type:
casetype = ay
# 遍历检索
judge_day = 'void'
n1 = 1
for p in paras:
# 提取公诉机关(检察院)
if '公诉机关' in p:
if len(prosecution_organ) <= 0:
prosecution_organ = p.replace('公诉机关','')
prosecution_organ = prosecution_organ.replace('。','')
part_people_index0 = paras.index(p) + 1
n1 += 1
# 案由段,暂时未提取!!!!!!!
elif '已审理终结' in p:
anyou = p
if '年' and '月' and '日' in p:
if len(p) <= 12:
judge_day = p
# 相关法律条文起始段索引
if '法律条文' in p:
law_index0 = paras.index(p)
if n1 ==2 and (prosecution_organ in p):
part_people_index1 = paras.index(p) - 1
# 提取被告人信息(包括辩护人)
while part_people_index0 <= part_people_index1:
defendants.append(paras[part_people_index0])
part_people_index0 += 1
# 相关法律条文
if law_index0 > 0:
while law_index0 < len(paras):
law_relate.append(paras[law_index0])
law_index0 += 1
if prosecution_organ == '':
print('未提取到公诉机关')
prosecution_organ = 'void'
if defendants == []:
print('未提取到被告人信息')
defendants.append('void')
if law_relate == []:
print('文书未附相关法律条文')
law_relate.append('void')
if judge_day == 'void':
print('提取判决日期失败')
return (papertype, casetype, case_court, case_number, judge_day, defendants, prosecution_organ, law_relate)
# 将信息存入字典(在此设置字典)(暂时不用)
def information_dict(casename, anyou, chengxu, papertype, casetype, fayuan, wenshuleixing, anhao, panjueriqi, beigaoren, gongsuren):
info_dict_temp = {'案件名': casename, '案号':anhao, '案由':anyou, '程序': chengxu, '文书类型':papertype,
'案件类型': casetype, '法院':fayuan, '文书类型':wenshuleixing,
'判决日期': panjueriqi, '被告人':beigaoren, '公诉人':gongsuren
}
return info_dict_temp
# 将信息存入列表(内嵌套字典)
def information_list(casename, anyou, chengxu, papertype, anjianleixing,
fayuan, anhao, panjueriqi, beigaoren, gongsuren, law_ralated):
info_list = [
{'案件名': casename}, {'案由': anyou}, {'程序': chengxu},
{'文书类型': papertype}, {'案件类型': anjianleixing},
{'法院': fayuan}, {'案号':anhao},
{ '判决日期': panjueriqi}, {'被告人': beigaoren}, { '公诉人':gongsuren}, {'相关法律': law_ralated}
]
return info_list
# 字典转json文件(savefile_load是文件夹路径)
def dict2json(info_dict, savefile_load, casename):
b = json.dumps(info_dict, ensure_ascii=False)
savefile_load = savefile_load + '/' + casename + '.json'
f2 = open(savefile_load, 'w')
f2.write(b)
f2.close()
def list_output(file_path):
if os.path.splitext(file_path)[-1] == ".doc": # 判断文件类型是否为doc
print('传入文件格式为doc,将转换为docx格式。')
doc2docx_1(file_path)
file_path = file_path + 'x'
# 设置引用文件路径(因为后期可能要更新相应信息,所以采用此种方式更方便一些)
anyoutxt = 'D:/projects_pycharm/lawdocs2excel/data_access/data/anyou.txt'
leixingtxt = 'D:/projects_pycharm/lawdocs2excel/data_access/data/chengxu.txt'
papertypetxt = 'D:/projects_pycharm/lawdocs2excel/data_access/data/paper_type.txt'
casetypetxt = 'D:/projects_pycharm/lawdocs2excel/data_access/data/case_type.txt'
(casename, anyou, chengxu, papertype0, casetype0) = \
case_name(file_path, anyoutxt, leixingtxt, papertypetxt, casetypetxt)
paras = read_docx(file_path) # 读取文书段落
(papertype, casetype, fayuan, anhao, panjueriqi, beigaoren, gongsuren, law_ralated) \
= case_base_information(paras, papertype0, casetype0, papertypetxt, casetypetxt) # 读取基本信息
info_list = information_list(casename, anyou, chengxu, papertype, casetype, fayuan, anhao,
panjueriqi, beigaoren, gongsuren, law_ralated)
return info_list
if __name__ == '__main__':
path = 'E:/NLP/doc2docx_test/王云清应先勇等强迫交易罪非法占用农用地罪夏君兵王某29强迫交易罪一审刑事判决书.doc'
info_list = list_output(path)
for i in info_list:
print(i)
以下为文件内容举例。
anyou.txt
重大环境污染事故罪
污染环境罪
非法处置进口的固体废物罪
擅自进口固体废物罪
非法捕捞水产品罪
非法猎捕、杀害珍贵、濒危野生动物罪
非法收购、运输、出售珍贵、濒危野生动物、珍贵、濒危野生动物制品罪
非法狩猎罪
非法占用耕地罪
非法占用农用地罪
非法采矿罪
破坏性采矿罪
非法采伐、毁坏国家重点保护植物罪
非法收购、运输、加工、出售国家重点保护植物、国家重点保护植物制品罪
非法采伐、毁坏珍贵树木罪
盗伐林木罪
非法收购、运输盗伐、滥伐的林木罪
非法收购盗伐、滥伐的林木罪
paper_type.txt
判决书
裁定书
调解书
决定书
通知书
令
代码补充:
# 实现单个doc文件转换为docx文件
def doc2docx_1(doc_path):
# 打开word并设置
word = wc.Dispatch("Word.Application")
word.Visible = 0 # 1程序可见,0不可见
print(doc_path)
doc = word.Documents.Open(doc_path)
docx_save_path = doc_path + 'x'
doc.SaveAs(docx_save_path, 12, False, "", True, "", False, False, False, False) # 转换后的文件,12代表转换后为docx文件
doc.Close()
word.Quit()