这次和大家分享一个主要使用正则表达式匹配文本信息的案例,其中还用到了jieba分词的词性标注技术,以及一些文本切片处理。有兴趣学习的读者可以详细看看具体内容,应该会有所帮助。这1000多行代码都是本人一行一行敲出来的。
import os
import docx
import pickle
import time
import datetime
import re
import sys
import jieba.posseg as psg
import numpy as np
# Append the parent of this file's directory to the module search path.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
def _extract_result(wtjdxx, jdff_tj, jdqk, wbxx):
    """Run every field extractor and collect the results by section.

    Args:
        wtjdxx: Text slice handed to the commission-information extractors.
        jdff_tj: Text slice handed to the method/condition extractors.
        jdqk: Text slice handed to the appraisal-status extractors.
        wbxx: Full text; accepted for interface compatibility, not read here.

    Returns:
        dict with the keys ``"WTJDXX"``, ``"JDFF_TJ"`` and ``"JDQK"``, each
        mapping to the dict filled in by that section's extractors.
    """
    ws_type = 'JYBG_JDS'  # kept from the original; not used below

    # Each extractor mutates the dict it is given and returns nothing.
    commission_info = {}
    for fill in (_get_bmsah, _get_slrq, _get_rwlydw, _get_wtdwjb, _get_sjr,
                 _get_aqzy, _get_sjzy, _get_wtyq, _get_jdlb):
        fill(commission_info, wtjdxx)

    method_info = {}
    for fill in (_get_sysbjs, _get_sydbzgf, _get_sydbzohhc):
        fill(method_info, jdff_tj)

    status_info = {}
    for fill in (_get_ksjdrq, _get_bjrq, _get_bljg, _get_jdslkrq,
                 _get_wtrchrq, _get_qtyyzzrq, _get_dyjdr, _get_dejdr,
                 _get_dsjdr, _get_qtjdr, _get_jybgfs_jdsfs, _get_wpzsl):
        fill(status_info, jdqk)

    return {
        "WTJDXX": commission_info,
        "JDFF_TJ": method_info,
        "JDQK": status_info,
    }
'''{
'WTJDXX': [{
'BMSAH': '',# 部门受案号
'SLRQ': '',#受理日期
'RWLYLB': '',#任务来源类别
'RWLYDW': '',#任务来源单位
'WTDWJB': '',#委托单位级别
'AJMC': '',#案件名称
'AY': '',#案由
'SJR': '',#送检人
'SJRLXDH': '',# 送检人联系电话
'SSJD': '',#诉讼阶段
'JDLB': '',#鉴定类别
'SJZY': '',#涉及专业
'ZTTJBS': '',# 专题统计标识
'AQZY': '',# 案情摘要
'WTYQ': ''# 委托要求
}],
'JDFF_TJ': [{
'SYSBJS': '',#使用设备技术
'SYDBZGF': '',# 使用的标准规范
'SYDBZPHHC': '',# 使用的标准品和耗材
}],
'JDQK':[ {
'KSJDRQ': '',# 开始鉴定日期
'BJRQ': '',# 办结日期
'JDSLKRQ': '',# 鉴定书落款日期
'WTRCHRQ': '',# 委托人撤回日期
'QTYYZZRQ': '',# 其他原因终止日期
'BLJG': '', # 办理结果
'DYJDR': '',# 第一鉴定人
'DRJDR': '',# 第二鉴定人
'DSJDR': '',# 第三鉴定人
'QTCYR': '',# 其他参与人
'JYBGFS': '',#检验报告份数
'JDSFS': '',# 鉴定书份数
'WPZJSL': '',#外聘专家数量
}]
}'''
def extract_ner(slice_result):
    """Extract all structured fields from the sliced report text.

    Args:
        slice_result: Mapping with the keys '委托鉴定信息', '鉴定条件与方法',
            '鉴定情况' and '全文检索'.

    Returns:
        The nested dict built by ``_extract_result``.
    """
    if is_debug:
        # Echo the three section slices before extraction.
        for banner, key in (('【委托鉴定信息】', '委托鉴定信息'),
                            ('【鉴定条件与方法】', '鉴定条件与方法'),
                            ('【鉴定情况】', '鉴定情况')):
            print(banner + slice_result[key])
    return _extract_result(
        slice_result['委托鉴定信息'],
        slice_result['鉴定条件与方法'],
        slice_result['鉴定情况'],
        slice_result['全文检索'],
    )
def data_slice(wbxx):
    """Split a report's text into the sections consumed by the extractors.

    ASCII spaces are removed first. The commission-information section is the
    text of the first header pattern that matches; the method section is the
    text from '检验过程:' up to the '、检验结果' heading. Any section whose
    pattern does not match falls back to the whole (space-stripped) document,
    so every key is always present and always a ``str``.

    Args:
        wbxx: Full report text.

    Returns:
        dict with the keys '委托鉴定信息', '鉴定条件与方法', '鉴定情况' and
        '全文检索'.
    """
    content = wbxx.replace(' ', '')
    slice_result = {
        '委托鉴定信息': content,
        '鉴定条件与方法': content,
        '鉴定情况': content,
        '全文检索': content,
    }

    # Tried in order; the first pattern that matches wins.  No DOTALL flag,
    # so '.' stops at line breaks, as in the original patterns.
    header_patterns = (
        r'(?:检验报告).*?(?=(文本摘要|检验过程))',
        r'(检验报告\s+(\S{1,30}?号)).*?(?=资料摘要)',
        r'(鉴定书\s+(\S{1,30}?号)).*?(?=(文本摘要|资料摘要))',
        # The original pattern was missing the ')' that closes the lookahead.
        r'鉴定书\s+(\S{1,30}?号).*?(?=[一二三四五六七八九十]?、检验:)',
    )
    for pattern in header_patterns:
        found = re.search(pattern, content)
        # group(0) is the whole matched text; findall() would have returned
        # only the capture groups, which are not the section body.
        if found and found.group(0):
            slice_result['委托鉴定信息'] = found.group(0)
            break

    found = re.search(r'(?:检验过程:).*?(?=[一二三四五六七八九十]?、检验结果)', content)
    if found and found.group(0):
        slice_result['鉴定条件与方法'] = found.group(0)

    return slice_result
def main(wbxx):
    """Slice the report text and extract its structured fields.

    Args:
        wbxx: Full report text.

    Returns:
        The nested dict produced by ``extract_ner``.
    """
    # A single monotonic clock replaces the original pair of separate
    # wall-clock reads (whole seconds and microseconds taken at different
    # instants), which could be off by a second across a boundary.
    started = time.perf_counter()
    slice_result = data_slice(wbxx)
    extract_result = extract_ner(slice_result)
    if is_debug:
        elapsed_ms = (time.perf_counter() - started) * 1000
        print('NLP解析结果耗时:%dms' % elapsed_ms)
    return extract_result
def _get_bmsah(WTJDXX,wtjdxx):
try:
WTJDXX['BMSAH'] = ''.join(re.findall('(?<=检验报告).*?(\S{1,30}?号)', wtjdxx, re.S))
if not WTJDXX['BMSAH']:
WTJDXX['BMSAH'] = ''.join(re.findall('(?<=检验鉴定文书).*?(\S{1,30}?号)', wtjdxx, re.S))
except Exception as e:
if is_debug:
print('获取部门受案号异常:' + e.__str__())
WTJDXX['BMSAH']= ''
def _get_slrq(WTJDXX,wtjdxx):
try:
wtjdxx=wtjdxx.replace(' ','')
WTJDXX['SLRQ'] =''.join(re.findall('(?<=委托日期:)'+ ('\d{4}'+'年?.?\s?'+'\d{1,2}'+'月?.?\s?'+'\d{1,2}'+'日?.?\s'),wtjdxx))
if not WTJDXX['SLRQ']:
WTJDXX['SLRQ'] =''.join(re.findall('(?<=委托日期:)'+ ('\d{4}'+'你那?.?\s?'+'\d{1,2}'+'月?.?\s?'+'\d{1,2}'+'日?.?\s'),wtjdxx))
if not WTJDXX['SLRQ']:
WTJDXX['SLRQ'] = ''.join(re.findall('(?<=委托时间:)' + ('\d{4}' + '年?.?\s?' + '\d{1,2}' + '月?.?\s?' + '\d{1,2}' + '日?.?\s'), wtjdxx))
except Exception as e:
if is_debug:
print('获取受理日期异常:' + e.__str__())
WTJDXX['SLRQ']= ''
def _get_rwlydw(WTJDXX,wtjdxx):
try:
WTJDXX['RWLYDW'] = ''.join(re.findall('(?<=委托单位:).*?(?=委托日期)',wtjdxx, re.DOTALL))
if not WTJDXX['RWLYDW']:
WTJDXX['RWLYDW'] = ''.join(re.findall('(?<=委托单位:).*?(?=委托时间)', wtjdxx, re.DOTALL))
except Exception as e:
if is_debug:
print('获取任务来源单位异常:' + e.__str__())
WTJDXX['RWLYDW'] = ''
def _get_wtdwjb(WTJDXX,wtjdxx):
try:
WTDW = re.findall('(?<=委托单位:).*?(?=委托日期)',wtjdxx, re.DOTALL)
if not WTDW:
WTDW = re.findall('(?<=委托单位:).*?(?=委托时间)', wtjdxx, re.DOTALL)
if '县' or '区' in WTDW[0]:
WTJDXX['WTDWJB'] = '县区级院'
if '市' in WTDW[0]:
WTJDXX['WTDWJB'] = '地市级院'
if '省' in WTDW[0]:
WTJDXX['WTDWJB'] = '省级院'
if '最高检' in WTDW[0]:
WTJDXX['WTDWJB'] = '高检院'
except Exception as e:
if is_debug:
print('获取委托单位级别异常:' + e.__str__())
WTJDXX['WTDWJB']= ''
def _get_jdlb(WTJDXX,wtjdxx):
try:
wtjdxx=wtjdxx.replace(' ','').replace(' ','').replace(' ','')
if '补充鉴定' in wtjdxx:
WTJDXX['JDLB'] = '补充鉴定'
if '重新鉴定' in wtjdxx:
WTJDXX['JDLB'] = '重新鉴定'
else:
WTJDXX['JDLB'] = '首次鉴定'
except Exception as e:
if is_debug:
print('获取鉴定类别异常:' + e.__str__())
WTJDXX['JDLB'] = ''
def _get_sjr(WTJDXX,wtjdxx):
try:
wtjdxx = wtjdxx.replace(' ','').replace(' ','').replace(' ','')
WTJDXX['SJR'] = ''.join(re.findall('(?<=送检人:).*?(?=送检材料)', wtjdxx, re.DOTALL))
except Exception as e:
if is_debug:
print('获取送检人异常:' + e.__str__())
WTJDXX['SJR'] = ''
def _get_sjzy(WTJDXX,wtjdxx):
try:
wtjdxx=wtjdxx.replace(' ','').replace(' ','').replace(' ','').replace(' ','')
sjzyxx = re.compile('送检材料:(.*?)检验开始日期:',re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)检验开始时间:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)鉴定开始日期:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)鉴定开始时间:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)开始检验日期:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)开始检验时间:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)开始鉴定日期:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)开始鉴定时间:', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if not sjzynr:
sjzyxx = re.compile('送检材料:(.*?)检验:?', re.S)
sjzynr = sjzyxx.findall(wtjdxx)
if '尸体' in sjzynr[0] or '尸表' in sjzynr[0]:
WTJDXX['SJZY'] = '法医病理'
if '记账凭证' in sjzynr[0] or '凭证' in sjzynr[0] or '收入' in sjzynr[0] or '资金' in sjzynr[0] or '工资' in sjzynr[0] or '明账' in sjzynr[0] or '细账' in sjzynr[0] or '账页' in sjzynr[0] or '数额' in sjzynr[0] or '总额' in sjzynr[0] or '票据' in sjzynr[0] or '金额' in sjzynr[0] or '报销' in sjzynr[0] or '资金' in sjzynr[0] or '赃款' in sjzynr[0] or '贷款' in sjzynr[0] or '补偿款' in sjzynr[0] or '会计' in sjzynr[0]:
WTJDXX['SJZY'] = '司法会计'
if '伤情' in sjzynr[0] or '损伤' in sjzynr[0] or '法医' in sjzynr[0]:
WTJDXX['SJZY'] = '法医临床'
if '数据' in sjzynr[0] or '恢复' in sjzynr[0] or '提取' in sjzynr[0] or 'U盘' in sjzynr[0] or '微信' in sjzynr[0] or 'QQ' in sjzynr[0] or 'qq' in sjzynr[0] or '短信' in sjzynr[0]:
WTJDXX['SJZY'] = '电子数据'
if '精神病' in sjzynr[0]:
WTJDXX['SJZY'] = '法医精神病'
if '毒物' in sjzynr [0]:
WTJDXX['SJZY'] = '法医毒物'
if '塑料' in sjzynr[0] or '橡胶' in sjzynr[0] or '玻璃' in sjzynr[0]:
WTJDXX['SJZY'] = '微量物证'
except Exception as e:
if is_debug:
print('获取涉及专业异常:' + e.__str__())
WTJDXX['SJZY'] = ''
def _get_aqzy(WTJDXX,wtjdxx):
try:
wtjdxx = wtjdxx.replace(' ', '').replace(' ', '').replace(' ', '')
WTJDXX['AQZY'] =''.join(re.findall('(?<=案情摘要).*?(?=文本摘要)',wtjdxx,re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=文本摘要)', wtjdxx, re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=案情摘要).*?(?=[一二三四五六七八九十]?、?资料摘要)', wtjdxx, re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=[一二三四五六七八九十]?、?资料摘要)', wtjdxx, re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=[一二三四五六七八九十]?、?鉴定过程)', wtjdxx, re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=案情摘要).*?(?=[一二三四五六七八九十]?、?鉴定过程)', wtjdxx, re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=摘要案情).*?(?=[一二三四五六七八九十]?、?检验)',wtjdxx,re.DOTALL))
if not WTJDXX['AQZY']:
WTJDXX['AQZY'] = ''.join(re.findall('(?<=案情摘要).*?(?=[一二三四五六七八九十]?、?检验)',wtjdxx,re.DOTALL))
except Exception as e:
if is_debug:
print('获取案情摘要异常:' + e.__str__())
WTJDXX['AQZY'] = ''
def _get_wtyq(WTJDXX,wtjdxx):
try:
wtjdxx=wtjdxx.replace(' ','').replace(' ','').replace(' ','')
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验开始日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验开始时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=鉴定开始时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=鉴定开始日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始鉴定时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始鉴定日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始检验时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=开始检验日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验过程)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=检验)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=委托要求:).*?(?=[一二三四五六七八九十]?、受理日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验开始日期)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验开始时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=鉴定开始时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=鉴定开始日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始鉴定时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始鉴定日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始检验时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=开始检验日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验过程)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=检验)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=鉴定要求:).*?(?=[一二三四五六七八九十]?、受理日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验开始日期)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验开始时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=鉴定开始时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=鉴定开始日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始鉴定时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始鉴定日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始检验时间)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=开始检验日期)',wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验过程)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=检验)', wtjdxx, re.DOTALL))
if not WTJDXX['WTYQ']:
WTJDXX['WTYQ'] = ''.join(re.findall('(?<=检验要求:).*?(?=[一二三四五六七八九十]?、受理日期)',wtjdxx, re.DOTALL))
except Exception as e:
if is_debug:
print('获取委托要求异常:' + e.__str__())
WTJDXX['WTYQ'] = ''
def _get_sysbjs(JDFF_TJ,jdff_tj):
try:
jdff_tj = jdff_tj.replace(' ', '').replace(' ', '').replace(' ', '')
JDFF_TJ['SYSBJS'] = ''.join(re.findall('(?<=1、检验设备).*?(?=2、检验方法)',jdff_tj,re.DOTALL))
if not JDFF_TJ['SYSBJS']:
JDFF_TJ['SYSBJS'] = ''.join(re.findall('(?<=(一)检验设备).*?(?=(二)检验方法)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYSBJS']:
JDFF_TJ['SYSBJS'] = ''.join(re.findall('(?<=检验设备:).*?(?=检验软件)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYSBJS']:
JDFF_TJ['SYSBJS'] =''.join(re.findall('(?<=检验过程).*?(?=[一二三四五六七八九十]?、?检验结果)',jdff_tj,re.DOTALL))
except Exception as e:
if is_debug:
print('获取使用设备技术异常:' + e.__str__())
JDFF_TJ['SYSBJS']=''
def _get_sydbzgf(JDFF_TJ,jdff_tj):
try:
jdff_tj = jdff_tj.replace(' ', '').replace(' ', '').replace(' ', '')
JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[一二三四五六七八九十]?、?检验步骤)',jdff_tj,re.DOTALL))
if not JDFF_TJ['SYDBZGF']:
JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[123456789]?、?检验步骤)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYDBZGF']:
JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[123456789]?、?对送检检材)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYDBZGF']:
JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验方法).*?(?=[一二三四五六七八九十]?、?鉴定意见)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYDBZGF']:
JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验软件:).*?(?=检验过程)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYDBZGF']:
JDFF_TJ['SYDBZGF'] = ''.join(re.findall('(?<=检验过程).*?(?=[一二三四五六七八九十]?、检验结果)', jdff_tj,re.DOTALL))
except Exception as e:
if is_debug:
print('获取使用的标准规范异常:' + e.__str__())
JDFF_TJ['SYDBZGF']=''
def _get_sydbzohhc(JDFF_TJ,jdff_tj):
try:
jdff_tj = jdff_tj.replace(' ', '').replace(' ', '').replace(' ', '')
JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验软件).*?(?=[一二三四五六七八九十]?、?检验过程)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYDBZPHHC']:
JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验软件).*?(?=[一二三四五六七八九十]?、?检验方法)', jdff_tj, re.DOTALL))
if not JDFF_TJ['SYDBZPHHC']:
JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验过程).*?(?=[一二三四五六七八九十]?、?检验结果)', jdff_tj,re.DOTALL))
if not JDFF_TJ['SYDBZPHHC'] :
JDFF_TJ['SYDBZPHHC'] = ''.join(re.findall('(?<=检验步骤).*?(?=[一二三四五六七八九十]?、?检验结果)',jdff_tj,re.DOTALL))
except Exception as e:
if is_debug:
print('获取使用的标准品和耗材异常:' + e.__str__())
JDFF_TJ['SYDBZPHHC'] =''
def _get_ksjdrq(JDQK,jdqk):
try:
jdqk=jdqk.replace(' ','').replace(' ','').replace(' ','')
JDQK['KSJDRQ'] = ''.join(re.findall('(?<=开始鉴定日期:)'+('\d{4}\s?年\d{1,2}\s?月\d{1,2}\s?日'),jdqk))
if not JDQK['KSJDRQ']:
JDQK['KSJDRQ']=''.join(re.findall('(?<=开始检验日期:)'+('\d{4}\s?年\d{1,2}\s?月\d{1,2}\s?日'),jdqk))
if not JDQK['KSJDRQ']:
JDQK['KSJDRQ']=''.join(re.findall('(?<=鉴定开始日期:)'+('\d{4}年\s?\d{1,2}\s?月\d{1,2}\s?日'),jdqk))
if not JDQK['KSJDRQ']:
JDQK['KSJDRQ']=''.join(re.findall('(?<=检验开始日期:)'+('\d{4}\s?年\d{1,2}\s?月\d{1,2}\s?日'),jdqk))
except Exception as e:
if is_debug:
print("获取开始鉴定日期异常:" + e.__str__())
JDQK['KSJDRQ'] = ''
def _get_bjrq(JDQK,jdqk):
try:
jdqk = jdqk.replace(' ', '').replace(' ', '').replace(' ', '')
JDQK['BJRQ']=''.join(re.findall('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日',jdqk)[-1])
except Exception as e:
if is_debug:
print("获取办理日期异常:" + e.__str__())
JDQK['BJRQ'] = ''
def _get_jdslkrq(JDQK,jdqk):
try:
jdqk = jdqk.replace(' ', '').replace(' ', '').replace(' ', '')
JDQK['JDSLKRQ']=''.join(re.findall('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日',jdqk)[-1])
except Exception as e:
if is_debug:
print("获取鉴定书落款日期异常:" + e.__str__())
JDQK['JDSLKRQ'] = ''
def _get_wtrchrq(JDQK,jdqk):
try:
jdqk = jdqk.replace(' ', '').replace(' ', '').replace(' ', '')
JDQK['WTRCHRQ']=''.join(re.findall('(?<=委托人撤回日期:)'+('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日'),jdqk))
except Exception as e:
if is_debug:
print('获取委托人撤回日期异常:' + e.__str__())
JDQK['WTRCHRQ'] = ''
def _get_qtyyzzrq(JDQK,jdqk):
try:
jdqk = jdqk.replace(' ', '').replace(' ', '').replace(' ', '')
JDQK['WTRCHRQ']=''.join(re.findall('(?<=其他原因终止日期:)'+('\d{4}'+'年'+'\d{1,2}'+'月'+'\d{1,2}'+'日'),jdqk))
except Exception as e:
if is_debug:
print('获取其他原因终止日期异常:' + e.__str__())
JDQK['WTRCHRQ'] = ''
def _get_bljg(JDQK,jdqk):
try:
JDQK['BLJG'] = re.findall('(?<=检验结果:).*?(?=附件)', jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结果).*?(?=附件)', jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验意见:).*?(?=附件)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验意见).*?(?=附件)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结论:).*?(?=附件)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结论).*?(?=附件)', jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结果).*?(?=检验人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结果:).*?(?=检验人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结果:).*?(?=鉴定人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结果).*?(?=鉴定人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结论).*?(?=检验人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结论:).*?(?=检验人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结论:).*?(?=鉴定人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验结论).*?(?=鉴定人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验意见:).*?(?=鉴定人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验意见).*?(?=鉴定人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验意见:).*?(?=检验人)',jdqk, re.DOTALL)
if not JDQK['BLJG']:
JDQK['BLJG'] = re.findall('(?<=检验意见).*?(?=检验人】)',jdqk, re.DOTALL)
JDQK['BLJG']=''.join(JDQK['BLJG'])
except Exception as e:
if is_debug:
print('获取办理结果异常:' + e.__str__())
JDQK['BLJG']=''
def _get_dyjdr(JDQK, jdqk):
    """Store the first appraiser's name under ``JDQK['DYJDR']``.

    After removing ASCII spaces, the signature text is taken from after
    '鉴定人:' / '检验人:' up to '授权签字人' or up to a date written in
    digits or in Chinese numerals (first attempt that matches wins). The text
    is segmented with ``psg.cut`` and the words tagged 'nr' are kept; the
    first of them is stored, or '' when there is none or on any exception.
    """
    digit_date = r'\d{4}年\d{1,2}月\d{1,2}日'
    # The last original pattern had a stray '[' inside a character class and
    # allowed '十' in the year; it now uses the same date pattern as the
    # '鉴定人:' attempt.
    cn_date = ('[零0○〇一二三四五六七八九]{4}年'
               '[零0○〇一二三四五六七八九十]{1,2}月'
               '[零0○〇一二三四五六七八九十]{1,2}日')
    attempts = (
        ('鉴定人:', '授权签字人'),
        ('检验人:', '授权签字人'),
        ('鉴定人:', digit_date),
        ('鉴定人:', cn_date),
        ('检验人:', digit_date),
        ('检验人:', cn_date),
    )
    try:
        text = jdqk.replace(' ', '')
        found = []
        for label, stop in attempts:
            found = re.findall('(?<=' + label + ').*?(?=' + stop + ')', text, re.DOTALL)
            if found:
                break
        names = [pair.word for pair in psg.cut(''.join(found)) if pair.flag == 'nr']
        JDQK['DYJDR'] = names[0] if names else ''
    except Exception as e:
        if is_debug:
            print('获取第一鉴定人异常:' + str(e))
        JDQK['DYJDR'] = ''
def _get_dejdr(JDQK, jdqk):
    """Store the second appraiser's name under ``JDQK['DRJDR']``.

    After removing ASCII spaces, the signature text is taken from after
    '鉴定人:' / '检验人:' up to '授权签字人' or up to a date written in
    digits or in Chinese numerals (first attempt that matches wins). The text
    is segmented with ``psg.cut`` and the words tagged 'nr' are kept; the
    second of them is stored, or '' when there are fewer than two or on any
    exception.
    """
    digit_date = r'\d{4}年\d{1,2}月\d{1,2}日'
    cn_date = ('[零0○〇一二三四五六七八九]{4}年'
               '[零0○〇一二三四五六七八九十]{1,2}月'
               '[零0○〇一二三四五六七八九十]{1,2}日')
    attempts = (
        ('鉴定人:', '授权签字人'),
        ('检验人:', '授权签字人'),
        ('鉴定人:', digit_date),
        ('鉴定人:', cn_date),
        ('检验人:', digit_date),
        ('检验人:', cn_date),
    )
    try:
        text = jdqk.replace(' ', '')
        found = []
        for label, stop in attempts:
            found = re.findall('(?<=' + label + ').*?(?=' + stop + ')', text, re.DOTALL)
            if found:
                break
        names = [pair.word for pair in psg.cut(''.join(found)) if pair.flag == 'nr']
        JDQK['DRJDR'] = names[1] if len(names) >= 2 else ''
    except Exception as err:
        if is_debug:
            print('获取第二鉴定人异常:' + str(err))
        JDQK['DRJDR'] = ''
def _get_dsjdr(JDQK, jdqk):
    """Store the third appraiser's name under ``JDQK['DSJDR']``.

    After removing ASCII spaces, the signature text is taken from after
    '鉴定人:' / '检验人:' up to '授权签字人' or up to a date written in
    digits or in Chinese numerals (first attempt that matches wins). The text
    is segmented with ``psg.cut`` and the words tagged 'nr' are kept; the
    third of them is stored, or '' when there are fewer than three or on any
    exception.
    """
    digit_date = r'\d{4}年\d{1,2}月\d{1,2}日'
    cn_date = ('[零0○〇一二三四五六七八九]{4}年'
               '[零0○〇一二三四五六七八九十]{1,2}月'
               '[零0○〇一二三四五六七八九十]{1,2}日')
    attempts = (
        ('鉴定人:', '授权签字人'),
        ('检验人:', '授权签字人'),
        ('鉴定人:', digit_date),
        ('鉴定人:', cn_date),
        ('检验人:', digit_date),
        ('检验人:', cn_date),
    )
    try:
        text = jdqk.replace(' ', '')
        found = []
        for label, stop in attempts:
            found = re.findall('(?<=' + label + ').*?(?=' + stop + ')', text, re.DOTALL)
            if found:
                break
        names = [pair.word for pair in psg.cut(''.join(found)) if pair.flag == 'nr']
        JDQK['DSJDR'] = names[2] if len(names) >= 3 else ''
    except Exception as err:
        if is_debug:
            print('获取第三鉴定人异常:' + str(err))
        JDQK['DSJDR'] = ''
def _get_qtjdr(JDQK, jdqk):
    """Store the remaining participants under ``JDQK['QTCYR']``.

    After removing ASCII spaces, the signature text is taken from after
    '鉴定人:' / '检验人:' up to '授权签字人' or up to a date written in
    digits or in Chinese numerals (first attempt that matches wins). The text
    is segmented with ``psg.cut`` and the words tagged 'nr' are kept. With
    four or more names the field is the list of names from the fourth on;
    otherwise, and on any exception, it is the empty string.
    """
    digit_date = r'\d{4}年\d{1,2}月\d{1,2}日'
    cn_date = ('[零0○〇一二三四五六七八九]{4}年'
               '[零0○〇一二三四五六七八九十]{1,2}月'
               '[零0○〇一二三四五六七八九十]{1,2}日')
    attempts = (
        ('鉴定人:', '授权签字人'),
        ('检验人:', '授权签字人'),
        ('鉴定人:', digit_date),
        ('鉴定人:', cn_date),
        ('检验人:', digit_date),
        ('检验人:', cn_date),
    )
    try:
        text = jdqk.replace(' ', '')
        found = []
        for label, stop in attempts:
            found = re.findall('(?<=' + label + ').*?(?=' + stop + ')', text, re.DOTALL)
            if found:
                break
        names = [pair.word for pair in psg.cut(''.join(found)) if pair.flag == 'nr']
        # Note the mixed result type: a list here, '' in every other case.
        JDQK['QTCYR'] = names[3:] if len(names) >= 4 else ''
    except Exception as err:
        if is_debug:
            print('获取其他参与人异常:' + str(err))
        JDQK['QTCYR'] = ''
def _get_jybgfs_jdsfs(JDQK,jdqk):
try:
JYBG = re.findall('(?:检验报告).*?(?=委托单位)',jdqk,re.S)
if '检验报告' in JYBG[0]:
JDQK['JYBGFS'] = 1
JDQK['JDSFS'] = 0
if '鉴定书' in JYBG[0]:
JDQK['JYBGFS'] = 0
JDQK['JDSFS'] = 1
except Exception as e:
if is_debug:
print('获取检验报告和鉴定书异常:' + e.__str__())
JDQK['JYBGFS']=''
JDQK['JDSFS']=''
def _get_wpzsl(JDQK,jdqk):
try:
WPZJ = re.findall('(?=外聘专家数量)',jdqk)
if WPZJ != []:
pass
else:
JDQK['WPZJSL'] = 0
except Exception as e:
if is_debug:
print('获取外聘专家数量异常:' + e.__str__())
JDQK['WPZJSL']=''
def _read_text_with_fallback(path):
    """Read a text file, trying utf-8, gb2312, gbk and gb18030 in that order."""
    last_error = None
    for encoding in ('utf-8', 'gb2312', 'gbk', 'gb18030'):
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as err:
            last_error = err
    raise last_error


def readoc(path, client=None):
    """Return the text of a document, chosen by file extension.

    Args:
        path: Path of the file to read.
        client: The ``win32com.client`` module, needed only for the formats
            opened through Word ('.doc', '.htm', '.rtf', '.wps').

    Returns:
        The document text. For the Word-based formats, 'word读取出错' is
        returned when Word cannot be used or fails to read the file; for an
        unrecognised extension, '未识别格式' is returned.

    Raises:
        UnicodeDecodeError: if a '.txt' file decodes under none of the
            attempted encodings.
    """
    suffix = os.path.splitext(path)[-1]
    if suffix == '.txt':
        return _read_text_with_fallback(path)

    if suffix == '.docx':
        document = docx.Document(path)
        # One line per paragraph, each terminated by '\n'.
        return ''.join(paragraph.text + '\n' for paragraph in document.paragraphs)

    if suffix in ['.doc', '.htm', '.rtf', '.wps']:
        if client is None:
            # No COM client available (the original crashed on None.Dispatch).
            return 'word读取出错'
        word = client.Dispatch('Word.Application')
        word.visible = 0
        word.displayalerts = 0
        doc = None
        try:
            doc = word.Documents.Open(path)
            return ''.join(
                paragraph.Range.Text.replace('\r', '\n')
                for paragraph in doc.Paragraphs
            )
        except Exception:
            return 'word读取出错'
        finally:
            if doc is not None:
                try:
                    doc.Close()
                except Exception:
                    pass
            # The original returned on error without quitting Word, leaving
            # the application process running.
            word.Quit()

    return '未识别格式'
def _load_document_text(raw_path):
    """Return the text for *raw_path*, or None if it is not an existing file."""
    path = raw_path.replace('"', '')
    if not os.path.isfile(path):
        return None
    client = None
    # Only the formats that readoc opens through Word need the COM client.
    if sys.platform != 'linux' and os.path.splitext(path)[-1] not in ('.txt', '.docx'):
        from win32com import client
    return readoc(os.path.normpath(path), client)


def read_sys(func):
    """Command-line driver: load the input, run *func* on it, print the result.

    The input is ``sys.argv[-1]`` when exactly one argument is given,
    otherwise a built-in sample path. A directory is processed file by file;
    a single file is read with ``readoc``; anything that is not an existing
    file is passed to *func* as the text itself.

    Also (re)sets the module-level ``is_debug`` flag to False.

    Args:
        func: Callable taking the document text and returning the result to
            print (``main`` in this file).
    """
    global is_debug
    is_debug = False

    if len(sys.argv) == 2:
        f_path = sys.argv[-1]
    else:
        f_path = r'D:\wb文案测试\检察技术NLP\技术检察检验报告\胶检技鉴受[2019]37028100002号_检验报告_952B014000A60038E053C0A8014875AE.doc'
    if is_debug:
        print(f_path)

    # The original hard-coded this flag to False, so its directory loop could
    # never run; it is now derived from the path.
    isfiledir = os.path.isdir(f_path)

    if isfiledir:
        processed = 0
        for index, entry in enumerate(os.scandir(f_path)):
            # Debug runs skip the first entry and stop after 200 documents,
            # as the original loop did.
            if is_debug and index < 1:
                continue
            if is_debug:
                print(entry.path)
            content = _load_document_text(entry.path)
            if content is None:
                print('未识别文件格式!!!')
                continue
            print(func(content))
            processed += 1
            if is_debug and processed == 200:
                break
    else:
        content = _load_document_text(f_path)
        if content is None:
            # Not an existing file: treat the argument as the text itself.
            # The original left `content` unbound here and crashed.
            content = f_path
        print(func(content))
# Script entry point: run the extraction pipeline through the command-line driver.
if __name__ == '__main__':
    read_sys(main)
获取姓名时,使用了jieba分词和词性标注。
多文本处理还有很多内容,这里无法一一说明,具体可以参考代码。欢迎各位大神指导,本人会随时更新和修改。