# -*- coding: utf-8 -*- from lxml.html import parse from urllib.request import urlopen def personal_reporting_to_json(html_file): parsed = parse(urlopen(html_file)) doc = parsed.getroot() # htmlf = open(html_file, 'r', encoding='utf-8') # html = htmlf.read() # doc = etree.HTML(html) # 找到html中有<table></table>的所有table,以列表的形式返回给tables tables = doc.findall('.//table') # print("表的总数=", len(tables)) res_dict = {} gerenxinxi_dict = {} peiouxinxi_dict = {} yuqitouzixinxihuizong_dict = {} ##逾期(透支)信息汇总 beizhuichanghuizong_dict = {} ##被追偿信息汇总 daizhangxinxihuizong_dict = {} ##呆账信息汇总 xdjysx_dict = {} # (二)信贷交易授信及负债信息概要存储的数据 chaxun_list = [] ##非循环贷账户 zhanghu_list_1 = [] zhanghu_list_2 = [] zhanghu_list_3 = [] ##循环额度下分账户 xunhuaneduzhanghu_list_1 = [] xunhuaneduzhanghu_list_2 = [] xunhuaneduzhanghu_list_3 = [] ##循环贷账户 xunhuandaizhanghu_list_1 = [] xunhuandaizhanghu_list_2 = [] xunhuandaizhanghu_list_3 = [] ##贷记卡账户 daijika_list_1 = [] daijika_list_2 = [] daijika_list_3 = [] ##准贷记卡账户 zhundaijika_list_1 = [] zhundaijika_list_2 = [] zhundaijika_list_3 = [] ##被追偿账户 beizhuichang_zhanghu_list_1 = [] beizhuichang_zhanghu_list_2 = [] huankuanjilu_list = [] feixuanhuankuan_list = [] zhufanggongjijin_list = [] ##(四)相关还款责任信息 huankuan_zeren_qiye_list = [] huankuan_zeren_qiye_list_2 = [] table_num = 1 ##非循环贷账户数量 zhanghu_num = 1 zhanghu_num_2 = 1 zhanghu_num_3 = 1 ##循环额度下分账户数量 xunhuaneduzhanghu_num = 1 xunhuaneduzhanghu_num_2 = 1 xunhuaneduzhanghu_num_3 = 1 ##循环贷账户数量 xunhuandaizhanghu_num = 1 xunhuandaizhanghu_num_2 = 1 xunhuandaizhanghu_num_3 = 1 ##贷记卡账户数量 daiji_ka_num = 1 daiji_ka_num_2 = 1 daiji_ka_num_3 = 1 ##准贷记卡账户数量 zhundaiji_ka_num = 1 zhundaiji_ka_num_2 = 1 zhundaiji_ka_num_3 = 1 ##被追偿信息账户数量 beizhuichang_zhanghu_num = 1 beizhuichang_zhanghu_num_2 = 1 ##(四)相关还款责任信息账户数量 huankuan_zeren_qiye_num = 1 feixunhuan_total_zhanghu = 0 daiji_total_num = 0 # tables = tables[0:21] for table in tables: content = table.text_content() trs = table.findall('.//tr') # # print("trs的个数=", len(trs)) # print("tr数据=", trs[0].text_content()) first_row_content = trs[0].text_content() # print("第一行数据=", first_row_content) if "管理机构" in first_row_content and "账户标识" in first_row_content and "开立日期" in first_row_content: feixunhuan_total_zhanghu += 1 if "发卡机构" in first_row_content and "账户标识" in first_row_content and "共享授信额度" in first_row_content: daiji_total_num += 1 # print(feixunhuan_total_zhanghu) # print(daiji_total_num) for table in tables: # 我们要的是第一个table content = table.text_content() trs = table.findall('.//tr') # # print("trs的个数=", len(trs)) # print("tr数据=",trs[0].text_content()) first_row_content = trs[0].text_content() # print("第一行数据=", first_row_content) # 个人征信中的被查询信息的表格数据提取 if "报告编号:" in first_row_content: baogao_dict = {} # print(type(first_row_content)) first_tds = trs[0].findall('.//td') # print("first_tds=",first_tds) for first_td in first_tds: baogao_bianhao = first_td.text_content() key = str(baogao_bianhao).split(":")[0] # print('key===', key) key = str(key).replace("\r", "").replace("\n", "").replace("\t", "") value = str(baogao_bianhao).split(":")[1].replace(" ", " ") baogao_dict[key] = value chaxunxinxi_kes_ths = trs[1].findall('.//th') chaxunxinxi_values_tds = trs[2].findall('.//td') i = 0 for chaxunxinxi_kes_th in chaxunxinxi_kes_ths: # key = chaxunxinxi_kes_th.text_content() key = chaxunxinxi_kes_th.text_content() value = chaxunxinxi_values_tds[i].text_content() baogao_dict[key] = value i += 1 res_dict["被查询信息"] = baogao_dict # print("被查询信息提取=", baogao_dict) if "性别" in first_row_content or "通讯地址" in first_row_content: gerenxinxi_kes_ths = trs[0].findall('.//th') gerenxinxi_values_tds = trs[1].findall('.//td') i = 0 for gerenxinxi_kes_th in gerenxinxi_kes_ths: key = gerenxinxi_kes_th.text_content() value = gerenxinxi_values_tds[i].text_content() gerenxinxi_dict[key] = value i += 1 # print("个人信息=", gerenxinxi_dict) res_dict["个人基本信息"] = gerenxinxi_dict if "姓名" in first_row_content and "证件类型" in first_row_content and "证件号码" in first_row_content: peiouxinxi_kes_ths = trs[0].findall('.//th') peiouxinxi_values_tds = trs[1].findall('.//td') i = 0 for peiouxinxi_kes_th in peiouxinxi_kes_ths: key = peiouxinxi_kes_th.text_content() value = peiouxinxi_values_tds[i].text_content() peiouxinxi_dict[key] = value i += 1 # print("个人信息=", gerenxinxi_dict) res_dict["配偶信息"] = peiouxinxi_dict if "编号" in first_row_content: if "手机号码" in first_row_content: # print(len(trs)) shouji_xinxi_list = [] gerenxinxi_kes_ths = trs[0].findall('.//th') for tr in trs[1:]: shouji_dict = {} gerenxinxi_values_tds = tr.findall('.//td') i = 0 for value_text in gerenxinxi_values_tds: value = value_text.text_content() # print(gerenxinxi_kes_ths[i].text_content()) # print(value) shouji_dict[gerenxinxi_kes_ths[i].text_content()] = value i += 1 shouji_xinxi_list.append(shouji_dict) gerenxinxi_dict["手机信息"] = shouji_xinxi_list res_dict["个人基本信息"] = gerenxinxi_dict # res_dict["个人基本信息"] = shouji_xinxi_list if "居住地址" in first_row_content and "住宅电话" in first_row_content and "居住状况" in first_row_content: juzhuxinxi_kes_ths = trs[0].findall('.//th') juzhu_list = [] for tr in trs[1:]: juzhuxinxi_dict = {} juzhuxinxi_values_tds = tr.findall('.//td') i = 0 for value_text in juzhuxinxi_values_tds: value = value_text.text_content() key = juzhuxinxi_kes_ths[i].text_content() # print(juzhuxinxi_kes_ths[i].text_content()) # print(value) juzhuxinxi_dict[key] = value i += 1 juzhu_list.append(juzhuxinxi_dict) # print("居住信息=", juzhu_list) res_dict["居住信息"] = juzhu_list if "工作单位" in first_row_content and "单位性质" in first_row_content and "单位地址" in first_row_content: zhiyexinxi_kes_ths = trs[0].findall('.//th') zhiye_list = [] for tr in trs[1:]: zhiyexinxi_dict = {} zhiyexinxi_values_tds = tr.findall('.//td') i = 0 for value_text in zhiyexinxi_values_tds: value = value_text.text_content() key = zhiyexinxi_kes_ths[i].text_content() # print(zhiyexinxi_kes_ths[i].text_content()) # print(value) zhiyexinxi_dict[key] = value i += 1 zhiye_list.append(zhiyexinxi_dict) res_dict["职业信息"] = zhiye_list if "职业" in first_row_content and "行业" in first_row_content and "职务" in first_row_content: zhiyexinxi2_kes_ths = trs[0].findall('.//th') zhiye2_list = [] for tr in trs[1:]: zhiyexinxi2_dict = {} zhiyexinxi2_values_tds = tr.findall('.//td') i = 0 for value_text in zhiyexinxi2_values_tds: value = value_text.text_content() key = zhiyexinxi2_kes_ths[i].text_content() # print(zhiyexinxi_kes_ths[i].text_content()) # print(value) zhiyexinxi2_dict[key] = value i += 1 zhiye2_list.append(zhiyexinxi2_dict) res_dict["职业信息-2"] = zhiye2_list if "业务类型" in first_row_content and "账户数" in first_row_content and "首笔业务发放月份" in first_row_content: trs_rows = trs[1:] # i = 0 dkjyxinxi_dict = {} daikuan_list = [] xinyongka_list = [] other_list = [] total_list = [] dict_3 = {} for trs_row in trs_rows: dkjyxinxi2_values_tds = trs_row.findall('.//td') dkjyxinxi2__kes_ths = trs_row.findall('.//th') key = dkjyxinxi2__kes_ths[0].text_content() # print('key=', key) dict_1 = {} i = 0 for value_text in dkjyxinxi2_values_tds: value = value_text.text_content() if i == 0: dict_1["账户数"] = value else: dict_1["首笔业务发放月份"] = value i += 1 # print('dict_1=', dict_1) # print("key = ", key) dict_2 = {} if "贷款" == key: dict_2["个人住房贷款"] = dict_1 daikuan_list.append(dict_2) continue elif "个人商用房贷款(包括商住两用房)" == key: dict_2["个人商用房贷款(包括商住两用房)"] = dict_1 daikuan_list.append(dict_2) continue elif "其他类贷款" == key: dict_2["其他类贷款"] = dict_1 daikuan_list.append(dict_2) continue elif "信用卡" == key: dict_2["贷记卡"] = dict_1 xinyongka_list.append(dict_2) continue elif "准贷记卡" == key: dict_2["准贷记卡"] = dict_1 xinyongka_list.append(dict_2) continue elif "其他" == key: dict_2["--"] = dict_1 other_list.append(dict_2) continue elif "合计" == key: # dict_2["合计"] = dict_1 total_list.append(dict_1) continue dkjyxinxi_dict["贷款"] = daikuan_list dkjyxinxi_dict["信用卡"] = xinyongka_list dkjyxinxi_dict["其他"] = other_list dkjyxinxi_dict["合计"] = total_list # print("dkjyxinxi_dict=", dkjyxinxi_dict) dict_3["业务类型"] = dkjyxinxi_dict res_dict["信贷交易信息提示"] = dict_3 if "被追偿信息汇总" in first_row_content: trs_rows = trs[2:] beizhuichanghuizong_list = [] for trs_row in trs_rows: yuqitouzi_values_tds = trs_row.findall('.//td') yuqitouzi__kes_ths = trs_row.findall('.//th') key = yuqitouzi__kes_ths[0].text_content() dict_1 = {} i = 0 for value_text in yuqitouzi_values_tds: value = value_text.text_content() # print(value) if i == 0: dict_1["账户数"] = value elif i == 1: dict_1["余额"] = value i += 1 dict_1['业务类型'] = key beizhuichanghuizong_list.append(dict_1) # print('dict_1=', dict_1) # print("key = ", key) beizhuichanghuizong_dict['被追偿信息汇总'] = beizhuichanghuizong_list if "呆账信息汇总" in first_row_content: daizhangxinxi_dict = {} daizhangxinxi_kes_ths = trs[1].findall('.//th') daizhangxinxi_values_tds = trs[2].findall('.//td') i = 0 for daizhangxinxi_kes_th in daizhangxinxi_kes_ths: key = daizhangxinxi_kes_th.text_content() value = daizhangxinxi_values_tds[i].text_content() daizhangxinxi_dict[key] = value i += 1 daizhangxinxihuizong_dict["呆账信息汇总"] = daizhangxinxi_dict if "逾期(透支)信息汇总" in first_row_content: trs_rows = trs[2:] yuqitouzi_list = [] dict_3 = {} dict_2 = {} dict_4 = {} for trs_row in trs_rows: yuqitouzi_values_tds = trs_row.findall('.//td') yuqitouzi__kes_ths = trs_row.findall('.//th') key = yuqitouzi__kes_ths[0].text_content() dict_1 = {} i = 0 for value_text in yuqitouzi_values_tds: value = value_text.text_content() # print(value) if i == 0: dict_1["账户数"] = value elif i == 1: dict_1["月份数"] = value elif i == 2: dict_1["单月最高逾期/透支总额"] = value elif i == 3: dict_1["最长逾期/透支月数"] = value i += 1 # print('dict_1=', dict_1) # print("key = ", key) yuqitouzixinxihuizong_dict[key] = dict_1 # yuqitouzi_list.append(dict_2) # dict_3["账户类型"] = yuqitouzi_list # # dict_4["逾期(透支)信息汇总"] = dict_3 daizhangxinxihuizong_dict = Merge_2(daizhangxinxihuizong_dict, yuqitouzixinxihuizong_dict) beizhuichanghuizong_dict = Merge_2(beizhuichanghuizong_dict, daizhangxinxihuizong_dict) res_dict["信贷交易违约信息概要"] = beizhuichanghuizong_dict if "非循环贷账户信息汇总" in first_row_content or "贷记卡账户信息汇总" in first_row_content or "循环额度下分账户信息汇总" in first_row_content or "循环贷账户信息汇总" in first_row_content: # (二)信贷交易授信及负债信息概要 feixunhuan_dict = {} feixunhuan_dict_2 = {} feixunhuan_xinxi_kes_ths = trs[0].findall('.//th') # print(feixunhuan_xinxi_kes_ths[0].text_content()) first_key = feixunhuan_xinxi_kes_ths[0].text_content() feixunhuan_xinxi_kes_ths = trs[1].findall('.//th') feixunhuan_xinxi_values_tds = trs[2].findall('.//td') if "非循环贷账户信息汇总" == first_key: i = 0 for feixunhuan_xinxi_kes_th in feixunhuan_xinxi_kes_ths:
07-22
1472
01-21
1575
08-15
3934
04-25
5119
12-12
3376
09-01
1823
08-29
1726