人行征信-个人版信息结构化

# -*- coding: utf-8 -*-
from lxml.html import parse

from urllib.request import urlopen


def personal_reporting_to_json(html_file):
    parsed = parse(urlopen(html_file))
    doc = parsed.getroot()

    # htmlf = open(html_file, 'r', encoding='utf-8')
    # html = htmlf.read()
    # doc = etree.HTML(html)

    # 找到html中有<table></table>的所有table,以列表的形式返回给tables
    tables = doc.findall('.//table')
    # print("表的总数=", len(tables))
    res_dict = {}
    gerenxinxi_dict = {}
    peiouxinxi_dict = {}
    yuqitouzixinxihuizong_dict = {}  ##逾期(透支)信息汇总
    beizhuichanghuizong_dict = {}  ##被追偿信息汇总
    daizhangxinxihuizong_dict = {}  ##呆账信息汇总
    xdjysx_dict = {}  # (二)信贷交易授信及负债信息概要存储的数据
    chaxun_list = []
    ##非循环贷账户
    zhanghu_list_1 = []
    zhanghu_list_2 = []
    zhanghu_list_3 = []
    ##循环额度下分账户
    xunhuaneduzhanghu_list_1 = []
    xunhuaneduzhanghu_list_2 = []
    xunhuaneduzhanghu_list_3 = []
    ##循环贷账户
    xunhuandaizhanghu_list_1 = []
    xunhuandaizhanghu_list_2 = []
    xunhuandaizhanghu_list_3 = []
    ##贷记卡账户
    daijika_list_1 = []
    daijika_list_2 = []
    daijika_list_3 = []

    ##准贷记卡账户
    zhundaijika_list_1 = []
    zhundaijika_list_2 = []
    zhundaijika_list_3 = []
    ##被追偿账户
    beizhuichang_zhanghu_list_1 = []
    beizhuichang_zhanghu_list_2 = []

    huankuanjilu_list = []
    feixuanhuankuan_list = []
    zhufanggongjijin_list = []
    ##(四)相关还款责任信息
    huankuan_zeren_qiye_list = []
    huankuan_zeren_qiye_list_2 = []
    table_num = 1
    ##非循环贷账户数量
    zhanghu_num = 1
    zhanghu_num_2 = 1
    zhanghu_num_3 = 1
    ##循环额度下分账户数量
    xunhuaneduzhanghu_num = 1
    xunhuaneduzhanghu_num_2 = 1
    xunhuaneduzhanghu_num_3 = 1
    ##循环贷账户数量
    xunhuandaizhanghu_num = 1
    xunhuandaizhanghu_num_2 = 1
    xunhuandaizhanghu_num_3 = 1
    ##贷记卡账户数量
    daiji_ka_num = 1
    daiji_ka_num_2 = 1
    daiji_ka_num_3 = 1

    ##准贷记卡账户数量
    zhundaiji_ka_num = 1
    zhundaiji_ka_num_2 = 1
    zhundaiji_ka_num_3 = 1
    ##被追偿信息账户数量
    beizhuichang_zhanghu_num = 1
    beizhuichang_zhanghu_num_2 = 1
    ##(四)相关还款责任信息账户数量
    huankuan_zeren_qiye_num = 1

    feixunhuan_total_zhanghu = 0
    daiji_total_num = 0

    # tables = tables[0:21]

    for table in tables:
        content = table.text_content()
        trs = table.findall('.//tr')  #
        # print("trs的个数=", len(trs))
        # print("tr数据=", trs[0].text_content())
        first_row_content = trs[0].text_content()
        # print("第一行数据=", first_row_content)
        if "管理机构" in first_row_content and "账户标识" in first_row_content and "开立日期" in first_row_content:
            feixunhuan_total_zhanghu += 1
        if "发卡机构" in first_row_content and "账户标识" in first_row_content and "共享授信额度" in first_row_content:
            daiji_total_num += 1
    # print(feixunhuan_total_zhanghu)
    # print(daiji_total_num)

    for table in tables:
        # 我们要的是第一个table
        content = table.text_content()
        trs = table.findall('.//tr')  #
        # print("trs的个数=", len(trs))
        # print("tr数据=",trs[0].text_content())
        first_row_content = trs[0].text_content()
        # print("第一行数据=", first_row_content)
        # 个人征信中的被查询信息的表格数据提取
        if "报告编号:" in first_row_content:
            baogao_dict = {}
            # print(type(first_row_content))
            first_tds = trs[0].findall('.//td')
            # print("first_tds=",first_tds)
            for first_td in first_tds:
                baogao_bianhao = first_td.text_content()
                key = str(baogao_bianhao).split(":")[0]
                # print('key===', key)
                key = str(key).replace("\r", "").replace("\n", "").replace("\t", "")
                value = str(baogao_bianhao).split(":")[1].replace(" ", " ")
                baogao_dict[key] = value

            chaxunxinxi_kes_ths = trs[1].findall('.//th')
            chaxunxinxi_values_tds = trs[2].findall('.//td')
            i = 0
            for chaxunxinxi_kes_th in chaxunxinxi_kes_ths:
                # key = chaxunxinxi_kes_th.text_content()

                key = chaxunxinxi_kes_th.text_content()

                value = chaxunxinxi_values_tds[i].text_content()
                baogao_dict[key] = value
                i += 1
            res_dict["被查询信息"] = baogao_dict
            # print("被查询信息提取=", baogao_dict)

        if "性别" in first_row_content or "通讯地址" in first_row_content:

            gerenxinxi_kes_ths = trs[0].findall('.//th')
            gerenxinxi_values_tds = trs[1].findall('.//td')
            i = 0
            for gerenxinxi_kes_th in gerenxinxi_kes_ths:
                key = gerenxinxi_kes_th.text_content()
                value = gerenxinxi_values_tds[i].text_content()
                gerenxinxi_dict[key] = value
                i += 1
            # print("个人信息=", gerenxinxi_dict)
            res_dict["个人基本信息"] = gerenxinxi_dict

        if "姓名" in first_row_content and "证件类型" in first_row_content and "证件号码" in first_row_content:

            peiouxinxi_kes_ths = trs[0].findall('.//th')
            peiouxinxi_values_tds = trs[1].findall('.//td')
            i = 0
            for peiouxinxi_kes_th in peiouxinxi_kes_ths:
                key = peiouxinxi_kes_th.text_content()
                value = peiouxinxi_values_tds[i].text_content()
                peiouxinxi_dict[key] = value
                i += 1
            # print("个人信息=", gerenxinxi_dict)
            res_dict["配偶信息"] = peiouxinxi_dict

        if "编号" in first_row_content:

            if "手机号码" in first_row_content:
                # print(len(trs))

                shouji_xinxi_list = []
                gerenxinxi_kes_ths = trs[0].findall('.//th')
                for tr in trs[1:]:
                    shouji_dict = {}
                    gerenxinxi_values_tds = tr.findall('.//td')
                    i = 0
                    for value_text in gerenxinxi_values_tds:
                        value = value_text.text_content()
                        # print(gerenxinxi_kes_ths[i].text_content())
                        # print(value)
                        shouji_dict[gerenxinxi_kes_ths[i].text_content()] = value
                        i += 1
                    shouji_xinxi_list.append(shouji_dict)
                gerenxinxi_dict["手机信息"] = shouji_xinxi_list
                res_dict["个人基本信息"] = gerenxinxi_dict
                # res_dict["个人基本信息"] = shouji_xinxi_list
            if "居住地址" in first_row_content and "住宅电话" in first_row_content and "居住状况" in first_row_content:
                juzhuxinxi_kes_ths = trs[0].findall('.//th')
                juzhu_list = []
                for tr in trs[1:]:
                    juzhuxinxi_dict = {}
                    juzhuxinxi_values_tds = tr.findall('.//td')
                    i = 0
                    for value_text in juzhuxinxi_values_tds:
                        value = value_text.text_content()
                        key = juzhuxinxi_kes_ths[i].text_content()
                        # print(juzhuxinxi_kes_ths[i].text_content())
                        # print(value)
                        juzhuxinxi_dict[key] = value
                        i += 1
                    juzhu_list.append(juzhuxinxi_dict)
                # print("居住信息=", juzhu_list)
                res_dict["居住信息"] = juzhu_list
            if "工作单位" in first_row_content and "单位性质" in first_row_content and "单位地址" in first_row_content:
                zhiyexinxi_kes_ths = trs[0].findall('.//th')
                zhiye_list = []
                for tr in trs[1:]:
                    zhiyexinxi_dict = {}
                    zhiyexinxi_values_tds = tr.findall('.//td')
                    i = 0
                    for value_text in zhiyexinxi_values_tds:
                        value = value_text.text_content()
                        key = zhiyexinxi_kes_ths[i].text_content()
                        # print(zhiyexinxi_kes_ths[i].text_content())
                        # print(value)
                        zhiyexinxi_dict[key] = value
                        i += 1
                    zhiye_list.append(zhiyexinxi_dict)
                res_dict["职业信息"] = zhiye_list
            if "职业" in first_row_content and "行业" in first_row_content and "职务" in first_row_content:
                zhiyexinxi2_kes_ths = trs[0].findall('.//th')
                zhiye2_list = []
                for tr in trs[1:]:
                    zhiyexinxi2_dict = {}
                    zhiyexinxi2_values_tds = tr.findall('.//td')
                    i = 0
                    for value_text in zhiyexinxi2_values_tds:
                        value = value_text.text_content()
                        key = zhiyexinxi2_kes_ths[i].text_content()
                        # print(zhiyexinxi_kes_ths[i].text_content())
                        # print(value)
                        zhiyexinxi2_dict[key] = value
                        i += 1
                    zhiye2_list.append(zhiyexinxi2_dict)
                res_dict["职业信息-2"] = zhiye2_list
        if "业务类型" in first_row_content and "账户数" in first_row_content and "首笔业务发放月份" in first_row_content:
            trs_rows = trs[1:]
            # i = 0
            dkjyxinxi_dict = {}
            daikuan_list = []
            xinyongka_list = []
            other_list = []
            total_list = []
            dict_3 = {}
            for trs_row in trs_rows:
                dkjyxinxi2_values_tds = trs_row.findall('.//td')
                dkjyxinxi2__kes_ths = trs_row.findall('.//th')
                key = dkjyxinxi2__kes_ths[0].text_content()
                # print('key=', key)

                dict_1 = {}
                i = 0
                for value_text in dkjyxinxi2_values_tds:
                    value = value_text.text_content()
                    if i == 0:
                        dict_1["账户数"] = value
                    else:
                        dict_1["首笔业务发放月份"] = value
                    i += 1
                # print('dict_1=', dict_1)
                # print("key = ", key)
                dict_2 = {}
                if "贷款" == key:
                    dict_2["个人住房贷款"] = dict_1
                    daikuan_list.append(dict_2)
                    continue
                elif "个人商用房贷款(包括商住两用房)" == key:
                    dict_2["个人商用房贷款(包括商住两用房)"] = dict_1
                    daikuan_list.append(dict_2)
                    continue
                elif "其他类贷款" == key:
                    dict_2["其他类贷款"] = dict_1
                    daikuan_list.append(dict_2)
                    continue
                elif "信用卡" == key:
                    dict_2["贷记卡"] = dict_1
                    xinyongka_list.append(dict_2)
                    continue
                elif "准贷记卡" == key:
                    dict_2["准贷记卡"] = dict_1
                    xinyongka_list.append(dict_2)
                    continue
                elif "其他" == key:
                    dict_2["--"] = dict_1
                    other_list.append(dict_2)
                    continue
                elif "合计" == key:
                    # dict_2["合计"] = dict_1
                    total_list.append(dict_1)
                    continue
            dkjyxinxi_dict["贷款"] = daikuan_list
            dkjyxinxi_dict["信用卡"] = xinyongka_list
            dkjyxinxi_dict["其他"] = other_list
            dkjyxinxi_dict["合计"] = total_list
            # print("dkjyxinxi_dict=", dkjyxinxi_dict)
            dict_3["业务类型"] = dkjyxinxi_dict
            res_dict["信贷交易信息提示"] = dict_3

        if "被追偿信息汇总" in first_row_content:
            trs_rows = trs[2:]
            beizhuichanghuizong_list = []
            for trs_row in trs_rows:

                yuqitouzi_values_tds = trs_row.findall('.//td')
                yuqitouzi__kes_ths = trs_row.findall('.//th')
                key = yuqitouzi__kes_ths[0].text_content()
                dict_1 = {}
                i = 0
                for value_text in yuqitouzi_values_tds:
                    value = value_text.text_content()
                    # print(value)
                    if i == 0:
                        dict_1["账户数"] = value
                    elif i == 1:
                        dict_1["余额"] = value

                    i += 1
                dict_1['业务类型'] = key
                beizhuichanghuizong_list.append(dict_1)
                # print('dict_1=', dict_1)
                # print("key = ", key)
            beizhuichanghuizong_dict['被追偿信息汇总'] = beizhuichanghuizong_list

        if "呆账信息汇总" in first_row_content:
            daizhangxinxi_dict = {}
            daizhangxinxi_kes_ths = trs[1].findall('.//th')
            daizhangxinxi_values_tds = trs[2].findall('.//td')
            i = 0
            for daizhangxinxi_kes_th in daizhangxinxi_kes_ths:
                key = daizhangxinxi_kes_th.text_content()
                value = daizhangxinxi_values_tds[i].text_content()
                daizhangxinxi_dict[key] = value
                i += 1

            daizhangxinxihuizong_dict["呆账信息汇总"] = daizhangxinxi_dict

        if "逾期(透支)信息汇总" in first_row_content:
            trs_rows = trs[2:]
            yuqitouzi_list = []
            dict_3 = {}
            dict_2 = {}
            dict_4 = {}
            for trs_row in trs_rows:

                yuqitouzi_values_tds = trs_row.findall('.//td')
                yuqitouzi__kes_ths = trs_row.findall('.//th')
                key = yuqitouzi__kes_ths[0].text_content()
                dict_1 = {}
                i = 0
                for value_text in yuqitouzi_values_tds:
                    value = value_text.text_content()
                    # print(value)
                    if i == 0:
                        dict_1["账户数"] = value
                    elif i == 1:
                        dict_1["月份数"] = value
                    elif i == 2:
                        dict_1["单月最高逾期/透支总额"] = value
                    elif i == 3:
                        dict_1["最长逾期/透支月数"] = value
                    i += 1
                # print('dict_1=', dict_1)
                # print("key = ", key)
                yuqitouzixinxihuizong_dict[key] = dict_1
            # yuqitouzi_list.append(dict_2)
            # dict_3["账户类型"] = yuqitouzi_list
            #
            # dict_4["逾期(透支)信息汇总"] = dict_3
            daizhangxinxihuizong_dict = Merge_2(daizhangxinxihuizong_dict, yuqitouzixinxihuizong_dict)
            beizhuichanghuizong_dict = Merge_2(beizhuichanghuizong_dict, daizhangxinxihuizong_dict)
            res_dict["信贷交易违约信息概要"] = beizhuichanghuizong_dict

        if "非循环贷账户信息汇总" in first_row_content or "贷记卡账户信息汇总" in first_row_content or "循环额度下分账户信息汇总" in first_row_content or "循环贷账户信息汇总" in first_row_content:
            # (二)信贷交易授信及负债信息概要
            feixunhuan_dict = {}
            feixunhuan_dict_2 = {}

            feixunhuan_xinxi_kes_ths = trs[0].findall('.//th')
            # print(feixunhuan_xinxi_kes_ths[0].text_content())
            first_key = feixunhuan_xinxi_kes_ths[0].text_content()

            feixunhuan_xinxi_kes_ths = trs[1].findall('.//th')
            feixunhuan_xinxi_values_tds = trs[2].findall('.//td')
            if "非循环贷账户信息汇总" == first_key:
                i = 0
                for feixunhuan_xinxi_kes_th in feixunhuan_xinxi_kes_ths:
            
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值