人行征信-企业版信息结构化

虎中申

于 2024-09-10 17:04:04 发布

阅读量822

点赞数 24

分类专栏：个人总结　文章标签：python、大数据、数据结构、算法

本文链接：https://blog.csdn.net/qq_34649040/article/details/142103701

版权

import re, json
import requests
import html as ht
from lxml import etree

from lxml.etree import tostring
from lxml.html import parse

from urllib.request import urlopen


def chr(liebiao):
    """Strip layout whitespace from each string and drop the ones left empty.

    Every newline, tab, ASCII space, non-breaking space (U+00A0) and
    ideographic space (U+3000) is removed from each item -- anywhere in the
    text, not only at the ends. Items that still have content are kept in
    their original order; all other characters (e.g. ``\\r``) are untouched.

    NOTE: this definition shadows the builtin ``chr`` for the rest of the
    module. The name is kept because other code in this file calls it.

    Args:
        liebiao: Iterable of strings, e.g. the result of an XPath
            ``text()`` query.

    Returns:
        A new list holding the cleaned, non-empty strings.
    """
    # A single translation table deletes all five characters in one pass.
    strip_table = str.maketrans('', '', '\n\t \xa0\u3000')
    cleaned = (item.translate(strip_table) for item in liebiao)
    return [text for text in cleaned if text]


def legalperson_to_json_2(path):
    htmlf = open(path, 'r', encoding='utf8')
    html = htmlf.read()
    tree = etree.HTML(html)
    repbody = tree.xpath('//*[@class="m-repbody"]')
    firstPage = tree.xpath('//*[@class="m-repbody firstPage"]')
    allDict = {}
    left = chr(tree.xpath('//*[@class="f-floatleft f-txtleft"]/text()'))
    right = chr(tree.xpath('//*[@class="f-floatright f-txtleft"]/text()'))
    allDict['baogaobianhao'] = left[0]
    allDict['chaxunjigou'] = left[1]
    allDict['baogaoshijian'] = right[0]
    allDict['chaxunyuanyin'] = right[1]

    path = str(path).replace("\\", "/")
    file_path = "file:///" + path

    parsed = parse(urlopen(file_path))

    doc = parsed.getroot()
    tables = doc.findall('.//table')
    for table in tables:

        trs = table.findall('.//tr')  #

        shang_ = table.xpath('./preceding-sibling::div[@class="t1"][1]//text()')
        if shang_:
            if '身份标识' in shang_[0]:
                # print(len(trs))
                # print(trs[1])
                shefenbiaoshi_dict = {}
                for index in range(0, len(trs)):
                    shenfenbiaoshi_kes_ths = trs[index].findall('.//th')[0]
                    shenfenbiaoshi_values_tds = trs[index].findall('.//td')[0]
                    key = shenfenbiaoshi_kes_ths.text_content()
                    value = shenfenbiaoshi_values_tds.text_content()
                    shefenbiaoshi_dict[key] = value
                allDict['sfbs'] = shefenbiaoshi_dict

        break

    for i in repbody:
        each = i.xpath('div/text()')

        if '信息概要' in each:
            tableCount = i.xpath('table')
            wjqxdgy = []  # 未结清信贷及授信信息概要
            yjqxdgy = []  # 已结清信贷及授信信息概要
            for j in tableCount:
                # table1 = tostring(j, encoding='utf-8').decode('utf-8')
                # print(table1)
                trCount = len(j.xpath('tbody/tr'))
                # trCount = len(j.xpath('tr'))##夺秒少一个tbody标签
                tr1 = chr(j.xpath('tbody/tr[1]//text()'))
                if '首次有信贷交易的年份' in tr1 and '发生信贷交易的机构数' in tr1:
                    gaiyao1 = {}
                    tr2 = chr(j.xpath('tbody/tr[2]//text()'))
                    gaiyao1['首次有信贷交易的年份'] = tr2[0]
                    gaiyao1['发生信贷交易的机构数'] = tr2[1]
                    gaiyao1['当前有未结清信贷交易的机构数'] = tr2[2]
                    gaiyao1['首次有相关还款责任的年份'] = tr2[3]
                    allDict['gaiyao1'] = gaiyao1
                    continue
                if '借贷交易' in tr1 and '担保交易' in tr1:
                    gaiyao2 = {}
                    jdjy = {}
                    dbjy = {}
                    for a in range(2, trCount + 1):
                        trn = chr(j.xpath('tbody/tr[{}]//text()'.format(a)))
                        jdjy[trn[0]] = trn[1]
                        if len(trn) == 4:
                            dbjy[trn[2]] = trn[3]
                    gaiyao2['借贷交易'] = jdjy
                    gaiyao2['担保交易'] = dbjy
                    allDict['gaiyao2'] = gaiyao2
                    continue
                if '非信贷交易账户数' in tr1 and '欠税记录条数' in tr1:
                    gaiyao3 = {}
                    tr2 = chr(j.xpath('tbody/tr[2]//text()'))
                    gaiyao3['非信贷交易账户数'] = tr2[0]
                    gaiyao3['欠税记录条数'] = tr2[1]
                    gaiyao3['民事判决记录条数'] = t