import re, json import requests import html as ht from lxml import etree from lxml.etree import tostring from lxml.html import parse from urllib.request import urlopen def chr(liebiao): newList = [] for i in range(len(liebiao)): t = liebiao[i].replace('\n', '').replace('\t', '').replace(' ', '').replace('\xa0', '').replace('\u3000', '') if t == '': continue else: newList.append(t) return newList def legalperson_to_json_2(path): htmlf = open(path, 'r', encoding='utf8') html = htmlf.read() tree = etree.HTML(html) repbody = tree.xpath('//*[@class="m-repbody"]') firstPage = tree.xpath('//*[@class="m-repbody firstPage"]') allDict = {} left = chr(tree.xpath('//*[@class="f-floatleft f-txtleft"]/text()')) right = chr(tree.xpath('//*[@class="f-floatright f-txtleft"]/text()')) allDict['baogaobianhao'] = left[0] allDict['chaxunjigou'] = left[1] allDict['baogaoshijian'] = right[0] allDict['chaxunyuanyin'] = right[1] path = str(path).replace("\\", "/") file_path = "file:///" + path parsed = parse(urlopen(file_path)) doc = parsed.getroot() tables = doc.findall('.//table') for table in tables: trs = table.findall('.//tr') # shang_ = table.xpath('./preceding-sibling::div[@class="t1"][1]//text()') if shang_: if '身份标识' in shang_[0]: # print(len(trs)) # print(trs[1]) shefenbiaoshi_dict = {} for index in range(0, len(trs)): shenfenbiaoshi_kes_ths = trs[index].findall('.//th')[0] shenfenbiaoshi_values_tds = trs[index].findall('.//td')[0] key = shenfenbiaoshi_kes_ths.text_content() value = shenfenbiaoshi_values_tds.text_content() shefenbiaoshi_dict[key] = value allDict['sfbs'] = shefenbiaoshi_dict break for i in repbody: each = i.xpath('div/text()') if '信息概要' in each: tableCount = i.xpath('table') wjqxdgy = [] # 未结清信贷及授信信息概要 yjqxdgy = [] # 已结清信贷及授信信息概要 for j in tableCount: # table1 = tostring(j, encoding='utf-8').decode('utf-8') # print(table1) trCount = len(j.xpath('tbody/tr')) # trCount = len(j.xpath('tr'))##夺秒少一个tbody标签 tr1 = chr(j.xpath('tbody/tr[1]//text()')) if '首次有信贷交易的年份' in tr1 and '发生信贷交易的机构数' in tr1: gaiyao1 = {} tr2 = chr(j.xpath('tbody/tr[2]//text()')) gaiyao1['首次有信贷交易的年份'] = tr2[0] gaiyao1['发生信贷交易的机构数'] = tr2[1] gaiyao1['当前有未结清信贷交易的机构数'] = tr2[2] gaiyao1['首次有相关还款责任的年份'] = tr2[3] allDict['gaiyao1'] = gaiyao1 continue if '借贷交易' in tr1 and '担保交易' in tr1: gaiyao2 = {} jdjy = {} dbjy = {} for a in range(2, trCount + 1): trn = chr(j.xpath('tbody/tr[{}]//text()'.format(a))) jdjy[trn[0]] = trn[1] if len(trn) == 4: dbjy[trn[2]] = trn[3] gaiyao2['借贷交易'] = jdjy gaiyao2['担保交易'] = dbjy allDict['gaiyao2'] = gaiyao2 continue if '非信贷交易账户数' in tr1 and '欠税记录条数' in tr1: gaiyao3 = {} tr2 = chr(j.xpath('tbody/tr[2]//text()')) gaiyao3['非信贷交易账户数'] = tr2[0] gaiyao3['欠税记录条数'] = tr2[1] gaiyao3['民事判决记录条数'] = t
01-11
898
04-25
9792