#coding=gbk
import csv
import sys
import time

import numpy as np
import pandas as pd
import requests
from lxml import etree
col = "ABCDEFGHIJKLM"  # spreadsheet-style column letters — appears unused in this file
row = 1                # appears unused in this file
# Module-level accumulators: one parallel array per scraped CSV column.
# getData() appends one entry per qualification row of a detail page;
# the numbering mirrors the <td> cell positions on that page (2 and 9
# are skipped there).  run() re-creates them after each listing page so
# every page gets its own CSV.  NOTE(review): np.append copies the whole
# array on every call — plain lists would be O(1) per append; kept as
# arrays because getData()/run() depend on this type.
nameArray1 = np.array([])
nameArray3 = np.array([])
nameArray4 = np.array([])
nameArray5 = np.array([])
nameArray6 = np.array([])
nameArray7 = np.array([])
nameArray8 = np.array([])
nameArray10 = np.array([])
nameArray11 = np.array([])
nameArray12 = np.array([])
nameArray13 = np.array([])
nameArray14 = np.array([])
nameArray15 = np.array([])
nameArray16 = np.array([])
nameArray17 = np.array([])
nameArrayEnd1 = np.array([])
nameArrayEnd3 = np.array([])
def _cell_text(cells, idx):
    """Return cells[idx].text with spaces removed, or '' when the cell
    is missing or has no text.

    The original code called ``res[idx].text.replace(' ', '')`` inline,
    which raised AttributeError on empty cells and IndexError past the
    end of the cell list, crashing the whole crawl.
    """
    try:
        text = cells[idx].text
    except IndexError:
        return ''
    return '' if text is None else text.replace(' ', '')


def getData(idUrl):
    """Fetch one enterprise detail page, append its fields to the
    module-level accumulator arrays, and rewrite 'shuju<yema>.csv' with
    everything accumulated so far for the current listing page.

    idUrl: full URL of the enterprise detail page.
    Returns early (scraping nothing) when the page has fewer than 15
    <td> cells, i.e. it is not a valid detail page.
    """
    global nameArray1, nameArray3, nameArray4, nameArray5, nameArray6
    global nameArray7, nameArray8, nameArray10, nameArray11, nameArray12
    global nameArray13, nameArray14, nameArray15, nameArray16, nameArray17
    global nameArrayEnd1, nameArrayEnd3
    global yema
    # Retry until the page comes back with HTTP 200.  Sleep briefly so a
    # dead server does not spin the CPU (the original retried hot and
    # used a bare except that also swallowed KeyboardInterrupt).
    while True:
        try:
            ht = requests.get(url=idUrl, timeout=(5, 15))
        except requests.RequestException:
            print("wait " + idUrl)
            time.sleep(1)
            continue
        if ht.status_code == 200:
            break
        print(ht)
        print("wait " + idUrl)
        time.sleep(1)
    html = etree.HTML(ht.text)
    cells = html.xpath('//td')
    try:
        faren = cells[14].text  # legal-representative cell
    except IndexError:
        return  # not a detail page
    faren = '' if faren is None else faren.replace(' ', '')
    res = html.xpath('//td[@align = "center"]')
    # Locate the 1-based position of the '安全许可信息' (safety-licence
    # info) section header; the qualification table sits before it.
    end = 2000  # sentinel: header not found — presumably larger than any real page
    i = 0
    for a in res:
        i += 1
        if a.text is not None and a.text.replace(' ', '') == '安全许可信息':
            end = i
        if i > end + 3:
            break
    i = 7  # per-row fields start at res[8]
    for m in range(int(end / 10)):  # roughly one iteration per qualification row
        if m == 0:
            # Company-level fields are recorded once, on the first row.
            print(_cell_text(res, 1))
            nameArray1 = np.append(nameArray1, _cell_text(res, 1))
            nameArray3 = np.append(nameArray3, _cell_text(res, 3))
            nameArray4 = np.append(nameArray4, _cell_text(res, 4))
            nameArray5 = np.append(nameArray5, _cell_text(res, 5))
            nameArray6 = np.append(nameArray6, _cell_text(res, 6))
            nameArray7 = np.append(nameArray7, faren)
            nameArrayEnd1 = np.append(nameArrayEnd1, _cell_text(res, end + 1))
            nameArrayEnd3 = np.append(nameArrayEnd3, _cell_text(res, end + 3))
        else:
            # Blank placeholders keep all parallel arrays equal-length.
            nameArray1 = np.append(nameArray1, '')
            nameArray3 = np.append(nameArray3, '')
            nameArray4 = np.append(nameArray4, '')
            nameArray5 = np.append(nameArray5, '')
            nameArray6 = np.append(nameArray6, '')
            nameArray7 = np.append(nameArray7, '')
            nameArrayEnd1 = np.append(nameArrayEnd1, '')
            nameArrayEnd3 = np.append(nameArrayEnd3, '')
        # Per-row fields; the offsets (+1, +2, then +1 each) follow the
        # page's cell layout — one cell between field 8 and field 10 is
        # skipped, matching the original index arithmetic exactly.
        i += 1
        nameArray8 = np.append(nameArray8, _cell_text(res, i))
        i += 2
        nameArray10 = np.append(nameArray10, _cell_text(res, i))
        i += 1
        nameArray11 = np.append(nameArray11, _cell_text(res, i))
        i += 1
        nameArray12 = np.append(nameArray12, _cell_text(res, i))
        i += 1
        nameArray13 = np.append(nameArray13, _cell_text(res, i))
        i += 1
        nameArray14 = np.append(nameArray14, _cell_text(res, i))
        i += 1
        nameArray15 = np.append(nameArray15, _cell_text(res, i))
        i += 1
        nameArray16 = np.append(nameArray16, _cell_text(res, i))
        i += 1
        nameArray17 = np.append(nameArray17, _cell_text(res, i))
    # Rewrite this page's CSV with everything accumulated so far.
    writefile = "shuju" + str(yema) + ".csv"
    data = np.transpose([nameArray1, nameArray3, nameArray4, nameArray5,
                         nameArray6, nameArray7, nameArray8, nameArray10,
                         nameArray11, nameArray12, nameArray13, nameArray14,
                         nameArray15, nameArray16, nameArray17,
                         nameArrayEnd1, nameArrayEnd3])
    ser2 = pd.DataFrame(data, columns=['企业名称', '营业证', '地址', '注册资本',
                                       '组织机构代码号', '法人', '序号', '编号',
                                       '日期', '状态', '资质序列', '类别', '等级',
                                       '发证单位', '核准日期', '证书', '有效期'])
    ser2.to_csv(writefile, encoding="utf_8_sig")
import sys  # mid-file import; used by run() below — TODO(review): hoist to top of file
yema = 0  # current listing-page number; getData() reads it to name the output CSV
def run():
    """Crawl listing pages sys.argv[1]..sys.argv[2] (inclusive).

    For each listing page: fetch it, extract every enterprise id from
    the result rows' onclick handlers, scrape each detail page via
    getData(), then reset the accumulator arrays so the next page
    starts a fresh 'shuju<page>.csv'.
    """
    global yema
    global nameArray1, nameArray3, nameArray4, nameArray5, nameArray6
    global nameArray7, nameArray8, nameArray10, nameArray11, nameArray12
    global nameArray13, nameArray14, nameArray15, nameArray16, nameArray17
    global nameArrayEnd1, nameArrayEnd3
    start = int(sys.argv[1])
    end = int(sys.argv[2])
    for page in range(start, end + 1):
        yema = page
        url = "http://124.115.170.171:7001/PDR/network/informationSearch/informationSearchList?&pageNumber=" + str(yema)
        # Retry until the listing page downloads.  The original tried to
        # retry by decrementing the for-loop index (`i = i - 1`), which
        # is a no-op in Python, so a timed-out page was silently skipped.
        while True:
            try:
                res = requests.get(url=url, timeout=(5, 15))
                break
            except requests.RequestException:
                print("waiting..." + url)
        print(res.status_code)
        html = etree.HTML(res.text)
        # Each result row's <a onclick="...(..., 'ENID', ...)"> carries
        # the enterprise id between the first and second commas; the
        # slice offsets strip the surrounding quote characters.
        for onclick in html.xpath('//td/p/a/@onclick'):
            s = onclick.find(',')
            aid = onclick[s + 2:]
            e = aid.find(',')
            aid = aid[:e - 2]
            idUrl = 'http://124.115.170.171:7001/PDR/network/Enterprise/Informations/view?enid=' + aid
            getData(idUrl)
        # Fresh accumulators for the next page's CSV.
        nameArray1 = np.array([])
        nameArray3 = np.array([])
        nameArray4 = np.array([])
        nameArray5 = np.array([])
        nameArray6 = np.array([])
        nameArray7 = np.array([])
        nameArray8 = np.array([])
        nameArray10 = np.array([])
        nameArray11 = np.array([])
        nameArray12 = np.array([])
        nameArray13 = np.array([])
        nameArray14 = np.array([])
        nameArray15 = np.array([])
        nameArray16 = np.array([])
        nameArray17 = np.array([])
        nameArrayEnd1 = np.array([])
        nameArrayEnd3 = np.array([])
# Entry point: expects two CLI arguments, the start and end page numbers.
if __name__ == '__main__':
    run()