import datetime  # run-time bookkeeping
import random  # random delays between requests
import re  # imported explicitly: do not rely on "from tkinter import *" to provide it
import time  # sleeping between requests
import tkinter.filedialog
import webbrowser  # open pages in the default browser
from tkinter import *  # window widgets
from tkinter import messagebox  # pop-up dialogs
from tkinter import ttk

import openpyxl
import requests
from bs4 import BeautifulSoup
from lxml import etree
from openpyxl import Workbook
'''
版本:V3.0
语言环境:python 3.8
pycharm 2020.2
增加内容:
1、数据可以保存至EXCEL,并提示错误信息
2、显示爬取进度
3、测试不用代理只让读取56个企业数据
4、判断链接IP是否被封
5、加随机延迟时间,避免封IP
6、加浏览器信息,模仿浏览器读取数据
7、为Treeview 控件添加滚动条
8、添加运行时间显示
9、添另爬取简易管理排污许可证信息
10、添加按钮判断,程序正在运行中,点击按钮有提示
11、添加数据判断,在网站中有空的数据,如果预见直接跳过
12、修改总页数据的判断方式, 这样就可以不用字符串截取方式获得
13、添加用户自己可以填写名字与选择路径保存EXCEL文件
13、添加用户可以选择地区进行读取
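14. On Python 3.9, int(re.sub("\D", "", urlpage)[1:]) fails. Likely cause: this script never imported "re" itself and only saw it through "from tkinter import *", which apparently stopped exporting modules in 3.9. Fix: add an explicit "import re".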
15、修改时间 2021.1.9 测试成功 运行83分钟
16、网站改版重新修复 2021.3.30
'''
# ---- shared crawl state --------------------------------------------------
# Detail rows collected by the key-management crawl (one list per enterprise).
treedata1 = []
# Rows collected by the simplified-management crawl.
treedata_jianhuaguanli = []
# Page numbers whose scraped columns had mismatching lengths.
lerror2 = []
# Running count of enterprises handled so far; shown in the progress label.
jishuleijia = 0

# ---- administrative division codes ---------------------------------------
# Currently selected province / city codes (filled in by dierFunc).
shengjidaima = ''
shijidaima = ''
# Province name -> code, plus the names in sheet order for the first combobox.
a_dict, a_list = {}, []
# City name -> code, plus the names for the second combobox (filled by xFunc).
b_dict, b_list = {}, []

# Workbook with the division codes: Sheet1 is read below for provinces,
# Sheet2 is read by xFunc for cities.
filepath = '行政代码区划.xlsx'
wb = openpyxl.load_workbook(filepath)
ws = wb['Sheet1']
max_row, max_col = ws.max_row, ws.max_column
ws2 = wb['Sheet2']
max_row2, max_col2 = ws2.max_row, ws2.max_column

# Column 1 holds the province name, column 2 its code.
# NOTE(review): range(1, max_row) stops at row max_row - 1, so the last row
# of the sheet is never read -- confirm that is intended.
for x in range(1, max_row):
    cell_data = ws.cell(row=x, column=1).value
    cell_id = ws.cell(row=x, column=2).value
    a_dict[cell_data] = cell_id
    a_list.append(cell_data)
# 得到总页数_开始
def kaishipaqu_begin():
    """Crawl key-management permit data for the selected region.

    Steps:
      1. POST page 1 of the permit list to read the total page count.
      2. POST the last page to count its rows and derive the enterprise total.
      3. For every list page, and for every enterprise on it, fetch the
         detail, water-pollutant and air-pollutant pages.

    Each enterprise is inserted into ``tree2`` (summary) and ``tree``
    (details) and appended to the module-level ``treedata1`` list.  Label
    ``l`` shows progress, label ``l2`` shows the delay countdown and finally
    the total run time.  The region filter is read from the module-level
    ``shengjidaima`` / ``shijidaima`` codes.

    The function runs synchronously inside the Tk callback and keeps the
    window alive only through the explicit ``update()`` calls.
    """
    global jishuleijia

    # NOTE(review): this only shows a notice -- the crawl below still starts
    # when earlier results exist.  Kept as in the original; confirm whether an
    # early return is wanted.
    if treedata1 or treedata_jianhuaguanli:
        messagebox.showinfo("提示", '程序正在运行中,请稍候........')

    starttime = datetime.datetime.now()  # start of the run, for the timers

    list_url = "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!licenseInformation.action"
    detail_url = "http://permit.mee.gov.cn/perxxgkinfo/xkgkAction!xkgk.action"

    # Browser-like headers for the list POSTs.
    list_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Length": "141",
        "Content-Type": "application/x-www-form-urlencoded",
        "DNT": "1",
        "Host": "permit.mee.gov.cn",
        "Origin": "http://permit.mee.gov.cn",
        "Pragma": "no-cache",
        "Referer": "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!licenseInformation.action",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    }
    # Browser-like headers for the detail GETs.  The original rebuilt this
    # dict under the name ``headers`` for every enterprise, which replaced the
    # POST headers for all later list pages, and never passed it to the GETs.
    detail_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "permit.mee.gov.cn",
        "Referer": "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!licenseInformation.action",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    }

    def page_form(page_no):
        """Return the POST form for list page ``page_no`` of the selected region."""
        return {"page.pageNo": page_no,
                "page.orderBy": "",
                "page.order": "",
                "province": shengjidaima,
                "city": shijidaima,
                "registerentername": "",
                "xkznum": "",
                "treadname": "",
                "treadcode": "",
                "publishtime": ""}

    def countdown(choices):
        """Wait a random number of seconds from ``choices``, ticking label l2."""
        yanshi = random.choice(choices)
        print('延迟时间', yanshi)
        for aa in range(yanshi):
            time.sleep(1)
            l2["text"] = '延迟时间:' + str(yanshi) + " / " + str(aa) + " 开始时间:" + starttime.strftime('%H:%M')
            l2.update()

    def joined_text(nodes):
        """Join xpath text nodes into one string without newlines or tabs."""
        return "".join(nodes).replace('\n', '').replace('\r', '').replace('\t', '')

    # ---- total page count -------------------------------------------------
    r = requests.post(list_url, headers=list_headers, data=page_form("1"))
    html = etree.HTML(r.text)
    # The sixth pager link carries the last page, e.g. "javascript:jumpPage2(60)".
    urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]
    # Keep only the digits ("260") and drop the leading "2" from "jumpPage2".
    zonyeshu = int(re.sub(r"\D", "", urlpage)[1:])
    print(zonyeshu)

    # ---- total enterprise count -------------------------------------------
    r = requests.post(list_url, headers=list_headers, data=page_form(zonyeshu))
    html = etree.HTML(r.text)
    zuihouyiye = len(html.xpath('//table[@class="tabtd"]/tr/td/a/@href'))  # rows on the last page
    qiyezongshu = (zonyeshu - 1) * 10 + zuihouyiye  # full pages hold 10 rows each
    if not qiyezongshu:  # nothing returned: treated as a blocked IP, pause on the console
        input("IP错误,请更换!!!!!")
    print(qiyezongshu)

    # ---- walk every list page ---------------------------------------------
    for page_no in range(1, zonyeshu + 1):
        countdown([15, 10, 16, 18, 17, 11, 9])  # random delay before each list page

        r = requests.post(list_url, headers=list_headers, data=page_form(page_no))
        html = etree.HTML(r.text)
        paiwuxuke_url = html.xpath('//table[@class="tabtd"]/tr/td/a/@href')  # detail page links
        paiwuxuke_id = html.xpath('//table[@class="tabtd"]/tr/td[@class="font-green"]/text()')  # permit numbers
        paiwuxuke_name = html.xpath(
            '//table[@class="tabtd"]/tr/td[@style="text-align: left;padding-left: 5px;"]/text()')  # enterprise names
        if not paiwuxuke_url:  # empty page: treated as a blocked IP, pause on the console
            input("IP错误,请更换!!!!!")
        print(paiwuxuke_id)
        print(paiwuxuke_name)
        # NOTE(review): blocking dialog with no message on every page; it
        # looks like a debugging leftover -- confirm before removing.
        messagebox.showinfo("提示")

        for idx in range(len(paiwuxuke_name)):
            countdown([15, 10, 16, 18, 17, 11, 19])  # random delay before each enterprise

            # Summary row, newest first.
            tree2.insert("", '0', jishuleijia, text="",
                         values=(jishuleijia, paiwuxuke_id[idx], paiwuxuke_name[idx], paiwuxuke_url[idx]))
            tree2.update()

            jishuleijia = jishuleijia + 1
            print(jishuleijia, "====", qiyezongshu)
            l["text"] = str(jishuleijia) + '/' + str(qiyezongshu)
            l.update()

            addurl = paiwuxuke_url[idx][-32:]  # the data id is the last 32 characters of the link
            name = paiwuxuke_name[idx]

            # ---- detail page ----------------------------------------------
            url = f'{detail_url}?xkgk=getxxgkContent&dataid={addurl}'
            rep = requests.get(url, headers=detail_headers)
            soup = BeautifulSoup(rep.text, 'lxml')
            # Title lookup kept from the original: raises IndexError when the
            # page does not contain the enterprise title.
            soup.find_all('p', style="font-size:36px;")[0]
            name_add = soup.find_all('p', style="font-weight: bold;color: green;font-size: 14px;")[0].text
            # Remove all whitespace, then split the "label:value" run on the
            # first four colons.
            str2 = ''.join(name_add.strip().split())
            u1, u2, u3, u4, u5 = str2.split(':', 4)
            f2 = u2[0:u2.find('行业类别')]  # text before the next label
            g2 = u3[0:u3.find('所在地区')]
            h2 = u4[0:u4.find('发证机关')]

            paiwuxukebianhao = (soup.find_all('table', class_="tab0")[0].text.strip()
                                .replace('\n', '').replace('\r', '').replace(' ', ''))
            paiwuxkzid = paiwuxukebianhao[19:42]  # permit number
            paiwuxukedata = paiwuxukebianhao[45:56]  # approval date
            paiwuxukeyouxiaoqi = paiwuxukebianhao[69:79]  # validity period

            # ---- water pollutant page -------------------------------------
            url_idshui = addurl + '&isVersion=&operate=readonly'
            urlshui = detail_url + "?xkgk=approveWater_xkzgk&dataid=" + url_idshui
            html = etree.HTML(requests.get(urlshui, headers=detail_headers).text)
            shuicod = joined_text(html.xpath('//table[@id="fswrwinfo4"]/tr[3]/td[1]/text()'))  # COD per year
            shuiandan = joined_text(html.xpath('//table[@id="fswrwinfo4"]/tr[4]/td[1]/text()'))  # ammonia nitrogen per year

            # ---- air pollutant page ---------------------------------------
            urldaqi = detail_url + "?xkgk=approveAtmosphere_xkzgk&dataid=" + url_idshui
            html = etree.HTML(requests.get(urldaqi, headers=detail_headers).text)
            shuiso2 = joined_text(html.xpath('//table[@id="spenterair"]/tr[4]/td[1]/text()'))  # SO2 per year
            shuidanyanghuawu = joined_text(html.xpath('//table[@id="spenterair"]/tr[5]/td[1]/text()'))  # NOx per year

            detail_row = [jishuleijia, name, f2, g2, h2, u5, paiwuxkzid, paiwuxukedata,
                          paiwuxukeyouxiaoqi, shuicod, shuiandan, shuiso2, shuidanyanghuawu]
            treedata1.append(detail_row)  # kept for the Excel export
            tree.insert("", '0', jishuleijia, text="", values=tuple(detail_row))
            tree.update()

    # Take the end time before the blocking dialog so the time the user takes
    # to dismiss it is not counted as run time.
    endtime = datetime.datetime.now()
    messagebox.showinfo("提示", "恭喜,所有数据都已准备完毕!请保存excel文件")

    # total_seconds() also covers runs longer than a day; .seconds does not.
    seconds = int((endtime - starttime).total_seconds())
    start = starttime.strftime('%Y-%m-%d %H:%M')
    minutes, second = divmod(seconds, 60)
    print((endtime - starttime))
    timeStr = str(minutes) + '分钟' + str(second) + "秒"
    print("程序从 " + start + ' 开始运行,运行时间为:' + timeStr)
    l2["text"] = '程序共运行时间:' + timeStr
    l2.update()
def tree_click(event):
    """Show the second column of the clicked ``tree2`` row in a dialog.

    Bound to ``<ButtonRelease>`` on ``tree2``.  When nothing is selected (or
    the row has fewer than two values) a "no data yet" notice is shown.

    Args:
        event: the Tk event object; not used.
    """
    selection = tree2.selection()
    # Look up one item id.  Passing the whole selection tuple, as before,
    # fails as soon as more than one row is selected.
    values = tree2.item(selection[0], 'values') if selection else ()
    if len(values) < 2:
        messagebox.showinfo("提示", '现在还没有数据!')
        return
    # str(): Tk may hand numeric-looking cells back as numbers.
    messagebox.showinfo("提示", "你所选择的数据是:" + str(values[1]))
def treesave():
    """Export the key-management rows in ``treedata1`` to an .xlsx workbook.

    Asks the user for a target file, writes a header row followed by every
    row of ``treedata1`` to a sheet titled '重点管理数据', and reports the
    outcome in a dialog.  Does nothing when the user cancels the file dialog.
    """
    if not treedata1:  # nothing crawled yet
        messagebox.showinfo("提示", '没有数据,不必保存')
        return

    filename = tkinter.filedialog.asksaveasfilename(
        filetypes=[('xlsx', '*.xlsx')], defaultextension='.xlsx', initialdir='D:\\')
    if not filename:
        # Dialog cancelled.  The original went on to write a file named ".xls".
        return
    # openpyxl writes the xlsx format, so the file must carry that extension
    # (the original always appended ".xls").
    if not filename.lower().endswith('.xlsx'):
        filename += '.xlsx'

    biaoti = ['序号', '企业名称', '生产经营场所地址', '行业类别', '所在地区', '发证机关', '许可证编号',
              '办结日期', '有效期限', 'COD年排放量', '氨氮年排放量', '二氧化硫年排放量', '氮氧化物年排放量']
    try:
        wb = Workbook()
        wb1 = wb.create_sheet('index', 0)
        wb1.title = '重点管理数据'
        wb1.append(biaoti)  # header row
        for row in treedata1:
            wb1.append(row)
        wb.save(filename)
    except Exception as exc:
        # Keep the cause visible on the console instead of swallowing it.
        print("save failed:", exc)
        messagebox.showinfo("提示", '保存文件错误,请重试~!!')
    else:
        messagebox.showinfo("提示", "paiwuxuke2020EXCEL保存完毕~!!!")
def jiayiguanli_save():
    """Export the simplified-management rows to an .xlsx workbook.

    Asks the user for a target file, writes a header row followed by every
    row of ``treedata_jianhuaguanli`` to a sheet titled '简易管理数据', and
    reports the outcome in a dialog.  Does nothing when the user cancels the
    file dialog.
    """
    if not treedata_jianhuaguanli:  # nothing crawled yet
        messagebox.showinfo("提示", '没有数据,不必保存')
        return

    filename = tkinter.filedialog.asksaveasfilename(
        filetypes=[('xlsx', '*.xlsx')], defaultextension='.xlsx', initialdir='D:\\')
    if not filename:
        # Dialog cancelled.  The original went on to write a file named ".xls".
        return
    # openpyxl writes the xlsx format, so the file must carry that extension
    # (the original always appended ".xls").
    if not filename.lower().endswith('.xlsx'):
        filename += '.xlsx'

    biaoti = ['序号', '许可证编号', '企业名称', '行业类别', '有效期限', '登记时间', '详细链接']
    try:
        wb = Workbook()
        wb1 = wb.create_sheet('index', 0)
        wb1.title = '简易管理数据'
        wb1.append(biaoti)  # header row
        for row in treedata_jianhuaguanli:
            wb1.append(row)
        wb.save(filename)
    except Exception as exc:
        # Keep the cause visible on the console instead of swallowing it.
        print("save failed:", exc)
        messagebox.showinfo("提示", '保存文件错误,请重试~!!')
    else:
        messagebox.showinfo("提示", "paiwuxukejianyiguanli.xlsx保存完毕~!!!")
def kaishipaqu_jiahuaguanli():  # simplified-management permits
    """Crawl simplified-management registration data for the selected region.

    Steps:
      1. POST page 1 of the registration list to read the total page count.
      2. POST the last page to count its rows and derive the enterprise total.
      3. For every list page, read the permit number, name, industry,
         validity, registration time and link columns.

    Pages whose columns have different lengths are skipped and their page
    number is recorded in the module-level ``lerror2`` list and shown in
    label ``lerror``.  Every accepted row is inserted into ``tree2`` and
    appended to ``treedata_jianhuaguanli``.  Label ``l`` shows progress and
    label ``l2`` the delay countdown and final run time.  The region filter
    is read from the module-level ``shengjidaima`` / ``shijidaima`` codes.
    """
    global jishuleijia

    # NOTE(review): this only shows a notice -- the crawl below still starts
    # when earlier results exist.  Kept as in the original; confirm whether an
    # early return is wanted.
    if treedata1 or treedata_jianhuaguanli:
        messagebox.showinfo("提示", '程序正在运行中,请稍候........')

    starttime = datetime.datetime.now()  # start of the run, for the timers

    url = 'http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!getRegisterInfo.action'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Content-Length": "141",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "permit.mee.gov.cn",
        "Origin": "http://permit.mee.gov.cn",
        "Referer": "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!getRegisterInfo.action",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    }

    def page_form(page_no):
        """Return the full POST form for list page ``page_no``."""
        return {"page.pageNo": page_no,
                "page.orderBy": "",
                "page.order": "",
                "province": shengjidaima,
                "city": shijidaima,
                "registerentername": "",
                "xkznum": "",
                "treadname": "",
                "treadcode": "",
                "publishtime": ""}

    def countdown(choices):
        """Wait a random number of seconds from ``choices``, ticking label l2."""
        yanshi = random.choice(choices)
        print('延迟时间', yanshi)
        for aa in range(yanshi):
            time.sleep(1)
            l2["text"] = '延迟时间:' + str(yanshi) + " / " + str(aa) + " 开始时间:" + starttime.strftime('%H:%M')
            l2.update()

    # ---- total page count -------------------------------------------------
    # The first request sends only the page number and the region, as before.
    datas = {"page.pageNo": "1",
             "province": shengjidaima,
             "city": shijidaima,
             }
    r = requests.post(url, data=datas, headers=headers)
    html = etree.HTML(r.text)
    # The sixth pager link carries the last page number.
    urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]
    # Keep only the digits and drop the first one (same rule as the
    # key-management crawl, where the first digit comes from the handler name).
    zonyeshu = int(re.sub(r"\D", "", urlpage)[1:])
    if not zonyeshu:  # treated as a blocked IP, pause on the console
        input("IP错误,请更换!!!!!")
    print(zonyeshu)

    # ---- total enterprise count -------------------------------------------
    r = requests.post(url, data=page_form(zonyeshu), headers=headers)
    html = etree.HTML(r.text)
    zuihouyiye = len(html.xpath('//table[@class="tabtd"]/tr/td/a/@href'))  # rows on the last page
    qiyezongshu = (zonyeshu - 1) * 10 + zuihouyiye  # full pages hold 10 rows each
    if not qiyezongshu:  # treated as a blocked IP, pause on the console
        input("IP错误,请更换!!!!!")
    print(qiyezongshu)

    # ---- walk every list page ---------------------------------------------
    for page_no in range(1, zonyeshu + 1):
        countdown([15, 10, 16, 18, 17, 11, 9])  # random delay before each page

        r = requests.post(url, data=page_form(page_no), headers=headers)
        html = etree.HTML(r.text)
        paiwuxuke_url = html.xpath('//table[@class="tabtd"]/tr/td/a/@href')  # detail page links
        paiwuxuke_id = html.xpath('//table[@class="tabtd"]/tr/td[@class="font-green"]/text()')  # permit numbers
        paiwuxuke_name = html.xpath(
            '//table[@class="tabtd"]/tr/td[@style="text-align: left;padding-left: 5px;"]/text()')  # enterprise names
        # [1:] drops the first match of each column.
        paiwuxuke_shenpishijian = html.xpath('//table[@class="tabtd"]/tr/td[7]/text()')[1:]  # registration time
        paiwuxuke_hangye = html.xpath('//table[@class="tabtd"]/tr/td[5]/text()')[1:]  # industry
        paiwuxuke_youxiaoqi = html.xpath('//table[@class="tabtd"]/tr/td[6]/text()')[1:]  # validity period
        if not paiwuxuke_url:  # empty page: treated as a blocked IP, pause on the console
            input("IP错误,请更换!!!!!")

        # A page is only usable when every column yielded the same row count.
        columns = (paiwuxuke_id, paiwuxuke_name, paiwuxuke_url,
                   paiwuxuke_youxiaoqi, paiwuxuke_hangye, paiwuxuke_shenpishijian)
        if len({len(column) for column in columns}) != 1:
            print("发现有数据不一致的地方")
            print('发现错误页面:' + str(page_no))
            lerror2.append([page_no])
            lerror["text"] = '程序发现错误页面:' + str(page_no)
            lerror.update()
            print(lerror2)
            continue

        for idx in range(len(paiwuxuke_name)):
            record = [jishuleijia, paiwuxuke_id[idx], paiwuxuke_name[idx], paiwuxuke_hangye[idx],
                      paiwuxuke_youxiaoqi[idx], paiwuxuke_shenpishijian[idx], paiwuxuke_url[idx]]
            tree2.insert("", '0', jishuleijia, text="", values=tuple(record))  # newest first
            treedata_jianhuaguanli.append(record)  # kept for the Excel export
            jishuleijia = jishuleijia + 1
            print(jishuleijia, "====", qiyezongshu)
            l["text"] = str(jishuleijia) + '/' + str(qiyezongshu)
            l.update()
            tree2.update()

    endtime = datetime.datetime.now()
    # total_seconds() also covers runs longer than a day; .seconds does not.
    seconds = int((endtime - starttime).total_seconds())
    start = starttime.strftime('%Y-%m-%d %H:%M')
    minutes, second = divmod(seconds, 60)
    print((endtime - starttime))
    timeStr = str(minutes) + '分钟' + str(second) + "秒"
    print("程序从 " + start + ' 开始运行,运行时间为:' + timeStr)
    l2["text"] = '程序共运行时间:' + timeStr
    l2.update()
    print(lerror2)
    lerror["text"] = '程序发现错误页面:' + str(lerror2)
    lerror.update()
    messagebox.showinfo("提示", "错误信息页面:" + str(lerror2))
    return
def xFunc(event):
    """Refill the city combobox after a province is chosen in ``com``.

    Cities are taken from Sheet2 (name in column 1, code in column 3); a city
    belongs to the province when the first two characters of both codes
    match.  The module-level ``b_dict`` / ``b_list`` are rebuilt in place and
    ``com2`` is pre-set to the second entry when there is one.

    Setting the combobox here does not fire ``<<ComboboxSelected>>``, so the
    selected codes only change once the user picks a city in ``com2``.

    Args:
        event: the Tk event object; not used.
    """
    a_dict_str = str(a_dict[com.get()])[0:2]  # two-character province prefix
    print(a_dict_str)

    # Start from an empty list: the original kept appending, so cities of
    # previously chosen provinces piled up in the combobox.
    b_dict.clear()
    b_list.clear()

    # NOTE(review): range(1, max_row2) stops at row max_row2 - 1, so the last
    # row of Sheet2 is never read -- confirm that is intended.
    for xx in range(1, max_row2):
        cell_data2 = ws2.cell(row=xx, column=1).value
        cell_id2 = ws2.cell(row=xx, column=3).value
        if str(cell_id2)[0:2] == a_dict_str:
            b_dict[cell_data2] = cell_id2
            b_list.append(cell_data2)

    com2["value"] = b_list
    if len(b_list) > 1:
        com2.current(1)  # default to the second entry, as before
    elif b_list:
        com2.current(0)  # only one entry: index 1 would raise
    else:
        com2.set('')  # no city matched: clear the stale text
def dierFunc(event):
    """Store the province and city codes matching the current combobox texts.

    Prints both names with their codes, then writes the codes to the
    module-level ``shengjidaima`` (province) and ``shijidaima`` (city).

    Args:
        event: the Tk event object; not used.
    """
    global shengjidaima, shijidaima

    province_name = com.get()
    print(province_name)
    province_code = a_dict[province_name]
    print(province_code)

    city_name = com2.get()
    print(city_name)
    city_code = b_dict[city_name]
    print(city_code)

    shijidaima = city_code
    shengjidaima = province_code
def huoqudaima():
    """Print the currently stored province and city codes."""
    codes = (shengjidaima, shijidaima)
    print(*codes)
# 读取行政区划码结束。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。
# ---- main window -----------------------------------------------------------
root = Tk()
root.title("排污许可证数据信息")
root.geometry("850x750+500+50")  # width x height, then the x/y screen offset

# ---- status strip: progress, timer, error pages ----------------------------
_status_grid = dict(padx=10, pady=10, sticky=W)
lbxianshixinxi = LabelFrame(root, width=800, text='', padx=80, pady=10)
lbxianshixinxi.grid(row=0, column=0)
l = Label(lbxianshixinxi, text='0/0', width=20)  # progress "done/total"
l.grid(row=0, column=0, **_status_grid)
l2 = Label(lbxianshixinxi, text='程序运行时间:', justify=RIGHT)  # delay countdown / run time
l2.grid(row=0, column=1, **_status_grid)
lerror = Label(lbxianshixinxi, text='', width=25, fg='red', justify=RIGHT)  # error pages
lerror.grid(row=0, column=2, **_status_grid)

# ---- frame that hosts the summary table ------------------------------------
lbtree = LabelFrame(root, width=500, height=10, text='数据显示区域', padx=8, pady=10)
lbtree.grid(row=1, column=0)

# ---- simplified-management controls ----------------------------------------
lb3 = LabelFrame(root, width=500, height=500, text='简化管理排污许可证数据', padx=8, pady=8, foreground='red')
lb3.grid(row=7, column=0)
button3 = Button(lb3, text=" 开始爬取(简化管理) ", command=kaishipaqu_jiahuaguanli)
button3.grid(row=5, column=0, padx=20, pady=0, sticky=W)
button4 = Button(lb3, text=" 保存列表信息数据 ", command=jiayiguanli_save)
button4.grid(row=5, column=4, sticky=W)
button5 = Button(lb3, text=" 退 出 ", command=root.quit)
button5.grid(row=5, column=5, padx=20, pady=0, sticky=N)

# ---- region selection ------------------------------------------------------
lb5 = LabelFrame(root, width=500, height=500, text='选择要读取地区', padx=5, pady=5, foreground='red')
lb5.grid(row=7, column=0, sticky=W)
com = ttk.Combobox(lb5)  # province list
com.grid(row=1, column=0, sticky=E)
com["value"] = a_list
com2 = ttk.Combobox(lb5)  # city list, filled by xFunc
com2.grid(row=2, column=0, sticky=E)
com.bind("<<ComboboxSelected>>", xFunc)
com2.bind("<<ComboboxSelected>>", dierFunc)

# ---- key-management controls -----------------------------------------------
lb4 = LabelFrame(root, width=500, height=500, text='重点管理排污许可证数据', padx=8, pady=8, foreground='red')
lb4.grid(row=8, column=0)
button = Button(lb4, text=" 开始爬取(重点管理) ", command=kaishipaqu_begin)
button.grid(row=5, column=2, padx=20, pady=0, sticky=W)
button2 = Button(lb4, text=" 保存列表信息数据 ", command=treesave)
button2.grid(row=5, column=4, sticky=W)
button1 = Button(lb4, text=" 退 出 ", command=root.quit)
button1.grid(row=5, column=5, padx=20, pady=0, sticky=N)

# ---- summary table (tree2) with scrollbars ---------------------------------
tree2 = ttk.Treeview(lbtree, height=10, show="headings")  # hide the tree column
scroll2_ty = Scrollbar(root, orient=VERTICAL, command=tree2.yview)
scroll2_ty.grid(row=1, column=2, sticky=N + S)
tree2['yscrollcommand'] = scroll2_ty.set
scroll2_tx = Scrollbar(root, orient=HORIZONTAL, command=tree2.xview)
scroll2_tx.grid(row=3, column=0, sticky=E + W)
tree2['xscrollcommand'] = scroll2_tx.set
tree2.grid(row=1, columnspan=1)

# Column name -> width; the name doubles as the heading text.
_tree2_columns = {
    '序号': 100,
    '许可证编号': 200,
    '企业名称': 100,
    '行业类别': 100,
    '有效期限': 100,
    '登记时间': 100,
    '详细链接': 100,
}
tree2["columns"] = tuple(_tree2_columns)
for _name, _width in _tree2_columns.items():
    tree2.column(_name, width=_width)
for _name in _tree2_columns:
    tree2.heading(_name, text=_name)

# ---- detail table (tree) with scrollbars -----------------------------------
tree = ttk.Treeview(root, show="headings")  # hide the tree column
scroll_ty = Scrollbar(root, orient=VERTICAL, command=tree.yview)
scroll_ty.grid(row=4, column=2, sticky=N + S)
tree['yscrollcommand'] = scroll_ty.set
scroll_tx = Scrollbar(root, orient=HORIZONTAL, command=tree.xview)
scroll_tx.grid(row=5, column=0, sticky=E + W)
tree['xscrollcommand'] = scroll_tx.set
tree.grid(row=4, columnspan=1)

# Column name -> width; the name doubles as the heading text.
_tree_columns = {
    '序号': 50,
    '企业名称': 100,
    '生产经营场所地址': 50,
    '行业类别': 100,
    '所在地区': 50,
    '发证机关': 50,
    '许可证编号': 100,
    '办结日期': 50,
    '有效期限': 50,
    'COD年排放量': 50,
    '氨氮年排放量': 50,
    '二氧化硫年排放量': 50,
    '氮氧化物年排放量': 50,
}
tree["columns"] = tuple(_tree_columns)
for _name, _width in _tree_columns.items():
    tree.column(_name, width=_width)
for _name in _tree_columns:
    tree.heading(_name, text=_name)

tree2.bind('<ButtonRelease>', tree_click)  # show the clicked row's permit number
root.mainloop()  # enter the Tk event loop
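# Web-page residue copied along with the script (post date and site footer),
# kept as comments so that the file parses:
# 2021-03-31
# 最新推荐文章于 2024-07-24 23:49:50 发布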