补充实现功能如下: 2020.6.25
1、对数据文件是否已存在进行判断:如果不存在则重新建立,如果已存在则对数据进行追加
2、手工设置爬取延时时间
from tkinter import * # 导入窗口控件
import requests
from lxml import etree
from tkinter import ttk
from bs4 import BeautifulSoup
import webbrowser # 调用浏览器打开网页
from tkinter import messagebox # 弹出提示框
from openpyxl import Workbook
import openpyxl
import time # 延时
# Early-exit flag shared with download_song(): it becomes 'ok' as soon as a
# scraped permit number is already present in the workbook, and is 'no'
# otherwise.  It is initialised here because a module-level ``global``
# statement does not create the name, and download_song() reads the flag even
# when its workbook scan loop runs zero times (header-only workbook).
exitbiaozhi = 'no'
# Records scraped during this session (one list per enterprise); filled by
# download_song() and written to the workbook by treesave().
treedata1 = []
# Search endpoint that lists published permits, and the detail endpoint prefix.
_LIST_URL = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
_DETAIL_URL = "http://permit.mee.gov.cn/permitExt/xkgkAction!xkgk.action?xkgk="


def _fetch_list_page(page_no):
    """POST the search form for result page *page_no* and return the parsed HTML tree."""
    form = {"page.pageNo": page_no,
            "page.orderBy": "",
            "page.order": "",
            "province": "210000000000",
            "city": "211300000000",
            "registerentername": "",
            "xkznum": "",
            "treadname": "",
            "treadcode": "",
            "publishtime": ""}
    response = requests.post(_LIST_URL, data=form)
    return etree.HTML(response.text)


def _total_pages(list_html):
    """Return the total page count read from the sixth pager link's onclick attribute."""
    onclick = list_html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]
    # The number starts at offset 21 of the onclick text.  Take every
    # consecutive digit from there so that 1-digit and 3+-digit page counts
    # work too (a fixed [21:23] slice only handles exactly two digits).
    digits = ''
    for ch in onclick[21:]:
        if not ch.isdigit():
            break
        digits += ch
    return int(digits)


def _text_before(text, marker):
    """Return the part of *text* that precedes *marker* (same slicing as text[0:find])."""
    return text[0:text.find(marker)]


def _clean_amount(text_nodes):
    """Join xpath text nodes, drop line breaks/tabs and return the amount as an int.

    An empty cell or a '/' placeholder counts as 0.
    """
    text = "".join(text_nodes).replace('\n', '').replace('\r', '').replace('\t', '')
    if text in ('/', ''):
        return 0
    return int(float(text))


def _scrape_enterprise(href, row):
    """Fetch the detail, water and air pages of one enterprise.

    *href* is the detail link taken from the list page and *row* the matching
    ``<tr>`` element.  Returns the twelve data fields in tree/workbook column
    order (everything except the running number); index 5 is the permit number.
    """
    name = row.xpath('./td[4]/text()')[0]
    query = href[39:93]  # 'getxxgkContent&dataid=<id>' part of the link
    # NOTE(review): the query values are sent as HTTP headers, exactly as the
    # original code did; it is unclear whether the site needs them - confirm.
    detail = requests.get(_DETAIL_URL + query,
                          headers={"xkgk": "getxxgkContent", "dataid": query})
    soup = BeautifulSoup(detail.text, 'lxml')

    # The green summary paragraph holds address / industry / region / authority
    # as one run of text; remove all whitespace and cut it at the colons.
    summary = soup.find_all('p', style="font-weight: bold;color: green;font-size: 14px;")[0].text
    compact = ''.join(summary.split())
    _, address_part, industry_part, region_part, authority = compact.split(':', 4)
    address = _text_before(address_part, '行业类别')
    industry = _text_before(industry_part, '所在地区')
    region = _text_before(region_part, '发证机关')

    # Permit number and dates sit at fixed offsets of the flattened table text.
    permit_text = soup.find_all('table', class_="tab0")[0].text.strip()
    permit_text = permit_text.replace('\n', '').replace('\r', '').replace(' ', '')
    permit_no = permit_text[19:41]
    issued_on = permit_text[45:56]
    valid_until = permit_text[69:79]

    # Drop the leading 'getxxgkContent&dataid=' to obtain the bare record id.
    id_suffix = query[22:] + '&isVersion=&operate=readonly'

    water = etree.HTML(requests.get(_DETAIL_URL + "approveWater_xkzgk&dataid=" + id_suffix).text)
    cod = _clean_amount(water.xpath('//table[@id="fswrwinfo4"]/tr[3]/td[1]/text()'))
    ammonia = _clean_amount(water.xpath('//table[@id="fswrwinfo4"]/tr[4]/td[1]/text()'))

    air = etree.HTML(requests.get(_DETAIL_URL + "approveAtmosphere_xkzgk&dataid=" + id_suffix).text)
    so2 = _clean_amount(air.xpath('//table[@id="spenterair"]/tr[4]/td[1]/text()'))
    nox = _clean_amount(air.xpath('//table[@id="spenterair"]/tr[5]/td[1]/text()'))

    return [name, address, industry, region, authority, permit_no, issued_on,
            valid_until, cod, ammonia, so2, nox]


def download_song():
    """Scrape the permit list page by page and show every new enterprise in the tree.

    Crawling stops at the first enterprise whose permit number is already
    stored in column 7 of ``paiwuxuke.xlsx`` (or was already scraped in this
    session); presumably the site lists the newest permits first - confirm.
    New records are appended to the module-level ``treedata1`` list and to the
    ``tree`` widget; nothing is written to disk here.

    Side effects: sets the module-level ``exitbiaozhi`` flag ('ok' when a known
    permit was hit, else 'no') and makes the delay entry read-only while the
    crawl is running.
    """
    global exitbiaozhi
    exitbiaozhi = 'no'

    # Read the delay when the button is pressed so the value typed by the user
    # is honoured; fall back to the start-up default on invalid input.
    try:
        delay = max(float(enter1.get()), 0)
    except ValueError:
        delay = timeyanshi

    total_pages = _total_pages(_fetch_list_page(1))
    messagebox.showinfo("提示", "数据正在读取请稍候。。。。")

    # Collect every permit number that is already known, once, instead of
    # rescanning the sheet for each enterprise.  All rows up to and including
    # max_row are covered.
    ws = openpyxl.load_workbook('paiwuxuke.xlsx')['Sheet']
    known_permits = {ws.cell(row=r, column=7).value for r in range(1, ws.max_row + 1)}
    known_permits.update(record[6] for record in treedata1)

    datasum = 1  # running number shown in the first column
    enter1['state'] = 'readonly'  # lock the delay box while crawling
    try:
        for page in range(1, total_pages + 1):  # + 1: include the last page
            list_html = _fetch_list_page(page)
            hrefs = list_html.xpath('//table[@class="tabtd"]/tr/td/a/@href')
            rows = list_html.xpath('//table[@class="tabtd"]/tr')[1:]  # skip header row
            for href, row in zip(hrefs, rows):
                fields = _scrape_enterprise(href, row)
                if fields[5] in known_permits:
                    exitbiaozhi = 'ok'
                    break
                record = [datasum] + fields
                treedata1.append(record)
                # Always append at the end so rows keep crawl order across pages.
                tree.insert("", "end", text="", values=tuple(record))
                tree.update()
                time.sleep(delay)  # throttle requests to avoid an IP ban
                datasum = datasum + 1
            if exitbiaozhi == 'ok':
                messagebox.showinfo("提示", "截至目前,没有新增企业")
                break
    finally:
        enter1['state'] = 'normal'
def treesave():
    """Append the records scraped in this session to ``paiwuxuke.xlsx``.

    Shows an information box in both cases: after a successful save, or when
    there is nothing to save.  The in-memory list ``treedata1`` is emptied once
    the workbook has been written, so pressing the save button again does not
    append the same rows a second time.
    """
    if not treedata1:
        messagebox.showinfo("提示", '没有数据,不必保存')
        return
    wb = openpyxl.load_workbook('paiwuxuke.xlsx')
    ws = wb['Sheet']
    for record in treedata1:
        ws.append(record)
    wb.save("paiwuxuke.xlsx")
    # Clear only after save() succeeded so a failed write loses no data.
    treedata1.clear()
    messagebox.showinfo("提示", "EXCEL保存完毕~!!!")
# ---- Main window ------------------------------------------------------------
root = Tk()
root.title("label-test")
# "<width>x<height>+<x>+<y>": window size followed by its screen position.
root.geometry("1324x468+100+100")

# Hint label; grid() without arguments puts it in the first free row.
l = Label(root, text="请输入爬取数据间隔秒数:间隔越长越不容易被排污许可证公示网站封IP,但爬取越慢。")
l.grid()

# Entry box for the crawl delay, pre-filled with 5.
enter1 = Entry(root)
enter1.grid(row=1, column=0)
enter1.insert(0, "5")
timeyanshi = int(enter1.get())
print(timeyanshi)

# ---- Result table -----------------------------------------------------------
# Column identifiers double as heading captions; nine wide text columns are
# followed by four narrow emission columns.
_wide_columns = ('序号', '企业名称', '生产经营场所地址', '行业类别', '所在地区', '发证机关', '许可证编号', '办结日期', '有效期限')
_narrow_columns = ('COD年排放量', '氨氮年排放量', '二氧化硫年排放量', '氮氧化物年排放量')

tree = ttk.Treeview(root, show="headings")  # hide the implicit first tree column
tree.grid(row=2, columnspan=2)
tree["columns"] = _wide_columns + _narrow_columns
for _column_width, _column_names in ((100, _wide_columns), (50, _narrow_columns)):
    for _column_name in _column_names:
        tree.column(_column_name, width=_column_width)
        tree.heading(_column_name, text=_column_name)

# ---- Scrollbars -------------------------------------------------------------
# The string below is the original (no-op) note about the scrollbar options.
"""
定义滚动条控件
orient为滚动条的方向,vertical--纵向,horizontal--横向
command=self.tree.yview 将滚动条绑定到treeview控件的Y轴
"""
scroll_ty = Scrollbar(root, orient=VERTICAL, command=tree.yview)
scroll_ty.grid(row=2, column=2, sticky=N + S)
tree.configure(yscrollcommand=scroll_ty.set)
scroll_tx = Scrollbar(root, orient=HORIZONTAL, command=tree.xview)
scroll_tx.grid(row=3, column=0, sticky=E + W)
tree.configure(xscrollcommand=scroll_tx.set)
def tree_click(event):
    """Show the second column value of the selected table row in a message box.

    Bound to mouse-button release on the tree.  Does nothing when no row is
    selected (for example after a click on a heading or on the empty area),
    which previously raised IndexError inside the Tk callback.  With several
    rows selected, the first one is used.
    """
    selected = tree.selection()
    if not selected:
        return
    values = tree.item(selected[0], 'values')
    if len(values) < 2:
        return
    messagebox.showinfo("提示", "你所选择的数据是:" + str(values[1]))
# Run tree_click whenever a mouse button is released over the table.
tree.bind('<ButtonRelease>', func=tree_click)

# Action buttons below the table, each aligned to the west side of its cell.
# Row 6, column 0: start crawling.
button = Button(root, command=download_song, text="开始爬取")
button.grid(column=0, row=6, sticky=W)
# Row 6, column 1: leave the Tk main loop.
button1 = Button(root, command=root.quit, text="退出")
button1.grid(column=1, row=6, sticky=W)
# Row 7, column 0: write the scraped records to the workbook.
button2 = Button(root, command=treesave, text="保存excel文件")
button2.grid(column=0, row=7, sticky=W)
# Make sure the data workbook exists before the GUI starts: probe it by
# opening it, and create a fresh one holding only the header row when missing.
try:
    # The context manager closes the probe handle even if closing is the
    # only thing that happens with it.
    with open('paiwuxuke.xlsx'):
        pass
except FileNotFoundError:
    wb = Workbook()
    ws = wb.active
    tableTitle = ['序号','企业名称','生产经营场所地址','行业类别','所在地区','发证机关','许可证编号','办结日期','有效期限','COD年排放量','氨氮年排放量','二氧化硫年排放量','氮氧化物年排放量']
    # Header captions go into row 1, one per column (openpyxl columns are 1-based).
    for c, title in enumerate(tableTitle, start=1):
        ws.cell(row=1, column=c).value = title
    wb.save("paiwuxuke.xlsx")
    messagebox.showinfo("提示", '未发现数据文件。程序运行后将在文件目录下建立paiwuxuke.xlsx文件!')

root.mainloop()  # enter the Tk event loop; returns when the window quits