1. Added: the province and city to scrape can now be chosen freely.
2. Added: requests are sent out through randomly chosen proxy IPs.
from tkinter import *  # window widgets
import requests
from lxml import etree
from tkinter import ttk
from bs4 import BeautifulSoup
import webbrowser  # open pages in the system browser
from tkinter import messagebox  # pop-up message boxes
from openpyxl import Workbook
import openpyxl
import time  # delays between requests
import random
exitbiaozhi = False  # flag for breaking out of the scraping loop ("global" at module level is a no-op, so initialize it instead)
treedata1 = []  # global list holding the scraped company detail records
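
# Note 2 above relies on free public proxies, which go stale quickly. A small
# optional helper like the sketch below (an illustration, not part of the
# original script) can weed out dead entries before a run:
def check_proxy(ip_port, timeout=3):
    """Return True if the proxy answers a plain HTTP request within `timeout` seconds."""
    try:
        requests.get('http://permit.mee.gov.cn', proxies={'http': ip_port}, timeout=timeout)
        return True
    except requests.RequestException:
        return False
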
# Read the administrative division codes ..........................................
shengjidaima = ''  # selected province code
shijidaima = ''    # selected city code
a_dict = {}  # province name -> province code
a_list = []  # province names, in sheet order
b_dict = {}  # city name -> city code
b_list = []  # city names for the currently selected province
filepath = '行政代码区划.xlsx'  # open the existing Excel file of division codes
wb = openpyxl.load_workbook(filepath)
ws = wb['Sheet1']
max_row = ws.max_row       # total rows in Sheet1 (provinces)
max_col = ws.max_column    # total columns in Sheet1
ws2 = wb['Sheet2']
max_row2 = ws2.max_row     # total rows in Sheet2 (cities)
max_col2 = ws2.max_column  # total columns in Sheet2
for x in range(1, max_row + 1):  # openpyxl rows are 1-based; +1 so the last row is read too
    cell_data = ws.cell(row=x, column=1).value  # province name
    cell_id = ws.cell(row=x, column=2).value    # province code
    a_dict[cell_data] = cell_id  # map province name -> code
    a_list.append(cell_data)     # keep the name for the combobox list
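
# a_dict / a_list now hold every province from Sheet1; the combobox callbacks
# below use them to drive the province -> city cascade.
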
def xFunc(event):
    # Fires when a province is picked; rebuild the city combobox to match.
    # (com.get() and xVariable.get() both return the selected value.)
    a_dict_str = str(a_dict[com.get()])[0:2]  # first two digits of the province code
    print(a_dict_str)
    # A ttk.Combobox has no delete() for its drop-down list: clearing the backing
    # containers and reassigning ["value"] below is what replaces the old entries.
    b_dict.clear()
    b_list.clear()
    for xx in range(1, max_row2 + 1):  # walk Sheet2, the city-level code table
        cell_data2 = ws2.cell(row=xx, column=1).value
        cell_id2 = ws2.cell(row=xx, column=3).value
        cell_id2_str = str(cell_id2)[0:2]  # first two digits link each city to its province
        if cell_id2_str == a_dict_str:
            b_dict[cell_data2] = cell_id2
            b_list.append(cell_data2)
    com2["value"] = b_list
    com2.current(1)  # default to showing the second entry in the list
def dierFunc(event):
    # Fires when a city is picked; remember both selected codes.
    global shengjidaima, shijidaima
    shengjidaima = a_dict[com.get()]
    shijidaima = b_dict[com2.get()]
    print(com.get(), shengjidaima, com2.get(), shijidaima)

def huoqudaima():
    print(shengjidaima, shijidaima)
# End of administrative division code loading .....................................
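
# download_song() drives the whole scrape: one POST to learn the page count,
# then a loop over every result page, following each company's link in turn.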
def download_song():
    # Open the query page once to learn the total page count ......................
http_ip = [
'221.122.91.64:80',
'163.125.65.90:9797',
'58.251.230.5:9797',
'118.69.50.154:80',
'203.202.245.62:80',
'60.191.11.251:3128',
'54.241.121.74:3128',
'58.220.95.54:9400',
'52.179.231.206:80',
'3.22.74.49:3128',
'52.179.18.244:8080'
]
    proxy_ip = {
        'http': random.choice(http_ip),  # random.choice() returns a random item from a list, tuple, or string
    }
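    # requests only routes schemes that appear as keys in the proxies dict, so
    # with only an 'http' entry any https:// request would bypass the proxy.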
print(proxy_ip)
datas = {
"page.pageNo": "1",
"page.orderBy": "",
"page.order": "",
"province": shengjidaima,
"city": shijidaima,
"registerentername": "",
"xkznum": "",
"treadname": "",
"treadcode": "",
"publishtime": ""}
url = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
    r = requests.post(url, data=datas, proxies=proxy_ip)
html = etree.HTML(r.text)
    urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]  # pager link whose onclick carries the total page count
    urlpageidstr = urlpage[21:23]  # slice the digits out of the onclick string (brittle: assumes a two-digit page count)
print(urlpage)
print(urlpageidstr)
    # Scrape every result page ....................................................
    start_page = 1
    urlpageidstr = int(urlpageidstr)  # total page count as an integer
    # urlpageidstr = 2  # debug: cap at 2 pages for speed
    datasum = 1  # running count of scraped records
    pagesum2 = urlpageidstr * 10  # estimated record total (the site lists 10 rows per page)
    messagebox.showinfo("提示", "数据正在读取请稍候。。。。")
    filepath = 'paiwuxuke.xlsx'  # open the existing Excel file that receives the results
    wb = openpyxl.load_workbook(filepath)
    ws = wb['Sheet']
    max_row = ws.max_row     # rows already in the sheet
    max_col = ws.max_column  # columns in the sheet
    global exitbiaozhi
    enter1['state'] = 'readonly'  # lock the delay entry box while scraping runs
    for page in range(start_page, urlpageidstr + 1):  # +1 so the last page is scraped too
        # Fetch one result page and collect the link to each company ..............
        proxy_ip = {
            'http': random.choice(http_ip),  # rotate to a fresh random proxy for this page
        }
        print(proxy_ip)
        datas = {
            "page.pageNo": page,
            "page.orderBy": "",
            "page.order": "",
            "province": shengjidaima,
            "city": shijidaima,
            "registerentername": "",
            "xkznum": "",
            "treadname": "",
            "treadcode": "",
            "publishtime": ""}
        url = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
        r = requests.post(url, data=datas, proxies=proxy_ip)
        html = etree.HTML(r.text)
        href_url_list = html.xpath('//table[@class="tabtd"]/tr/td/a/@href')  # links to each company's detail page
        href_name = html.xpath('//table[@class="tabtd"]/tr')[1:]  # data rows, skipping the header row
        urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]  # re-read the pager (total page count)
        i = 0
        # Open each company's detail page and parse it .............................
        for href_url, roos in zip(href_url_list, href_name):
            proxy_ip = {
                'http': random.choice(http_ip),  # pick another random proxy per detail page
            }
            print(proxy_ip)
            name = roos.xpath('./td[4]/text()')[0]  # company name from the 4th table column
            i = i + 1
            addurl = href_url[39:93]  # slice the dataid out of the href (brittle: fixed character positions)
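            # A less position-dependent alternative (an unverified sketch that assumes
            # the href carries a "dataid=..." query string, so left commented out):
            # addurl = href_url.split('dataid=')[-1]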
            # These are query parameters, not HTTP headers (the original passed them via headers=).
            params = {
                "xkgk": "getxxgkContent",
                "dataid": addurl}
            url = "http://permit.mee.gov.cn/permitExt/xkgkAction!xkgk.action"
            html = requests.get(url, params=params, proxies=proxy_ip)
            soup = BeautifulSoup(html.text, 'lxml')
            name_id = soup.find_all('p', style="font-size:36px;")[0].text  # company name on the detail page
            name_add = soup.find_all('p', style="font-weight: bold;color: green;font-size: 14px;")[0].text  # address / category block
            content = name_add.strip()  # trim surrounding whitespace
            content = content.split()   # split on whitespace to drop the \xa0 / \r\n\t padding
            str2 = ''.join(content)     # re-join into one compact string
            u1, u2, u3, u4, u5 = str2.split(':', 4)  # split on the first four label colons into five fields
            f1 = u2.find('行业类别')  # position of the "industry category" label inside the second field
            f2 = u2[0:f1]  # text before that label (presumably the address)
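            # Rough shape of the parse (made-up values): if the second colon-separated
            # field is "XX市XX路1号行业类别", then f2 ends up as "XX市XX路1号".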