'''
根据输入的 地区编号、开始月日、多少个月、年份,查出许可证清单并保存
'''
import time, re, sys, random, requests, warnings, math, os, json, lxml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd
from PIL import Image
from hashlib import md5
from fake_useragent import UserAgent
from configparser import ConfigParser
warnings.filterwarnings('ignore', category=Warning) # 设置 不警告
class XukezhenDown(object):
    """Download construction-permit lists from data.gdcic.net.

    Drives a Selenium Chrome session to fill the search form (solving the
    captcha through the chaojiying.com recognition API), then scrapes each
    permit's detail page with requests + BeautifulSoup and saves the result
    with pandas.

    NOTE(review): several methods read module-level globals (``begin_date``,
    ``end_date``, ``ad``, ``header``, ``datas``, ``request_proxy``) defined
    in the ``__main__`` section of this file.
    """

    def __init__(self, username, password, soft_id, proxy):
        """username / password / soft_id: chaojiying API credentials.
        proxy: proxy URL handed to Chrome later by browse()."""
        # Filled in later by browse() / get_total().
        self.new_end_date = None
        self.total = None
        self.browser = None
        self.num_list = None
        self.username = username
        # The chaojiying API expects the MD5 hex digest of the password.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.proxy = proxy
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit captcha image bytes to chaojiying; return the parsed JSON.

        im: image bytes
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a mis-recognized captcha to chaojiying.

        im_id: picture ID of the wrongly answered captcha.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()

    def browse(self, proxy):
        """Launch Chrome behind *proxy* and open the permit-list page."""
        s = Service('c:/chromedriver/chromedriver.exe')
        chrome_options = Options()
        # BUG FIX: the switch was misspelled '--proxy-sever', so Chrome
        # silently ignored the proxy setting.
        chrome_options.add_argument('--proxy-server=' + proxy)
        # chrome_options.add_argument('--headless')
        # BUG FIX: Selenium 4 removed the 'chrome_options' kwarg; 'options'
        # is the supported spelling.
        self.browser = webdriver.Chrome(service=s, options=chrome_options)
        url_first = 'https://data.gdcic.net/Dop/Open/ConsPermitList.aspx'
        self.browser.get(url=url_first)

    def input_search(self, area_number, begin_date, end_date, m):
        """Fill dates/area, solve the captcha and submit the search form.

        m: screen scale factor used to map element coordinates onto
        screenshot pixels (e.g. 2 on a HiDPI display).
        """
        time.sleep(0.3)
        # Locate the captcha image on screen.
        code_img = self.browser.find_element(By.ID, 'CheckCodeImage')
        code_location = code_img.location
        code_size = code_img.size
        rangle = (
            int(m * code_location['x']), int(m * code_location['y']),
            int(m * code_location['x'] + m * code_size['width']),
            int(m * code_location['y'] + m * code_size['height']))
        # Screenshot the whole page, then crop out just the captcha.
        self.browser.save_screenshot('screen.png')
        Image.open('screen.png').crop(rangle).save('code.png')
        # BUG FIX: the file handle was opened without ever being closed.
        with open('code.png', 'rb') as f:
            im = f.read()
        # BUG FIX: was called through the module-level instance name
        # 'xukezhen_down'; use self so the method works on any instance.
        yanzhenma = self.PostPic(im, 1005)['pic_str']  # recognized captcha text
        time.sleep(2)
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtDtStart').clear()
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtDtStart').send_keys(begin_date)
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtDtEnd').clear()
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtDtEnd').send_keys(end_date)
        self.browser.find_element(
            By.XPATH,
            '//*[@id="ctl00_ContentPlaceHolder1_ddlCity"]/option[' + str(area_number) + ']').click()
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtCheckCode').clear()
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_txtCheckCode').send_keys(yanzhenma)
        self.browser.find_element(By.ID, 'ctl00_ContentPlaceHolder1_btnSearch').click()

    def get_total(self, area_number):
        """Read the result count into self.total and return it.

        Prints a warning when the count exceeds the site's 300-row cap.
        If the first page holds no data, total ends up 0.
        """
        print('From: ', begin_date, ' to: ', end_date, end=' ')
        time.sleep(0.2)
        first_td = self.browser.find_elements(By.TAG_NAME, 'tr')
        if len(first_td) == 1:
            # Header row only: nothing found for this date window.
            self.total = len(first_td) - 1
            print('--------This date no data,please change time--------')
        else:
            try:
                search_text = self.browser.find_element(
                    By.XPATH,
                    '//*[@id="ctl00_ContentPlaceHolder1_AspNetPager1"]/div[1]').text
                self.total = int(re.findall(r'\d+', search_text)[0])
            except Exception:
                # No pager element: a single page of results; count the rows.
                self.total = len(first_td) - 1
            if self.total > 300:
                print(' Find:', ad[area_number - 2], self.total, '个项目,正下载...300...个,还有:', self.total - 300,
                      '未下载,请保存完后改再下载前面时间部分!')
            else:
                print(' Find:', ad[area_number - 2], self.total, '个项目')
        return self.total

    def get_page_number(self, project_number_list):
        """Append [permit_no, province_project_no] for each row on this page."""
        td = self.browser.find_elements(By.TAG_NAME, 'tr')
        for i in td[1:]:  # skip the header row
            cells = i.text.split(' ')
            project_number_list.append([cells[0], cells[2]])

    def get_all_number(self, project_number_list):
        """Walk every result page (max 20, i.e. 300 rows) collecting numbers."""
        self.get_page_number(project_number_list)
        print(' Find pages : -1', end='')
        if self.total > 15:  # 15 rows per page
            for i in range(2, math.ceil(self.total / 15) + 1):
                if i >= 21:
                    # The site only exposes 20 pages of results.
                    break
                self.browser.execute_script(
                    "javascript:__doPostBack('ctl00$ContentPlaceHolder1$AspNetPager1',{})".format(str(i)))
                time.sleep(0.2 + random.random())
                self.get_page_number(project_number_list)
                print('-', i, end='')

    def down_data(self, project_number_list):
        """Fetch the detail page of every permit; return rows of field text.

        Each row is [project title, *h5 field values] in page order.
        """
        all_data = []
        print('\n', 'Downing ok :', end='')
        for idx, no in enumerate(project_number_list):
            url = ('https://data.gdcic.net/Dop/Open/ConsPermitInfo.aspx?PrjCode='
                   + no[1] + '&PermitNo=' + no[0])
            res = requests.request('get', url=url, headers=header, verify=False,
                                   data=datas, proxies=request_proxy)
            soup = BeautifulSoup(res.text, 'lxml')
            project_data = soup.find_all('h5')  # all field values of one project
            project_title = soup.find('div', class_='spec-title').text.strip()
            if project_title:  # FIX: truthiness instead of `> ''`
                data = [project_title]
                for h in project_data:
                    data.append(h.text.strip())
                all_data.append(data)
            time.sleep(0.2 + random.random())
            # FIX: O(1) enumerate index instead of O(n) list.index(), which
            # also repeated the same index for duplicate entries.
            if idx % 15 == 0:
                print('>', idx // 15 + 1, end='')
        return all_data

    def save_as(self, area_number, all_data):
        """Save all_data to ./xukezhen/<area><min>到<max>.xlsx.

        Side effect: rewinds the module-level date window by setting the
        globals begin_date / end_date to the oldest issue date saved.
        """
        col = ['项目名称', '许可证编号', '发证机关', '发证日期', '状态', '建设地址', '建设性质', '结构体系', '合同价格', '施工面积',
               '开工日期', '合同竣工日期', '建设规模', '建设单位', '项目负责人', '总承包单位', '承包项目经理', '勘察单位', '勘察负责人',
               '设计单位', '设计负责人', '施工单位', '项目负责人', '监理单位', '监理总工', '用地编号', '规划编号', '中标书编号',
               '审查合格书编号', '合同编号']
        df = pd.DataFrame(all_data, columns=col)
        if self.total > 300:
            # More rows existed than the 300-row cap: flag it in the filename.
            file_end_date = df['发证日期'].max() + '前面' + str(self.total - 300) + '个未下'
        else:
            file_end_date = df['发证日期'].max()
        global begin_date
        begin_date = df['发证日期'].min()
        if not os.path.isdir('xukezhen'):
            os.mkdir('xukezhen')
        df.to_excel('./xukezhen/' + ad[area_number - 2] + df['发证日期'].min() + '到' + file_end_date + '.xlsx')
        global end_date
        end_date = df['发证日期'].min()
        print('\n', ' Current saved , OK')
if __name__ == '__main__':
    begin_time = time.ctime()
    print('Begin time: ', begin_time)
    # Proxy / requests / Chrome configuration used by the class methods.
    proxies = [{'https': '113.196.85.74:3128'}]
    request_proxy = None
    header = {'User-Agent': UserAgent().random}
    datas = {'Host': 'data.gdcic.net',
             'Connection': 'close',
             'Referer': 'https://data.gdcic.net/Dop/Open/ConsPermitList.aspx'}
    # Area list: index (area_number - 2) maps the dropdown option to a city.
    ad = ['02广州', '03韶关', '04深圳', '05珠海', '06汕头', '07佛山', '08江门', '09湛江', '10茂名', '11肇庆',
          '12惠州', '13梅州', '14汕尾', '15河源', '16阳江', '17清远', '18东莞', '19中山', '20潮州', '21揭阳',
          '22云浮', '23顺德']
    # Read the query parameters from the config file.
    data_ini = '广东许可证.ini'
    cfg = ConfigParser()
    cfg.read(data_ini, encoding='utf-8')
    cfg_dict = dict(cfg.items('time_set'))
    user = cfg_dict.get('user')
    chao_pw = cfg_dict.get('chao_pw')
    soft_id = cfg_dict.get('soft_id')
    m = int(cfg_dict.get('m'))  # screen scaling factor
    area_num_list = json.loads(cfg_dict.get('area_number_list'))
    # Validate the configured dates before touching the browser.
    try:
        time.strptime(cfg_dict.get('begin_date'), '%Y-%m-%d')
        time.strptime(cfg_dict.get('end_date'), '%Y-%m-%d')
    except (TypeError, ValueError):
        print('日期格式错误,请修改后再运行!')
        sys.exit()
    # BUG FIX: scheme and host were concatenated without '://', producing
    # 'https113.196.85.74:3128' which Chrome cannot parse as a proxy URL.
    scheme = list(proxies[0].keys())[0]
    chrome_proxy = scheme + '://' + proxies[0][scheme]
    # Instantiate and open the browser, ready to search.
    # (chaojiying: user center >> software ID, e.g. 96001.)
    xukezhen_down = XukezhenDown(user, chao_pw, soft_id, chrome_proxy)
    xukezhen_down.browse(chrome_proxy)
    for area_number in area_num_list:
        begin_date = cfg_dict.get('begin_date')
        end_date = cfg_dict.get('end_date')
        # save_as() rewinds end_date after each batch; loop until the
        # window closes.
        while time.strptime(end_date, '%Y-%m-%d') > time.strptime(begin_date, '%Y-%m-%d'):
            xukezhen_down.input_search(area_number, begin_date, end_date, m)
            try:
                wait_time = 0.2
                while True:
                    # Raises when no alert is present, i.e. the captcha
                    # was accepted; otherwise retry with backoff.
                    xukezhen_down.browser.switch_to.alert.accept()
                    print('Code is not right, find again,wait time:', wait_time)
                    time.sleep(wait_time)
                    xukezhen_down.input_search(area_number, begin_date, end_date, m)
                    wait_time = wait_time * 2
                    if wait_time > 50:
                        print('Too long time, exit!')
                        xukezhen_down.browser.quit()
                        sys.exit()
            # BUG FIX: the bare `except:` also swallowed the SystemExit
            # raised by sys.exit() above, so the script never actually
            # terminated; `except Exception` lets SystemExit propagate.
            except Exception:
                print('** Code is right! Begin searching **')
            xukezhen_down.get_total(area_number)
            if xukezhen_down.total > 0:
                all_project_number = []
                xukezhen_down.get_all_number(all_project_number)
                xukezhen_down.save_as(area_number, xukezhen_down.down_data(all_project_number))
            else:
                # No data: close the window so the while loop terminates.
                begin_date = end_date
    try:
        xukezhen_down.browser.quit()
    except Exception:
        pass  # best effort: browser may already be gone
    print('Total time: ', begin_time, '------', time.ctime())
    print('************ All over! ************')
###############################################################
# Sample config file (广东许可证.ini):
[time_set]
user = f
chao_pw = h
soft_id = 908144
m = 1
begin_date = 2022-01-01
end_date = 2022-09-10
; 地区代号列表 — list of area codes to query
area_number_list = [6,11]
####################################################
# ad = ['02广州', '03韶关', '04深圳', '05珠海',
#       '06汕头', '07佛山', '08江门', '09湛江', '10茂名',
#       '11肇庆', '12惠州', '13梅州', '14汕尾', '15河源',
#       '16阳江', '17清远', '18东莞', '19中山', '20潮州',
#       '21揭阳', '22云浮', '23顺德']
# Merge all downloaded permit spreadsheets of one city into a single file.
# FIX: the pasted script used curly quotes ('…') and bare (uncommented)
# Chinese text lines, both of which are Python syntax errors.
import pandas as pd
import os, time

# City code: index (num - 2) into the ad list below.
num = 15
ad = ['02广州', '03韶关', '04深圳', '05珠海', '06汕头', '07佛山', '08江门', '09湛江', '10茂名', '11肇庆',
      '12惠州', '13梅州', '14汕尾', '15河源', '16阳江', '17清远', '18东莞', '19中山', '20潮州', '21揭阳',
      '22云浮', '23顺德']
adr = ad[num - 2][-2:]  # the two-character city name, e.g. '河源'
# Collect every downloaded file whose name carries this city's two characters
# (filenames look like '15河源<dates>.xlsx', so chars [2:4] are the city).
os.chdir('xukezhen')
adr_list = [f for f in os.listdir() if f[2:4] == adr]
if len(adr_list) > 1:
    # FIX: the per-file loop used `for ad in adr_list`, shadowing the ad
    # list above; also dropped the unused local and the discarded
    # df.duplicated() call (its result was never used).
    df_list = [pd.read_excel(name) for name in adr_list]
    df = pd.concat(df_list, ignore_index=True)
    df.drop(columns='Unnamed: 0', inplace=True)  # drop the saved index column
    df.drop_duplicates(inplace=True)             # remove duplicate rows
    df = df.dropna(subset=['项目名称'])           # remove rows with no project name
    df.sort_values('发证日期', inplace=True, ascending=False)  # newest first
    # Filename: <city><year><month><day>.xlsx built from time.ctime().
    df.to_excel(adr + time.ctime()[-4:] + time.ctime()[4:7] + time.ctime()[9:11] + '.xlsx')
    print('OK')
else:
    print('only one or 0 file, not to concat')
#################################################################