from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time
from urllib import request, parse
import json
# Location of the ChromeDriver executable on this machine.
driver_path='E://python//chromedriver.exe'
# The Chrome Service class takes the driver location as `executable_path`.
# The previous `driver_path=` keyword is not a Service parameter, so the
# configured path was never actually handed to Selenium.
service = Service(executable_path=driver_path)
# Start Chrome through that service (no ChromeOptions are applied here).
driver = webdriver.Chrome(service=service)
# Open the target page; the later XHR calls are executed inside this page.
driver.get('https://XXX.html')
# While debugging, driver.page_source can be printed here to inspect the
# rendered HTML.
# Fixed pause after navigation before any request is issued -- presumably to
# let the page finish loading (TODO confirm the required delay).
time.sleep(10)
# Number of records requested per page by get_data().
pageSize = 500

# Years to query. Each year is substituted into a search string of the form
# "(京)JZ安许证字[2024]".
search_time = list(range(2005, 2025))
print("--------------------search_time:")
print(','.join(str(year) for year in search_time))

# Province abbreviation paired with the six-digit province code sent in the
# request. Keeping the pairs side by side makes the index alignment between
# search_name and search_province explicit.
_PROVINCE_TABLE = (
    ('京', '110000'), ('津', '120000'), ('冀', '130000'), ('晋', '140000'),
    ('蒙', '150000'), ('辽', '210000'), ('吉', '220000'), ('黑', '230000'),
    ('沪', '310000'), ('苏', '320000'), ('浙', '330000'), ('皖', '340000'),
    ('闽', '350000'), ('赣', '360000'), ('鲁', '370000'), ('豫', '410000'),
    ('鄂', '420000'), ('湘', '430000'), ('粤', '440000'), ('桂', '450000'),
    ('琼', '460000'), ('渝', '500000'), ('川', '510000'), ('黔', '520000'),
    ('云', '530000'), ('藏', '540000'), ('陕', '610000'), ('甘', '620000'),
    ('青', '630000'), ('宁', '640000'), ('新', '650000'), ('兵团', '690000'),
)

search_name = [abbreviation for abbreviation, _ in _PROVINCE_TABLE]
print("--------------------search_name:")
print(','.join(search_name))

search_province = [code for _, code in _PROVINCE_TABLE]
def _build_post_script(endpoint, page_index, page_size, time, name, province):
    """Build the JavaScript that queries the site and relays the response.

    The script issues a form-encoded POST to the target URL from inside the
    browser page. When that request completes with status 200 it logs the
    response text, parses it as JSON, adds ``name`` and ``time`` fields, and
    POSTs the result to ``http://127.0.0.1:8008/<endpoint>``.

    Args:
        endpoint: Path segment on the local service ("get_count"/"get_data").
        page_index: Value sent as ``pageIndex`` (converted with ``str``).
        page_size: Value sent as ``pageSize`` (converted with ``str``).
        time: Year placed inside the square brackets of the search string.
        name: Province abbreviation placed inside the parentheses.
        province: Province code placed in the ``totalSearch`` filter.

    Returns:
        The JavaScript source as a single string.
    """
    year = str(time)
    # totalSearch is itself a JSON document carried as a string field.
    total_search = "".join([
        '{"qymc":"","tyshxydm":"","scxkzsbh":"",',
        '"province":"', province, '",',
        '"city":"","district":"","sfzk":"展开","type":""}',
    ])
    form_data = {
        "pageIndex": str(page_index),
        "pageSize": str(page_size),
        "search": "(" + name + ")JZ安许证字[" + year + "]",
        "totalSearch": total_search,
        "searchtype": "fwmh",
    }
    # NOTE(review): urlencode() converts the nested dict with str(), so the
    # `params` field carries a Python dict repr rather than strict JSON. The
    # wire format is kept exactly as before; confirm what the server expects
    # before switching to json.dumps(form_data).
    data = parse.urlencode({'params': form_data})
    # Target URL, already wrapped in the double quotes the JS literal needs.
    url = ' "https://XXX" '
    # Runs inside the completion handler: tag the response and relay it.
    relay_js = "".join([
        'var xhr1 = new XMLHttpRequest(); ',
        'xhr1.open("POST", "http://127.0.0.1:8008/', endpoint, '", true);',
        'var jdata=JSON.parse(xhr.responseText);',
        'jdata["name"]="', name, '";',
        'jdata["time"]="', year, '"; ',
        'xhr1.send(JSON.stringify(jdata));',
    ])
    return "".join([
        'var xhr = new XMLHttpRequest(); ',
        'xhr.open("POST", ', url, ', true); ',
        'xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded"); ',
        'xhr.onreadystatechange=function(){ ',
        'if(xhr.readyState==4&&xhr.status==200) {console.log(xhr.responseText);',
        relay_js,
        '} }; ',
        'xhr.send("', data, '");',
    ])


def get_count(time, name, province):
    """Request the first page (10 rows) for one province and year.

    The response is relayed by the browser to the local ``/get_count``
    endpoint. Both XHR calls are opened asynchronously, so the "ok" line
    printed here only means the script was handed to the browser, not that
    the response has arrived.

    Args:
        time: Year to search for. The name shadows the ``time`` module inside
            this function; it is kept so existing callers keep working.
        name: Province abbreviation used in the search string.
        province: Province code used in the ``totalSearch`` filter.
    """
    driver.execute_script(
        _build_post_script("get_count", 0, 10, time, name, province)
    )
    print("get_count ok-------------"+ str(time)+"-"+name+"-"+province)


def get_data(i, time, name, province):
    """Request page ``i`` (``pageSize`` rows) for one province and year.

    The response is relayed by the browser to the local ``/get_data``
    endpoint. As with get_count(), the printed "ok" line does not wait for
    the asynchronous requests to finish.

    Args:
        i: Page index sent as ``pageIndex``.
        time: Year to search for. The name shadows the ``time`` module inside
            this function; it is kept so existing callers keep working.
        name: Province abbreviation used in the search string.
        province: Province code used in the ``totalSearch`` filter.
    """
    driver.execute_script(
        _build_post_script("get_data", i, pageSize, time, name, province)
    )
    print("get_data ok-------------"+ str(i)+"-"+str(time)+"-"+name+"-"+province)
# Province selection: an index into search_name / search_province.
# The upper bound is derived from the list length so the prompt cannot drift
# out of sync with the data (the old prompt advertised an index one past the
# end of the list).
max_index = len(search_name) - 1
try:
    while True:
        user_input = input("请输入省会名索引[0-" + str(max_index) + "]:")
        try:
            i = int(user_input)
        except ValueError:
            i = -1  # not a number: force the range check below to reject it
        if 0 <= i <= max_index:
            break
        print("索引无效,请输入 0 到 " + str(max_index) + " 之间的整数")

    # Phase 1: ask for the record count of every year for this province.
    for s_time in search_time:
        get_count(s_time, search_name[i], search_province[i])
        time.sleep(10)

    # Load the per-year counts, keyed "<province>-<year>".
    # NOTE(review): output.json is presumably written by the local service
    # that receives the /get_count posts -- confirm. It is opened with the
    # platform default encoding, as before.
    with open('output.json', 'r') as file:
        existing_data = json.load(file)

    # Phase 2: page through the records of every year.
    for s_time in search_time:
        name = search_name[i] + '-' + str(s_time)
        if name not in existing_data:
            # The count request for this year did not produce an entry; skip
            # it with a visible message instead of aborting the whole run.
            print("get_data skip-------------" + name + " (no count in output.json)")
            continue
        j = existing_data[name]
        # Ceiling division: an exact multiple of pageSize no longer triggers
        # an extra empty page, and a count of 0 triggers no request at all.
        page_count = (int(j) + pageSize - 1) // pageSize
        for k in range(page_count):
            get_data(k, s_time, search_name[i], search_province[i])
            time.sleep(20)

    # Alternative run modes:
    #  * To pull only part of the year range, iterate a slice in phase 2,
    #    e.g. `for s_time in search_time[8:20]:`.
    #  * To pull one province/year only, set i and a single year (e.g. i=1,
    #    year 2016), call get_count() once, wait, reload output.json, and
    #    page through get_data() for that year alone.
finally:
    # Always close the browser window and stop the WebDriver, even when a
    # step above raises.
    driver.quit()
# Residue from the web article this script was copied from (not code):
# python通过chrome插件抓取网站数据
# 最新推荐文章于 2024-12-02 22:36:38 发布
# 1172
#
# 被折叠的 条评论
# 为什么被折叠?



