Python 通过 Selenium(ChromeDriver)驱动 Chrome 抓取网站数据

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

import time
from urllib import request, parse
import json

# Path to the ChromeDriver executable.
driver_path='E://python//chromedriver.exe'

# Selenium's Service takes the driver location as `executable_path`.
# The previous `driver_path=` keyword is not a Service parameter, so the
# configured path was never applied (or raised TypeError, depending on the
# Selenium release).
service = Service(executable_path=driver_path)
# Start Chrome through that service (no extra ChromeOptions are set).
driver = webdriver.Chrome(service=service)

# Open the target page; the search requests below are issued from inside
# this page via injected JavaScript.
driver.get('https://XXX.html')
# Debugging hint: print(driver.page_source) dumps the loaded HTML.
# Fixed pause after navigation before any script is injected.
time.sleep(10)


# Number of records requested per page by get_data().
pageSize = 500

# Years substituted into the search string "(京)JZ安许证字[2024]".
search_time = list(range(2005, 2025))
print("--------------------search_time:")
print(','.join(str(year) for year in search_time))

# One row per region: (abbreviation used in the search string, region code
# sent as the "province" filter). Keeping both in one table guarantees that
# search_name[n] and search_province[n] always describe the same region.
_REGIONS = (
    ('京', '110000'), ('津', '120000'), ('冀', '130000'), ('晋', '140000'),
    ('蒙', '150000'), ('辽', '210000'), ('吉', '220000'), ('黑', '230000'),
    ('沪', '310000'), ('苏', '320000'), ('浙', '330000'), ('皖', '340000'),
    ('闽', '350000'), ('赣', '360000'), ('鲁', '370000'), ('豫', '410000'),
    ('鄂', '420000'), ('湘', '430000'), ('粤', '440000'), ('桂', '450000'),
    ('琼', '460000'), ('渝', '500000'), ('川', '510000'), ('黔', '520000'),
    ('云', '530000'), ('藏', '540000'), ('陕', '610000'), ('甘', '620000'),
    ('青', '630000'), ('宁', '640000'), ('新', '650000'), ('兵团', '690000'),
)

search_name = [abbreviation for abbreviation, _ in _REGIONS]
print("--------------------search_name:")
print(','.join(search_name))

search_province = [code for _, code in _REGIONS]

# Search endpoint of the target site (placeholder host, as in the original).
_SEARCH_URL = 'https://XXX'
# Local service that receives a copy of every search response.
_RELAY_BASE_URL = 'http://127.0.0.1:8008'

# Script executed inside the page opened by Selenium. All variable parts are
# passed in through `arguments` rather than spliced into the source text, so
# no value can break out of a JavaScript string literal. They are copied into
# locals first because `arguments` means something else inside the callback.
_POST_AND_RELAY_JS = """
var searchUrl = arguments[0], body = arguments[1], relayUrl = arguments[2],
    name = arguments[3], year = arguments[4];
var xhr = new XMLHttpRequest();
xhr.open("POST", searchUrl, true);
xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
xhr.onreadystatechange = function () {
    if (xhr.readyState == 4 && xhr.status == 200) {
        console.log(xhr.responseText);
        var xhr1 = new XMLHttpRequest();
        xhr1.open("POST", relayUrl, true);
        var jdata = JSON.parse(xhr.responseText);
        jdata["name"] = name;
        jdata["time"] = year;
        xhr1.send(JSON.stringify(jdata));
    }
};
xhr.send(body);
"""


def _post_search(relay_endpoint, page_index, page_size, year, name, province):
    """Issue one search POST from the browser and relay its response."""
    # Nested filter object, sent as a compact JSON string inside the form.
    total_search = json.dumps(
        {
            "qymc": "",
            "tyshxydm": "",
            "scxkzsbh": "",
            "province": province,
            "city": "",
            "district": "",
            "sfzk": "展开",
            "type": "",
        },
        ensure_ascii=False,
        separators=(',', ':'),
    )
    form_data = {
        "pageIndex": str(page_index),
        "pageSize": str(page_size),
        "search": "(" + name + ")JZ安许证字[" + str(year) + "]",
        "totalSearch": total_search,
        "searchtype": "fwmh",
    }
    # Serialize with json.dumps: urlencode() would otherwise str() the dict
    # and send a single-quoted Python repr, which is not valid JSON.
    body = parse.urlencode({'params': json.dumps(form_data, ensure_ascii=False)})
    driver.execute_script(
        _POST_AND_RELAY_JS,
        _SEARCH_URL,
        body,
        _RELAY_BASE_URL + '/' + relay_endpoint,
        name,
        str(year),
    )


def get_count(time: int, name: str, province: str) -> None:
    """Request the first result page for one region/year and relay it.

    Posts a search (pageIndex 0, pageSize 10) for "(<name>)JZ安许证字[<time>]"
    filtered by ``province`` from inside the browser page. On an HTTP 200
    response the script adds "name" and "time" to the parsed JSON and posts
    it to the local ``/get_count`` endpoint.

    The request is asynchronous: this function returns as soon as the script
    has been injected, so the "ok" line below does not prove the response
    has arrived. Callers are expected to wait before reading results.
    NOTE(review): presumably the relay stores the total for this region/year
    (the main script later reads ``output.json``) — confirm against the relay.

    Args:
        time: Year placed in the search string (note: shadows the ``time``
            module inside this function; kept for interface compatibility).
        name: Region abbreviation placed in the search string.
        province: Region code sent as the "province" filter.
    """
    _post_search('get_count', 0, 10, time, name, province)

    # Progress line; only confirms the request was submitted.
    print("get_count ok-------------"+ str(time)+"-"+name+"-"+province)


def get_data(i: int, time: int, name: str, province: str) -> None:
    """Request result page ``i`` for one region/year and relay it.

    Same request as get_count(), but with pageIndex ``i`` and the module-level
    ``pageSize``, and the response is relayed to the local ``/get_data``
    endpoint. Asynchronous in the same way: returning does not mean the
    response has been received.

    Args:
        i: Zero-based page index.
        time: Year placed in the search string.
        name: Region abbreviation placed in the search string.
        province: Region code sent as the "province" filter.
    """
    _post_search('get_data', i, pageSize, time, name, province)

    # Progress line; only confirms the request was submitted.
    print("get_data ok-------------"+ str(i)+"-"+str(time)+"-"+name+"-"+province)


# Index of the region to crawl. It selects the matching entries of
# search_name and search_province, so it must be a valid index into them.
# The prompt range is derived from the list length (the old hard-coded
# "[0-32]" was one past the last valid index).
max_index = len(search_name) - 1
while True:
    user_input = input("请输入省会名索引[0-" + str(max_index) + "]:")
    try:
        i = int(user_input)
    except ValueError:
        # Re-prompt instead of crashing with the browser still open.
        print("输入无效,请输入整数")
        continue
    # Reject negatives too: they would silently index from the end.
    if 0 <= i <= max_index:
        break
    print("索引超出范围,请输入 0 到 " + str(max_index) + " 之间的整数")


# Phase 1: ask for the record count of the chosen region for every year.
# get_count() only submits an asynchronous request, so pause after each
# call to give the response time to be relayed.
for s_time in search_time:
    get_count(s_time,search_name[i],search_province[i])
    time.sleep(10)


# Load the counts collected so far, keyed as "<region>-<year>".
# NOTE(review): the file is opened with the platform default encoding, as
# before; if the writer emits raw UTF-8, pass encoding='utf-8' here — confirm.
with open('output.json', 'r') as file:
    existing_data = json.load(file)


# Phase 2: fetch every data page for each year of the chosen region.
for s_time in search_time:
    name = search_name[i] + '-' + str(s_time)
    if name not in existing_data:
        # The count request for this year never produced an entry; report it
        # and carry on with the remaining years instead of aborting.
        print("no count found for " + name + ", skipping")
        continue
    j = existing_data[name]
    # Ceiling division: the previous int(j/pageSize+1) asked for one extra,
    # empty page whenever the count was an exact multiple of pageSize
    # (including a count of 0), costing a needless request and 20 s sleep.
    page_count = (int(j) + pageSize - 1) // pageSize
    for k in range(page_count):
        get_data(k,s_time,search_name[i],search_province[i])
        time.sleep(20)


# Disabled alternative, kept as an inert string literal (never executed):
# fetch the data pages only for search_time[8] .. search_time[19]
# (2013-2024 with the list defined above) instead of every year.
# Adjust the range() bounds to pick a different span of years.
# NOTE: int(j/pageSize+1) requests one extra page when j is an exact
# multiple of pageSize.
'''

#省的索引不同时间的数据个数[0,20]  可以修改[2,20] 2007---2024年
for ii in range(8,20):
    name =search_name[i]+'-'+str(search_time[ii])
    j=existing_data[name]
    for k in range(int(j/pageSize+1)):
        get_data(k,search_time[ii],search_name[i],search_province[i])
        time.sleep(20) 

'''



# Disabled alternative, kept as an inert string literal (never executed):
# crawl a single region/year pair. It requests the count for region index
# i and year ii, waits, reloads output.json, then fetches every data page
# for that one "<region>-<year>" key.
'''
#查询第i位置的省份 ii位置年份的数据个数
i=1 
ii=2016
get_count(ii,search_name[i],search_province[i])
time.sleep(10)        

# 打开JSON文件读取现有数据
with open('output.json', 'r') as file:
    existing_data = json.load(file)

#获取第i位置的省份 ii位置年份的数据个数
name =search_name[i]+'-'+str(ii)
j=existing_data[name]
for k in range(int(j/pageSize+1)):
    get_data(k,ii,search_name[i],search_province[i])
    time.sleep(20)  
'''



        
# Cleanup: close the browser window and shut down the WebDriver session.
driver.quit()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值