......上次帮同学写的“地理空间数据云的下载”报错了......
分析:因为点击过多会出现:not found 和 404 的错误。
所以:我又重新写了,来预防 not found 和 404 的问题,并且每次还可以检查当前页的文件是否全部下载完毕。
【没有写成面向对象,直接用的函数式写法,但是改成面向对象也很简单,这里就不赘述了,可以自己手动改哇~】
#下面为本实例的爬虫代码,若有问题可以给我留言,或者有更好的解决方法也可以私信我~
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import random
import requests
import tesserocr
from PIL import Image
"""
def get_captcha(): #验证码识别能力太差
image=Image.open('captcha.jpg')
image = image.convert('L')
threshold=110
table=[]
for i in range(256):
if i<threshold:
table.append(0)
else:
table.append(1)
image=image.point(table,'1')
res=tesserocr.image_to_text(image).strip()
# 替换列表
rep = {'0': 'O',
'1': 'I',
'2': 'Z',
'8': 'S',
'b':'G',
'\\':'J',
}
for r in rep:
res = res.replace(r, rep[r])
print(res)
return res
"""
def login(driver):
    """Fill in the Geospatial Data Cloud login form and return the driver.

    Opens the login page, types the account, password and a manually
    entered captcha into the form.  The form itself is submitted later,
    in go_to_demo30().

    NOTE(review): the account and password are hard-coded below; consider
    reading them from environment variables instead of committing them.
    """
    driver.get('https://www.gscloud.cn/accounts/login')
    driver.find_element_by_xpath('//*[@id="userid"]').send_keys('932791635@qq.com')
    driver.find_element_by_xpath('//*[@id="password"]').send_keys('932biewang1')
    captcha_box = driver.find_element_by_xpath('//*[@id="id_captcha_1"]')
    """
    #下载验证码图片
    img_url = driver.find_element_by_xpath('//*[@id="login-form"]/div[3]/div[1]/img').get_attribute('src')
    print(img_url)
    r = requests.get(img_url)
    with open('captcha.jpg', 'wb')as f:
    f.write(r.content)
    #解析验证码图片
    captcha_sj = get_captcha()
    """
    # Automated OCR (dead code above) proved too unreliable, so the captcha
    # is read off the screen and typed in by the user.
    captcha_text = input('请输入验证码:').strip()
    captcha_box.send_keys(captcha_text)
    return driver
def go_to_demo30(driver):
    """Submit the login form and navigate to the GDEM UTM dataset listing.

    Performs three clicks in order -- submit login, open the data-source
    entry, open the GDEM UTM list -- pausing 3 s after each so the page
    can finish loading.  Returns the driver positioned on the listing page.
    """
    click_sequence = (
        '//*[@id="login-form"]/input[3]',
        '/html/body/div[3]/div[3]/div[5]/a/h4',
        '//*[@id="dataset-listview"]/div/div/ul/li[4]/div/a[3]',
    )
    for xpath in click_sequence:
        driver.find_element_by_xpath(xpath).click()
        time.sleep(3)
    return driver
def download(driver, start_page):
    """Download every granule on listing page *start_page* that is not on disk.

    Jumps the pager to *start_page*, then walks table rows 3-12 (the data
    rows), and for each ``dataid`` whose ``<dataid>.zip`` does not already
    exist, opens the direct download URL.  Returns the driver, back on the
    listing page.

    Fixes vs. the previous revision:
    * ``driver = driver.get(...)`` clobbered the driver with ``None``
      (``WebDriver.get`` returns ``None``); the driver is never reassigned now.
    * after each download the code recursed with a hard-coded
      ``start_page=1`` (always restarting at page one, unbounded recursion);
      it now simply re-opens the same listing page and continues with the
      next row.
    * the "Not Found" retry loop ``break``-ed unconditionally after one
      attempt; it is now a real, bounded retry loop.
    """
    list_url = 'https://www.gscloud.cn/sources/list_dataset/421?cdataid=302&pdataid=10&datatype=gdem_utm2#dlv=Wzg4LFs0MCwxMCwyMjYxLDIyNjAzXSxbWyJkYXRhaWQiLDFdXSxbXSw5OV0%3D'
    not_found_page = '<html xmlns="http://www.w3.org/1999/xhtml"><head></head><body>Not Found</body></html>'

    def goto_page():
        # Type the page number into the pager box and submit it.
        page_sr = driver.find_element_by_xpath('//*[@id="pager1"]/div[2]/table/tbody/tr/td[7]/input')
        page_sr.clear()
        time.sleep(1)
        page_sr.send_keys(start_page)
        page_sr.send_keys(Keys.RETURN)
        time.sleep(3)

    goto_page()
    print('当前下载第{}页'.format(start_page))
    for tr_num in range(3, 13):  # only rows 3-12 of the table hold data entries
        row_xpath = '//*[@id="all_datasets_listview"]/div/table/tbody/tr[' + str(tr_num) + ']/td[9]/div/div/a[2]'
        dataid = driver.find_element_by_xpath(row_xpath).get_attribute('dataid')
        data_path = r'{}.zip'.format(dataid)
        # NOTE(review): this checks the current working directory -- assumes
        # the browser downloads into the same directory; confirm Chrome's
        # download folder matches.
        if os.path.exists(data_path):
            print('{}数据已经存在'.format(dataid))
            continue
        click_url = 'https://www.gscloud.cn/sources/download/421/{}/bj'.format(dataid)
        driver.get(click_url)
        time.sleep(5)
        retries = 0
        # The server intermittently answers "Not Found"; back off and retry
        # the same URL, giving up after 10 attempts.
        while driver.page_source == not_found_page and retries < 10:
            retries += 1
            time.sleep(random.randint(1, 5))
            driver.get(click_url)
            print('下载链接:{},名称:{},下载地址{}'.format(click_url, dataid, data_path))
            time.sleep(random.randint(20, 30))
        print('{}数据下载成功'.format(dataid))
        time.sleep(random.randint(20, 30))  # give each download ~20-30 s to finish
        # Opening the download URL leaves the listing; go back to the very
        # same page before reading the next row.
        driver.get(list_url)
        time.sleep(3)
        goto_page()
    return driver
def download_info(driver):
    """Walk every listing page and download its missing files.

    Fix vs. the previous revision: on an exception the loop ``continue``-d
    without advancing ``start_page``, so a persistently failing page spun
    forever.  Each page is now retried a bounded number of times before
    being skipped.
    """
    start_page = 1   # if the program was interrupted, resume from this page
    page_num = 800   # total number of listing pages to process
    max_retries = 5  # attempts per page before giving up on it
    failures = 0
    while start_page <= page_num:
        try:
            driver = download(driver, start_page)  # download this page's data
        except Exception as e:
            print(e)
            failures += 1
            if failures < max_retries:
                continue  # retry the same page
            print('第{}页重试次数过多,跳过'.format(start_page))
        failures = 0
        start_page += 1
def main():
    """Launch Chrome, log in, open the dataset listing, and download everything."""
    # Raw string avoids accidental backslash escapes in the Windows path.
    driver = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Anaconda3\Scripts\chromedriver.exe')
    driver = login(driver)
    driver = go_to_demo30(driver)
    download_info(driver)


if __name__ == '__main__':
    # Guard so importing this module does not launch the crawler.  This
    # version avoids the Not Found / 404 problem and re-checks each page
    # for files that were missed.
    main()
---------(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)(。・ω・。)----------
今日爬虫完成!
今日鸡汤:每一份坚持都是成功的累积,相信自己,总会遇到惊喜,肯定自己,不要轻言放弃;每一个清晨都是希望的开始,鼓励自己,展现自信,越努力越幸运。
加油ヾ(◍°∇°◍)ノ゙