import os
import urllib.parse
import numpy as np
from utils import Url_manager
import requests
from bs4 import BeautifulSoup
import re
import requests
import io
from astroquery.skyview import SkyView
from astropy.io import fits
import astropy.units as u
import matplotlib.pyplot as plt
import astropy.io.fits as fits
from astropy.coordinates import SkyCoord
from astropy.coordinates import Angle
from astropy.units import deg
import time
from astroquery.skyview import SkyView
SkyView.ROW_LIMIT = -1 # make sure there is no row-count limit
# NOTE(review): astroquery's SkyView may not define/honor ROW_LIMIT — confirm
# this attribute actually has an effect for SkyView queries.
# Global request timeout setting.
global_timeout = 10 # unit: seconds
root_url = "https://"
# NOTE(review): root_url looks like a redacted placeholder — the host portion
# appears removed; the URL regexes below match "https://20xx/..." shapes.
# Globally configure the requests library's retry/timeout behavior.
# NOTE(review): requests never reads DEFAULT_TIMEOUT, and DEFAULT_RETRIES is
# captured as a default argument at import time — both assignments likely have
# no effect; pass timeout= (and a retry adapter) per request instead. TODO confirm.
requests.adapters.DEFAULT_RETRIES = 3
requests.adapters.DEFAULT_TIMEOUT = global_timeout
# Seed the crawl frontier (project-local URL manager) with the root URL.
urls = Url_manager.UrlManager()
urls.add_new_url(root_url)
# Write results to a txt notepad file (currently disabled).
#fout = open("OK.txt", "w")
#current_fits_file =None
# Main crawl loop: pull URLs from the manager, scan each page's <a> links,
# read RA/Dec from any linked FITS file's header, fetch the matching DSS
# survey cutout, and save it as FITS (E:\lt240302_fits) + PNG (E:\lt240302_png).
while urls.has_new_url():
    curr_url = urls.get_url()
    # Throttle: pause between consecutive requests so we don't hammer the server.
    time.sleep(2)
    try:
        # BUG FIX: pass timeout= explicitly — requests never reads
        # requests.adapters.DEFAULT_TIMEOUT, so without this the
        # except requests.exceptions.Timeout handler below could never fire.
        r = requests.get(curr_url, timeout=global_timeout)
        if r.status_code != 200:
            print("error,return status_code is not 200", curr_url)
            continue
        soup = BeautifulSoup(r.text, "lxml")
        links = soup.find_all("a")
        current_link_index = -1
        for link_index, link in enumerate(links):
            href = link.get("href")
            # Skip anchors that carry no href attribute.
            if href is None:
                continue
            newnew_url = curr_url + href
            newnewnew_url = str(newnew_url)
            # Directory-style URL: https://YYYY/MM/YYYYMMDD/<target>/
            pattern2 = r'https://20\d{2}/\d{2}/\d{8}/[a-zA-Z0-9]+/{1}$'
            if re.match(pattern2, newnew_url):
                if len(newnewnew_url) > 55:
                    newnewnew_url_ = newnewnew_url[8:]  # strip the "https://" scheme
                    url_ = newnewnew_url_.split("/")[6]
                    final_observe_path = os.path.join("E:\\lt240302_png", url_)
                    # Target folder already produced by an earlier crawl — skip it.
                    if os.path.exists(final_observe_path):
                        print(final_observe_path + "母爬取中已存在,跳过此次处理.")
                        continue
            # File-style URL ending in "...s" (matches .fts/.fits names).
            pattern1 = r'https://20\d{2}/\d{2}/\d{8}/[a-zA-Z0-9]+/.+s$'
            if re.match(pattern1, newnew_url):
                response = requests.get(newnew_url, timeout=global_timeout)
                if response.status_code == 200:
                    with io.BytesIO(response.content) as fileobj:
                        # BUG FIX: open the HDU list in a with-block so the
                        # handle is closed on every exit path (was leaked).
                        with fits.open(fileobj, ignore_missing_simple=True) as current_fits_file:
                            header = current_fits_file[0].header
                            # Derive the target folder name from the URL path.
                            url_ = newnew_url[29:]
                            url__ = url_[1:]
                            final_fianl_final_final_observe1 = url__.split('/')[5]
                            final_observe_path = os.path.join("E:\\lt240302_png", final_fianl_final_final_observe1)
                            if os.path.exists(final_observe_path):
                                print(final_observe_path + "子爬取中已存在,跳过此次处理.")
                                break  # target folder exists — stop processing this page's links
                            else:
                                # BUG FIX: workpath was first bound inside the first
                                # except-branch but used in the second one too —
                                # hoist it so neither handler hits an unbound name.
                                workpath = "E:\\lt240302_png"
                                # RA: prefer OBJCTRA, fall back to RA_OBJ.
                                try:
                                    ra_str = header['OBJCTRA'] if 'OBJCTRA' in header else header['RA_OBJ']
                                except KeyError:
                                    print("这个目标的赤经赤纬找不到")
                                    final_observe_path = os.path.join(workpath, final_fianl_final_final_observe1)
                                    os.makedirs(final_observe_path, exist_ok=True)
                                    print(final_fianl_final_final_observe1 + "已经创建空的文件夹")
                                    continue
                                # Dec: prefer OBJCTDEC, fall back to DEC_OBJ.
                                try:
                                    dec_str = header['OBJCTDEC'] if 'OBJCTDEC' in header else header['DEC_OBJ']
                                except KeyError:
                                    print("这个目标的赤经赤纬找不到")
                                    final_observe_path = os.path.join(workpath, final_fianl_final_final_observe1)
                                    os.makedirs(final_observe_path, exist_ok=True)
                                    print(final_fianl_final_final_observe1 + "已经创建空的文件夹")
                                    continue
                                # Convert the sexagesimal strings to decimal degrees.
                                # BUG FIX: SkyCoord raises ValueError (not only KeyError)
                                # on an unparsable string, and the handler must skip the
                                # link or the prints below would hit unbound names.
                                try:
                                    c = SkyCoord(ra=ra_str, dec=dec_str, unit=(u.hourangle, u.deg))
                                    ra_degrees = c.ra.degree
                                    dec_degrees = c.dec.degree
                                except (KeyError, ValueError):
                                    print("无法在FITS头信息中找到赤经或赤纬数据")
                                    continue
                                # Report the coordinates in decimal degrees.
                                print(f"赤经: {ra_degrees:.6f}°")
                                print(f"赤纬: {dec_degrees:.6f}°")
                                ra = Angle(ra_degrees, unit=deg)
                                dec = Angle(dec_degrees, unit=deg)
                                url_ = newnew_url[29:]
                                url__ = url_[1:]
                                final_fianl_final_final_observe = url__.split('/')[5]
                                print("url_:" + url_)
                                print("url__:" + url__)
                                print("final_fianl_final_final_observe:" + final_fianl_final_final_observe)
                                # Build the output file names from the URL's last segment.
                                observe_part = newnew_url.split('/')[-1]
                                print("observe_part:" + observe_part)
                                observe_part_unqoute = urllib.parse.unquote(observe_part)
                                fits_observe_part_unqoute = observe_part_unqoute.replace(".fts", ".fits")
                                print("observe_part_unqoute" + observe_part_unqoute)
                                final_observe_decoded = observe_part_unqoute.split('_')[0]
                                print("final_observe_decoded" + final_observe_decoded)
                                final_observe_path = os.path.join(workpath, final_fianl_final_final_observe)
                                os.makedirs(final_observe_path, exist_ok=True)
                                local_dss_save_path = os.path.join(final_observe_path, fits_observe_part_unqoute)
                                png_save_path = local_dss_save_path.replace(".fits", ".png")
                                # Fetch the DSS cutout around the target with
                                # retries and linear backoff.
                                target_coord = SkyCoord(ra=ra, dec=dec, frame='icrs')
                                workpath1 = "E:\\lt240302_fits"
                                max_retries = 3
                                some_backoff_factor = 2  # initial backoff of 2 seconds
                                # BUG FIX: bind DSS up front — if every retry fails,
                                # the len(DSS) check below was a NameError.
                                DSS = None
                                for retry in range(max_retries + 1):
                                    try:
                                        DSS = SkyView.get_images(position=target_coord, survey='DSS',
                                                                 radius=15 * u.arcmin)
                                        break
                                    except Exception:
                                        if retry == max_retries:
                                            print("尝试获取DSS图像多次失败,超时退出。")
                                            break
                                        timeme = (retry + 1) * some_backoff_factor
                                        print("获取DSS图像超时,将在" + str(timeme) + "秒后重试...")
                                        time.sleep(timeme)
                                if DSS is not None and len(DSS) > 0:
                                    # Save the raw DSS FITS, then re-open it and
                                    # render a grayscale PNG into the target folder.
                                    DSS[0].writeto(workpath1 + '\\' + fits_observe_part_unqoute, overwrite=True)
                                    DSS = fits.open(workpath1 + '\\' + fits_observe_part_unqoute)
                                    plt.imsave(png_save_path, DSS[0].data, cmap='gray', origin='lower')
                                    print("success:%s\n" % url_)
                                    DSS.close()
                                    print("succes1")
                                else:
                                    # BUG FIX: message was missing its f-prefix, so
                                    # "{newnew_url}" printed literally.
                                    print(f"No DSS data found for the given coordinates at {newnew_url}. Skipping to next URL.")
                                    continue
            # Frontier management: queue directory-like links; for .fts links,
            # skip targets whose output folder already exists.
            pattern3 = r'20\d+/|\d{2}/|\d{8}/|[a-zA-Z0-9]+/|.+s$'
            pattern4 = r'fts$'
            if re.match(pattern3, href):
                url = curr_url + href
                if re.match(pattern4, href):
                    print("curr_url:" + curr_url)
                    s = str(curr_url)
                    s1 = s[8:]  # strip "https://"
                    print("s1:" + s1)
                    # BUG FIX: list indexing raises IndexError, not KeyError —
                    # the original except clause could never catch the failure.
                    try:
                        s2 = s1.split('/')[6]
                    except IndexError:
                        continue
                    print("url:" + url)
                    print("href+" + href)
                    str_href = str(href)
                    href1 = str_href.split('/')[0]
                    print("href1:" + href1)
                    href2 = href1[:-4]  # drop the ".fts" extension
                    print("href2:" + href2)
                    href3 = href2 + ".fits"
                    print("href3:" + href3)
                    final_observe_path = os.path.join("E:\\lt240302_png", s2)
                    print("final_observe_path:" + final_observe_path)
                    if os.path.exists(final_observe_path):
                        print(final_observe_path + "子爬取中已经爬取,就不继续往下爬了")
                        # This target is done: enqueue the link immediately after
                        # the current one (if any) so the crawl resumes past it.
                        if link_index < len(links) - 1:
                            next_link = links[link_index + 1]
                            next_href = next_link.get("href")
                            print("curr_url" + curr_url)
                            print("next_href:" + next_href)
                            next_url = urllib.parse.urljoin(curr_url, next_href)
                            urls.add_new_url(next_url)
                            print("现在放" + next_url + "进入管理器")
                            # Remember which link we stopped at, then leave this page.
                            current_link_index = link_index
                            break
                        # Already crawled and no following link — don't re-enqueue.
                        continue
                urls.add_new_url(url)
    except requests.exceptions.Timeout:
        print(f"请求超时: {curr_url}")
        continue
# 最终代码呈现 (stray pasted text, "final code presentation" — commented out:
# as a bare identifier it would raise NameError once the crawl loop finished)