一、爬取网址
https://www.iau.org/public/themes/constellations/
二、代码
# Imports — grouped stdlib / third-party.
# `import urllib` alone does NOT guarantee the `urllib.request` submodule
# is loaded; import it explicitly since the download loops rely on it.
import os
import time
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# 网页地址
url = "https://www.iau.org/public/themes/constellations/"
# 打开浏览器
# driver = webdriver.Chrome(r"C:\Users\53224\_jupyter\chromedriver.exe")
# 打开网页
# driver.get(url)
# 请求网页
r = requests.get(url)
# 获取网页源代码
soup = BeautifulSoup(r.text)
img_a_list = soup.select('strong > a')
print(len(img_a_list))
for a in img_a_list:
print(a)
img_url_list = []
for a in img_a_list:
# print(a['href'])
print("https://www.iau.org"+a['href'])
img_url_list.append("https://www.iau.org"+a['href'])
# print(img_url_list)
print(len(img_url_list))
for img_url in img_url_list:
print(img_url)
headers = {"User-Agent": "Mozilla/5.0"}
BASE_PATH = r'./Constellation charts/'
for img_url in img_url_list:
img_name = img_url[img_url.rfind('/')+1:]
img_path = BASE_PATH + img_name
request = urllib.request.Request(url=img_url, headers=headers)
img = urllib.request.urlopen(request).read()
f = open(img_path, "wb")
f.write(img)
f.close()
print(img_name, 'done.')
time.sleep(2)
a_list = soup.select('tr > td > p > a')
boundary_a_list = []
for a in a_list:
if a.text=='TXT':
# print(a)
boundary_a_list.append(a)
print(len(boundary_a_list))
for a in boundary_a_list:
print(a)
boundary_url_list = []
for a in boundary_a_list:
# print("https://www.iau.org"+a['href'])
boundary_url_list.append("https://www.iau.org"+a['href'])
print(len(boundary_url_list))
for url in boundary_url_list:
print(url)
headers = {"User-Agent": "Mozilla/5.0"}
BASE_PATH = r'./Constellation boundaries/'
for txt_url in boundary_url_list:
txt_name = txt_url[txt_url.rfind('/')+1:]
txt_path = BASE_PATH + txt_name
request = urllib.request.Request(url=txt_url, headers=headers)
txt = urllib.request.urlopen(request).read()
f = open(txt_path, "wb")
f.write(txt)
f.close()
print(txt_name, 'done.')
time.sleep(2)
三、爬取结果