一、爬取网址
https://www.iau.org/public/themes/constellations/
二、代码
# Imports — grouped stdlib / third-party.
# `import urllib` alone does NOT guarantee the `urllib.request` submodule
# is loaded; import it explicitly since the download loops rely on it.
import os
import time
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# 网页地址
url = "https://www.iau.org/public/themes/constellations/"
# 打开浏览器
# driver = webdriver.Chrome(r"C:\Users\53224\_jupyter\chromedriver.exe")
# 打开网页
# driver.get(url)
# 请求网页
r = requests.get(url)
# 获取网页源代码
soup = BeautifulSoup(r.text)
img_a_list = soup.select('strong > a')
print(len(img_a_list))
for a in img_a_list:
print(a)
img_url_list = []
for a in img_a_list:
# print(a['href'])
print("https://www.iau.org"+a['href'])
img_url_list.append("https://www.iau.org"+a['href'])
# print(img_url_list)
print(len(img_url_list))
for img_url in img_url_list:
print(img_url)
headers = {"User-Agent": "Mozilla/5.0"}
BASE_PATH = r'./Constellation charts/'
for img_url in img_url_list:
img_name = img_url[img_url.rfind('/')+1:]
img_path = BASE_PATH + img_name
request = urllib.request.Request(url=img_url, headers=headers)
img = urllib.request.urlopen(request).read()
f = open(img_path, "wb")
f.write(img)
f.close()
print(img_name, 'done.')
time.sleep(2)
a_list = soup.select('tr > td > p > a')
boundary_a_list = []
for a in a_list:
if a.text=='TXT':
# print(a)
boundary_a_list.append(a)
print(len(boundary_a_list))
for a in boundary_a_list:
print(a)
boundary_url_list = []
for a in boundary_a_list:
# print("https://www.iau.org"+a['href'])
boundary_url_list.append("https://www.iau.org"+a['href'])
print(len(boundary_url_list))
for url in boundary_url_list:
print(url)
headers = {"User-Agent": "Mozilla/5.0"}
BASE_PATH = r'./Constellation boundaries/'
for txt_url in boundary_url_list:
txt_name = txt_url[txt_url.rfind('/')+1:]
txt_path = BASE_PATH + txt_name
request = urllib.request.Request(url=txt_url, headers=headers)
txt = urllib.request.urlopen(request).read()
f = open(txt_path, "wb")
f.write(txt)
f.close()
print(txt_name, 'done.')
time.sleep(2)
三、爬取结果