# v3.0: code cleanup/optimization
# The previous two versions downloaded the small thumbnail images from the index page
# This version downloads the full-resolution images
import requests
from bs4 import BeautifulSoup
import re
def getHTMLText(url):
    """Fetch *url* and return its decoded text, or '' on any request failure.

    Uses apparent_encoding so Chinese pages decode correctly regardless of
    the (often missing/wrong) charset header.
    """
    try:
        # Timeout prevents the scraper from hanging forever on a dead host.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only swallow network/HTTP errors — a bare `except:` would also hide
        # KeyboardInterrupt and programming bugs.
        return ''
def getList(list, url):  # NOTE: param name kept for caller compatibility; it shadows the builtin
    """Populate *list* (mutated in place) with [img_url, save_path] pairs.

    Scrapes the photo-of-the-day index at *url* for detail-page links, then
    fetches each detail page ONCE and extracts both the full-size image URL
    and the metadata used to build the local file name.
    """
    html = getHTMLText(url)
    # Non-greedy groups so one <a> per match even if several share a line.
    pic_url = re.findall(
        r'<a href="/photography/photo_of_the_day/(.*?)" title="每日一图:.*?">', html)
    for rel in pic_url:
        page_url = 'http://www.nationalgeographic.com.cn/photography/photo_of_the_day/' + rel
        # Single fetch per detail page (v3.0 previously fetched each page twice:
        # once via getPic and once more for the title/time metadata).
        bsObj = BeautifulSoup(getHTMLText(page_url), 'html.parser')
        # The hero image is identified by CJK characters in its alt text.
        img_tag = bsObj.find('img', {'alt': re.compile('[\u4e00-\u9fa5]')})
        title_tag = bsObj.find('div', {'class': 'title'})
        time_tag = bsObj.find('span', {'class': 'time'})
        if img_tag is None or title_tag is None or time_tag is None:
            continue  # skip malformed/unreachable pages instead of crashing
        img_name = title_tag.text[5:]          # drop the "每日一图:"-style prefix
        img_time = time_tag.text[10:].replace('.', '')
        img_path = 'E:/pic/' + img_time + img_name + '.jpg'
        list.append([img_tag['src'], img_path])
def getPic(p_url):
    """Return the full-size image URL found on the detail page *p_url*."""
    soup = BeautifulSoup(getHTMLText(p_url), 'html.parser')
    # The hero image appears to be the only <img> whose alt text contains
    # CJK characters — works, but fragile if the page layout changes.
    hero = soup.find('img', {'alt': re.compile('[\u4e00-\u9fa5]')})
    return hero['src']
def main():
    """Scrape the index page and save each daily photo under E:/pic/."""
    pic_list = []  # each entry: [image_url, local_save_path]; renamed to avoid shadowing builtin `list`
    url = 'http://www.nationalgeographic.com.cn/photography/photo_of_the_day/'
    getList(pic_list, url)
    for img_url, img_path in pic_list:
        # Binary mode: we write the raw image bytes straight to disk.
        with open(img_path, "wb") as f:
            f.write(requests.get(img_url, timeout=30).content)


if __name__ == '__main__':
    # Guarded entry point so importing this module no longer triggers a crawl.
    main()
# [Python爬虫]5.国家地理v3.0  (blog-post title; kept as a comment — the bare
# text was scraped in with the code and is a Python syntax error as-is)
# Originally published 2021-02-09 15:32:36