# !/usr/bin/env python
# -*-coding:utf-8-*-
# date :2021/7/23 17:53
# author:Sabo
"""
Scrape all pictures from the "sexy beauty" column of the Umei gallery
(umei.net -> wallpapers -> beauty pictures).

Pictures are saved under ./umeiPictures in the current working directory;
the folder is created automatically when it does not exist — no need to
create it beforehand.
"""
import requests
from bs4 import BeautifulSoup as BS
import os
# Root folder (relative to CWD) that every downloaded series is stored in.
totalDirName = "umeiPictures"
# Extra HTTP headers; "Connection: close" avoids keeping sockets open
# during a long scraping run.
header = {
"Connection": "close"
}
# Paginated listing pages of the column (page 1 has no numeric suffix).
# NOTE(review): "roor" is a typo for "root", kept as-is for compatibility.
roorUrlLists = ["https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv.htm",
"https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_2.htm",
"https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_3.htm",
"https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_4.htm",
"https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_5.htm",
"https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_6.htm"]
# Site root prepended to the relative hrefs found on listing pages.
rootUrlHead = "https://www.umei.net/"
def getUrls(rootUrl):
    """
    Fetch one listing page and collect the detail-page URLs on it.

    :param rootUrl: URL of a paginated listing page of the column.
    :return: list of absolute detail-page URLs; empty list on any failure
             (HTTP error or unexpected page layout).
    """
    response = requests.get(rootUrl, headers=header, timeout=10)
    if response.status_code != 200:
        print("Get list error!")
        return []
    response.encoding = 'utf-8'
    mainPage = BS(response.text, 'html.parser')
    typeList = mainPage.find('div', attrs={"class": "TypeList"})
    if typeList is None:
        # Layout changed or the page is empty — fail soft like an HTTP error.
        print("Get list error!")
        return []
    # Hrefs on the listing page are site-relative; prepend the site root.
    return [rootUrlHead + anchor.get("href") for anchor in typeList.find_all("a")]
def getTitle(downloadUrl):
    """
    Fetch a series detail page and return its title.

    :param downloadUrl: URL of the first page of a picture series.
    :return: title string from the ArticleTitle header, or None when the
             page cannot be fetched or the title element is missing.
    """
    response = requests.get(downloadUrl, headers=header, timeout=10)
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    mainPage = BS(response.text, 'html.parser')
    childPage = mainPage.find("div", attrs={"class": "ArticleTitle"})
    if childPage is None:
        # Layout changed — no title block on this page.
        return None
    strong = childPage.find("strong")
    return strong.text if strong is not None else None
def checkDir(dirName):
    """
    Ensure the per-series folder ./umeiPictures/<dirName> exists.

    Uses makedirs(exist_ok=True) instead of the original listdir-then-create
    sequence, which was racy and scanned the whole parent directory.

    :param dirName: name of the series folder to create if missing.
    :return: None
    """
    os.makedirs(os.path.join(".", totalDirName, dirName), exist_ok=True)
def checkPictureExist(dirName, picName):
    """
    Check whether ./umeiPictures/<dirName>/<picName> is already on disk.

    Tests the single file directly instead of listing the whole folder.

    :param dirName: series folder name under umeiPictures.
    :param picName: picture file name to look for.
    :return: True (and prints a notice) when the picture already exists,
             False otherwise.
    """
    if os.path.isfile(os.path.join(".", totalDirName, dirName, picName)):
        print("%s already existed!" % (picName))
        return True
    return False
def getPictures(downloadUrl, titleStr, indexStr, dirName):
    """
    Download one picture of a series and save it to disk.

    :param downloadUrl: URL of a single picture page within the series.
    :param titleStr: series title, used as the file-name prefix.
    :param indexStr: 1-based picture index (string) within the series.
    :param dirName: folder name under umeiPictures to save into.
    :return: True on success, None on any failure.
    """
    # BUG FIX: the original called requests.get(downloadUrl, header, ...),
    # which passes `header` as the `params` positional argument, not as
    # HTTP headers. It must be passed via the `headers` keyword.
    response = requests.get(downloadUrl, headers=header, timeout=10)
    if response.status_code != 200:
        print("Get pictures error!")
        return None
    response.encoding = 'utf-8'
    picMainPage = BS(response.text, 'html.parser')
    picChildPage = picMainPage.find('div', attrs={"class": "ImageBody"})
    p = picChildPage.find("p", align="center") if picChildPage is not None else None
    img = p.find("img") if p is not None else None
    src = img.get("src") if img is not None else None
    if src is None:
        # Missing ImageBody/p/img — page layout changed; fail soft.
        print("Get pictures error!")
        return None
    # Fetch the image bytes themselves.
    imgDownloadResponse = requests.get(src, headers=header, timeout=10)
    if imgDownloadResponse.status_code != 200:
        print("Download pictures error!")
        return None
    # (No .encoding here: the response body is binary image data.)
    img_name = titleStr + "_" + indexStr + ".jpg"
    with open(os.path.join(totalDirName, dirName, img_name), mode="wb") as f:
        f.write(imgDownloadResponse.content)  # write raw image bytes
    print("over!", img_name)
    return True
def checkMainDir():
    """
    Ensure the top-level download folder (./umeiPictures) exists.

    :return: True when the folder already existed, False when it had to be
             created by this call.
    """
    if os.path.isdir(totalDirName):
        print("{0} is already exist!".format(totalDirName))
        return True
    os.mkdir(totalDirName)
    print("Now is creating dir:{0}!".format(totalDirName))
    return False
if __name__ == '__main__':
    # Assume a single series never spans more than maxWebLength - 1 pages.
    maxWebLength = 20
    checkMainDir()
    for rootUrl in roorUrlLists:
        urlLists = getUrls(rootUrl=rootUrl)
        # Walk every series linked from this listing page.
        for parentUrl in urlLists:
            title = getTitle(downloadUrl=parentUrl)
            if title is None:
                # BUG FIX: the original passed None on to checkDir and
                # crashed when a detail page could not be fetched/parsed.
                continue
            # One folder per series.
            checkDir(dirName=title)
            # Page 1 is the series URL itself; pages 2..N append "_<n>".
            webLists = [parentUrl]
            for pageNo in range(2, maxWebLength):
                webLists.append(parentUrl.replace(".htm", "_" + str(pageNo) + ".htm"))
            downloadFlag = True
            for index, webUrl in enumerate(webLists):
                picName = title + "_" + str(index + 1) + ".jpg"
                # Skip pictures already on disk; stop the series on the
                # first failed download (usually means no more pages).
                if downloadFlag and not checkPictureExist(dirName=title, picName=picName):
                    downloadAns = getPictures(downloadUrl=webUrl, indexStr=str(index + 1),
                                              titleStr=title, dirName=title)
                    if downloadAns is not True:
                        downloadFlag = False
                        break
# Python crawler for Umei gallery pictures
# (blog-post footer residue: "latest recommended article published 2022-01-12 13:53:34")