Scraping Umei Gallery (优美图库) Images with Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date: 2021/7/23 17:53
# author: Sabo

"""
    爬取优美图库 -> 图说天下 -> 性感美女专栏的所有图片
    说明:程序运行后会将图片保存在当前目录下的umeiPictures文件夹中,无需提前新建文件夹,没有会自动创建文件夹
"""

import requests
from bs4 import BeautifulSoup as BS
import os

totalDirName = "umeiPictures"
header = {
    "Connection": "close"
}
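# The "Connection: close" header above turns off HTTP keep-alive; it is a common
# workaround for "Max retries exceeded" errors when a script issues many requests in a row.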
rootUrlLists = ["https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv.htm",
                "https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_2.htm",
                "https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_3.htm",
                "https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_4.htm",
                "https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_5.htm",
                "https://www.umei.net/bizhitupian/meinvbizhi/xingganmeinv_6.htm"]
rootUrlHead = "https://www.umei.net/"


def getUrls(rootUrl):
    '''
    Argument: an index-page URL of the gallery
    Function: collects the per-gallery detail-page URLs listed on that index page
    :param rootUrl: index-page URL
    :return: list of detail-page URLs
    '''
    urlsList = []
    response = requests.get(rootUrl)
    if response.status_code != 200:
        print("Get list error!")
        return urlsList
    response.encoding = 'utf-8'
    txt = response.text
    mainPage = BS(txt, 'html.parser')
    childPages = mainPage.find('div', attrs={"class": "TypeList"}).find_all("a")
    for childPage in childPages:
        urlsList.append(rootUrlHead + childPage.get("href"))
    return urlsList


def getTitle(downloadUrl):
    response = requests.get(downloadUrl)
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    mainPage = BS(response.text, 'html.parser')
    childPage = mainPage.find("div", attrs={"class": "ArticleTitle"})
    title = childPage.find("strong").text
    return title
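# Note: the title returned here is used verbatim as a folder name below, so a title
# containing characters that are not legal in file names could make checkDir() fail.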


def checkDir(dirName):
    """
    参数:文件加名称
    功能:在umeiPictures文件夹下检查是否存在dirName名称的文件夹存在
    返回参数:无 (不存在则直接创建)
    :param dirName:
    :return:
    """

    dirs = os.listdir("./" + totalDirName)
    if dirName not in dirs:
        os.makedirs("./" + totalDirName + "/" + dirName)


def checkPictureExist(dirName, picName):
    """
    参数:文件夹名称和文件夹里的图片名称
    功能:在umeiPictures文件夹的picName文件夹里查看picName的图片是否存在
    返回参数:存在返回True,不存在返回False
    :param dirName:
    :param picName:
    :return:
    """
    dirs = os.listdir("./" + totalDirName + "/" + dirName)
    if picName in dirs:
        print("%s already exists!" % picName)
        return True
    return False


def getPictures(downloadUrl, titleStr, indexStr, dirName):
    '''
    Arguments: URL of one sub-page of a gallery, plus the title string, image index and folder name
    Function: downloads the image found on that sub-page
    Returns: True on success, None on failure
    :param downloadUrl: sub-page URL
    :param titleStr: gallery title, used in the image file name
    :param indexStr: image index, used in the image file name
    :param dirName: folder under umeiPictures to save into
    '''
    response = requests.get(downloadUrl, headers=header, timeout=10)
    if response.status_code != 200:
        print("Get pictures error!")
        return None
    response.encoding = 'utf-8'
    picMainPage = BS(response.text, 'html.parser')
    # The selectors below assume the page layout as of 2021:
    # <div class="ImageBody"><p align="center"><img src="..."></p></div>
    picChildPage = picMainPage.find('div', attrs={"class": "ImageBody"})
    p = picChildPage.find("p", align="center")
    img = p.find("img")
    src = img.get("src")
    # Download the image itself
    imgDownloadResponse = requests.get(src)
    if imgDownloadResponse.status_code != 200:
        print("Download pictures error!")
        return None
    # img_name = src.split("/")[-1]  # alternative: use everything after the last "/" in the URL
    img_name = titleStr + "_" + indexStr + ".jpg"
    with open(totalDirName + "/" + dirName + "/" + img_name, mode="wb") as f:
        f.write(imgDownloadResponse.content)  # write the image bytes to the file
        print("over!", img_name)
    return True
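
# Note: requests.get() raises requests.RequestException on connection errors and
# timeouts instead of returning a non-200 status, so the status-code checks above
# only cover HTTP error responses; a more robust version would wrap these calls in try/except.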


def checkMainDir():
    """
    Checks whether the top-level umeiPictures folder exists and creates it if not.
    :return: False if the folder had to be created, True if it already existed
    """
    dirs = os.listdir("./")
    if totalDirName not in dirs:
        os.mkdir(totalDirName)
        print("Now creating dir: {0}!".format(totalDirName))
        return False
    else:
        print("{0} already exists!".format(totalDirName))
    return True
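
# Note: checkMainDir() and checkDir() could be collapsed into a single call to
# os.makedirs(path, exist_ok=True), which creates any missing parent directories
# and does nothing when the directory already exists.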


if __name__ == '__main__':
    maxWebLength = 20  # assume at most 20 sub-pages per gallery
    webLists = []
    checkMainDir()
    for rootUrl in rootUrlLists:
        urlLists = getUrls(rootUrl=rootUrl)

        # Walk the detail pages (one per gallery) found on this index page
        for parentUrl in urlLists:
            downloadFlag = True
            # Fetch the gallery title
            title = getTitle(downloadUrl=parentUrl)
            if title is None:
                continue

            # Create a separate folder for each gallery
            checkDir(dirName=title)

            # Derive the sub-pages from the parent URL
            webLists.clear()
            webLists.append(parentUrl)

            # Sub-pages follow the pattern xxx_2.htm, xxx_3.htm, ...
            for index in range(2, maxWebLength):
                webLists.append(parentUrl.replace(".htm", "_" + str(index) + ".htm"))

            # Download one image from each sub-page
            for index in range(len(webLists)):

                # Skip images that have already been downloaded
                pictureAlreadyExistFlag = checkPictureExist(dirName=title,
                                                            picName=title + "_" + str(index + 1) + ".jpg")
                if downloadFlag and not pictureAlreadyExistFlag:
                    downloadAns = getPictures(downloadUrl=webLists[index], indexStr=str(index + 1),
                                              titleStr=title, dirName=title)
                    if downloadAns is not True:
                        # The first failing sub-page ends this gallery
                        downloadFlag = False
                        break
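
After a full run, the downloaded files end up in a layout like the sketch below
(illustrative: the actual folder and file names are taken from each gallery's title):

umeiPictures/
    <gallery title>/
        <gallery title>_1.jpg
        <gallery title>_2.jpg
        ...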
