python爬虫（最新的可以爬的妹子图）

weixin_42325105

已于 2022-12-13 10:34:40 修改

阅读量5.4k

点赞数

分类专栏： python学习　文章标签： python 学习爬虫

于 2022-12-13 10:31:33 首次发布

本文链接：https://blog.csdn.net/weixin_42325105/article/details/128297507

版权

python学习专栏收录该内容

1 篇文章

订阅专栏

python爬虫（最新的可以爬的妹子图）

入门 Python 之后，尤其是学了爬虫技术，总想找点成就感。我搜索了站里的很多文章，发现原来的 meizitu、mzitu 这类资源站已经无法访问；经过一番搜索，找到了一个比较好的新资源站。在站里现有博主代码的基础上，结合新站的 HTML 特点和结构分析，我编写了最新的可以爬的妹子图爬虫代码，分享给需要的小伙伴，也激励大家在 Python 学习的道路上继续前进。

运行图片

多余的细节不用讲了，站里现有的博主已经讲得够透，不再赘述，直接上图：
爬虫运行图片
最新的可以爬取的妹子图网站为https://meirentu.cc

完整代码（类似的代码分析文章站里很多，难度也不高，在此就不做细致分析了）

爬虫完成 代码片.

# 1.导入requests和BeautifulSoup库
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# 2. Configure where downloaded images are stored and which site is crawled.
root = "E://p_images//"  # output directory for downloaded images

mainPage = 'https://meirentu.cc'  # site home page; also the base for relative links

# (connect, read) timeouts in seconds, applied to every HTTP request so a
# stalled server can never hang the crawl.
_TIMEOUT = (6.05, 12.05)

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
)


def _fetch_soup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts, or a 4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _absolute_links(anchors, base_url):
    """Return the de-duplicated absolute URLs of *anchors*, in document order.

    *anchors* is an iterable of ``<a>`` tags; ``None`` entries (an ``<li>``
    without a link) and anchors without an ``href`` are skipped instead of
    raising.
    """
    urls = []
    seen = set()
    for anchor in anchors:
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        # urljoin handles root-relative, relative and already-absolute hrefs.
        url = urljoin(base_url, href)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def _save_image(image_url, referer):
    """Download *image_url* into ``root``; return True if a file was written.

    Network and file-system errors are printed and reported as ``False`` so a
    single bad image never aborts the crawl.
    """
    file_name = os.path.basename(urlsplit(image_url).path)
    if not file_name:
        return False
    # Only send headers that are meaningful for a fresh download. The browser
    # copy-paste previously used here included HTTP/2 pseudo-headers, header
    # names containing spaces, conditional-request headers (which can yield an
    # empty 304 response) and "br" encoding that requests may be unable to
    # decode.
    headers = {
        "accept": "image/avif,image/webp,image/apng,*/*;q=0.8",
        "accept-language": "zh-CN,zh;q=0.9",
        "referer": referer,
        "user-agent": _USER_AGENT,
    }
    print(f"正在下载：{file_name}")
    try:
        response = requests.get(image_url, headers=headers, timeout=_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return False
    try:
        with open(os.path.join(root, file_name), 'wb') as f:
            f.write(response.content)  # raw bytes of the image
    except OSError as e:
        print(e)
        return False
    return True


def _album_image_urls(album_url):
    """Yield each distinct image URL found on the pages of one album.

    The album page's ``div.page`` pager links are followed, and on every
    linked page the first ``<img>`` of each ``<div>`` inside
    ``div.content_left`` is collected. Nested divs share the same first
    image, so URLs are de-duplicated across the whole album.
    """
    try:
        album = _fetch_soup(album_url)
    except requests.exceptions.RequestException as e:
        print(e)
        return
    pager = album.find('div', class_='page')
    if pager is None:
        print(f"未找到分页信息，跳过：{album_url}")
        return

    seen = set()
    for page_url in _absolute_links(pager.find_all('a'), mainPage):
        try:
            page_soup = _fetch_soup(page_url)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        content = page_soup.find('div', class_='content_left')
        if content is None:
            continue
        for div in content.find_all('div'):
            img = div.img
            src = img.get('src') if img is not None else None
            if not src:
                continue
            image_url = urljoin(page_url, src)
            if image_url in seen:
                continue
            seen.add(image_url)
            yield image_url


def _crawl_category(category_url):
    """Download every album image reachable from one category listing page."""
    try:
        listing = _fetch_soup(category_url)
    except requests.exceptions.RequestException as e:
        print(e)
        return
    # class_ (not class) because ``class`` is a Python keyword.
    album_list = listing.find('ul', class_='update_area_lists cl')
    if album_list is None:
        print(f"未找到图集列表，跳过：{category_url}")
        return

    downloaded = 0  # images saved for this category
    album_anchors = (li.a for li in album_list.find_all('li'))
    for album_url in _absolute_links(album_anchors, mainPage):
        for image_url in _album_image_urls(album_url):
            if _save_image(image_url, referer=album_url):
                downloaded += 1
                print("已下载" + str(downloaded) + "张图片")


def _crawl():
    """Walk home page -> categories -> albums -> album pages and save images."""
    # Create the output directory up front; open() does not create it.
    os.makedirs(root, exist_ok=True)
    try:
        home = _fetch_soup(mainPage)
    except requests.exceptions.RequestException as e:
        print(e)
        return
    menu = home.find('ul', class_='sub-menu')
    if menu is None:
        print(f"未找到分类菜单：{mainPage}")
        return
    category_anchors = (li.a for li in menu.find_all('li'))
    for category_url in _absolute_links(category_anchors, mainPage):
        print(category_url)
        _crawl_category(category_url)


# The script starts crawling as soon as it is executed, as it always has.
_crawl()