2021-7-30 Python Scraping Practice: Downloading High-Quality Beauty Photos

Target site

  • Target site: https://www.souutu.com/
  • Beauty section to scrape: https://www.souutu.com/mbizhi/mmeinv/

First, a look at the site's content:
(screenshot of the gallery page omitted)
At first I assumed there were only the cover images, but it turns out some cards are single images while others carry a blue "套图" (photo set) badge in the top-right corner, and the page source of a single-image page differs somewhat from that of a photo-set page.

Approach

Cover page

Start with a simple request to the cover page:

url = 'https://www.souutu.com/mbizhi/mmeinv/'
resp = requests.get(url, headers=headers)
print(resp.text)
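
It is also worth confirming the request actually succeeded before parsing. A minimal addition (not in the original script; timeout and raise_for_status are standard requests features):

resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses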

The source comes back, but the Chinese text is garbled. Checking the head of the page source shows

meta charset="utf-8"

so we set the decoding explicitly:

resp.encoding = "utf-8"
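
If the charset were not declared, requests can also guess it from the response body; a one-line alternative (apparent_encoding is a standard requests attribute):

resp.encoding = resp.apparent_encoding  # encoding detected from the response body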

Locate the cover image tags with bs4:
resp_page = bs4.BeautifulSoup(resp.text, "html.parser")
exSoup = resp_page.select('img[class="imgload"]')
Pull out the image links and names:
hrefs = re.findall(r'lazysrc2x="(.*?) 2x" src="', str(exSoup))
names = re.findall(r'alt="(.*?)手机壁纸', str(exSoup))
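
Regexing over str(exSoup) works, but BeautifulSoup can read the same attributes directly, which is less brittle. A sketch assuming each matched <img class="imgload"> tag carries the lazysrc2x and alt attributes seen in the patterns above:

hrefs, names = [], []
for tag in resp_page.select('img.imgload'):
    src2x = tag.get('lazysrc2x', '')
    if src2x:
        hrefs.append(src2x.split(' ')[0])                     # drop the trailing " 2x" descriptor
        names.append(tag.get('alt', '').split('手机壁纸')[0])  # keep the part before the suffix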
Save the covers:
# download the cover images
count = 0
for i in range(len(hrefs)):
    try:
        count += 1
        img = requests.get(hrefs[i], headers=headers)
        with open(f'D:/爬取图片/搜优图/{names[i]}.jpg', mode='wb') as f:
            f.write(img.content)
        time.sleep(0.2)  # short pause between downloads
        print(f'#{count}: {names[i]}.jpg downloaded')
    except Exception as e:
        print(f'Download failed! {e}')
print("------------- All cover images downloaded! -------------")

Sub-pages (photo sets)

Locate the sub-page links:
Soup = resp_page.select('a[class="card-img-hover"]')
hrefs = re.findall(r'href="(.*?)"', str(Soup))
For each sub-page, fetch the image locations and names:
    imgs = requests.get(url=href, headers=headers)
    imgs.encoding = 'utf-8'
    imgs_page = bs4.BeautifulSoup(imgs.text, "html.parser")
    imgSoup = imgs_page.select('a[class="swipebox"]')
    imgname = re.findall(r'title="(.*?)手机壁纸图片组图', str(imgSoup))
    img = re.findall(r'src="(.*?)\.210\.380\.jpg"', str(imgSoup))
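
The second regex strips the ".210.380.jpg" thumbnail suffix, leaving what is apparently the full-size image URL. The same data can be read from the tags directly; a sketch assuming the title attribute sits on each <a class="swipebox"> and the thumbnail src on a nested <img>:

imgname, img = [], []
for a in imgs_page.select('a.swipebox'):
    title = a.get('title', '')
    if '手机壁纸图片组图' in title:
        imgname.append(title.split('手机壁纸图片组图')[0])
    thumb = a.find('img')
    if thumb and thumb.get('src', '').endswith('.210.380.jpg'):
        img.append(thumb['src'][: -len('.210.380.jpg')])  # drop the thumbnail suffix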
Save the photo sets

When iterating, sub-pages without a photo set return an empty list [], which breaks the file naming, so filter the sub-page links:

    # distinguish single-image pages from photo-set pages
    if imgname != []:
        print(f'Downloading photo set {imgname[0]}')
        for i in range(len(img)):
            try:
                imgi = requests.get(img[i], headers=headers)
                with open(f'D:/爬取图片/搜优图/{imgname[0]}{i}.jpg', mode='wb') as fb:
                    fb.write(imgi.content)
                time.sleep(0.3)
                print(f'{imgname[0]}{i}.jpg downloaded')
            except Exception as e:
                print(f'Download failed! {e}')
    # single-image page: no photo set to download
    else:
        continue
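
One more practical detail: open() fails if the target folder does not exist, so it is worth creating it once up front. A minimal sketch (os is an extra import; the path is the one used throughout):

import os

os.makedirs('D:/爬取图片/搜优图', exist_ok=True)  # create the folder tree; no error if it already exists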

Complete code

'''
2021-7-30
Target site: https://www.souutu.com/
Beauty section to scrape: https://www.souutu.com/mbizhi/mmeinv/
'''
import re,bs4,requests
import random,time

ualist = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3872.400 QQBrowser/10.8.4455.400",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15",
]
headers = {
    "user-agent": random.choice(ualist)
}
# scrape the first page
url = 'https://www.souutu.com/mbizhi/mmeinv/'
resp = requests.get(url, headers=headers)
resp.encoding = "utf-8"
resp_page = bs4.BeautifulSoup(resp.text, "html.parser")
exSoup = resp_page.select('img[class="imgload"]')
hrefs = re.findall(r'lazysrc2x="(.*?) 2x" src="', str(exSoup))
names = re.findall(r'alt="(.*?)手机壁纸', str(exSoup))
count = 0
# download the cover images
for i in range(len(hrefs)):
    try:
        count += 1
        img = requests.get(hrefs[i], headers=headers)
        with open(f'D:/爬取图片/搜优图/{names[i]}.jpg', mode='wb') as f:
            f.write(img.content)
        time.sleep(0.2)  # short pause between downloads
        print(f'#{count}: {names[i]}.jpg downloaded')
    except Exception as e:
        print(f'Download failed! {e}')
print("------------- All cover images downloaded! -------------")

# download the photo sets
Soup = resp_page.select('a[class="card-img-hover"]')
hrefs = re.findall(r'href="(.*?)"', str(Soup))
for href in hrefs:
    # rotate the user-agent for each sub-page request
    headers = {
        "user-agent": random.choice(ualist)
    }
    imgs = requests.get(url=href, headers=headers)
    imgs.encoding = 'utf-8'
    imgs_page = bs4.BeautifulSoup(imgs.text, "html.parser")
    imgSoup = imgs_page.select('a[class="swipebox"]')
    imgname = re.findall(r'title="(.*?)手机壁纸图片组图', str(imgSoup))
    img = re.findall(r'src="(.*?)\.210\.380\.jpg"', str(imgSoup))
    # only photo-set pages (non-empty imgname) have extra images to download
    if imgname != []:
        print(f'Downloading photo set {imgname[0]}')
        for i in range(len(img)):
            try:
                imgi = requests.get(img[i], headers=headers)
                with open(f'D:/爬取图片/搜优图/{imgname[0]}{i}.jpg', mode='wb') as fb:
                    fb.write(imgi.content)
                time.sleep(0.3)
                print(f'{imgname[0]}{i}.jpg downloaded')
            except Exception as e:
                print(f'Download failed! {e}')
    imgs.close()
resp.close()

Run results

(console output screenshots omitted)

Results gallery

(screenshot of the downloaded images omitted)
I only scraped the first page; an outer for loop could be added to scrape more pages, as in the sketch below:

for n in range(2, 11):
    url = f'https://www.souutu.com/mbizhi/mmeinv/index_{n}.html'
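
A fuller version of that loop, assuming the listing pages follow the index_{n}.html pattern above (the base URL serves as page 1) and that the per-page logic from the complete script is wrapped in a hypothetical scrape_page(url) function:

def scrape_page(url):
    ...  # cover + photo-set logic from the complete script

for n in range(1, 11):
    if n == 1:
        url = 'https://www.souutu.com/mbizhi/mmeinv/'  # page 1 has no index suffix
    else:
        url = f'https://www.souutu.com/mbizhi/mmeinv/index_{n}.html'
    scrape_page(url)  # hypothetical wrapper
    time.sleep(1)     # pause between listing pages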