目标网站
- 目标网站:https://www.souutu.com/
- 爬取美女栏目:https://www.souutu.com/mbizhi/mmeinv/
先看看网站质量
一开始我以为只有封面上的图片,然后发现有的是单张图,有的图片右上角蓝标"套图",并且单张的点开具体页面和有套图的页面的源码还是有些区别的.
思路分析
封面页
先简单请求封面页,获取
url='https://www.souutu.com/mbizhi/mmeinv/'
resp=requests.get(url,headers)
print(resp.text)
返回源码,但是文字出现乱码,查看源码头部位置
&lt;meta charset="utf-8"&gt;
然后再添加上
resp.encoding="utf-8"
用bs4确定封面页图片链接
resp_page=bs4.BeautifulSoup(resp.text,"html.parser")
exSoup=resp_page.select('img[ class="imgload"]')
拿到图片链接和名称
hrefs=re.findall(r'lazysrc2x="(.*?) 2x" src="',str(exSoup))
names=re.findall(r'alt="(.*?)手机壁纸',str(exSoup))
保存封面
#下载封面图片
for i in range(len(hrefs)):
try:
count += 1
img = requests.get(hrefs[i])
with open(f'D:/爬取图片/搜优图/{names[i]}.jpg', mode='wb') as f:
f.write(img.content)
time.sleep(0.2)
print(f'第{count}张{names[i]}.jpg已下载完成')
except:
print('下载失败!')
print("-------------封面图片全部下载完成!-------------")
子页面(套图)
确定子页面位置
Soup=resp_page.select('a[class="card-img-hover"]')
hrefs=re.findall(r'href="(.*?)"',str(Soup))
获取子页面里图片位置和图片名称
imgs=requests.get(url=href,headers=headers)
imgs.encoding='utf-8'
imgs_page=bs4.BeautifulSoup(imgs.text,"html.parser")
imgSoup=imgs_page.select('a[class="swipebox"]')
imgname=re.findall(r'title="(.*?)手机壁纸图片组图',str(imgSoup))
img=re.findall(r'src="(.*?).210.380.jpg"',str(imgSoup))
保存组图
由于进行遍历时有些无组图的会返回[ ]空列表,影响图片取名,所以对子页面链接进行筛选:
#区分只有单张图片和有套图的
if imgname != []:
print(f'正造下载{imgname[0]}组图')
for i in range(len(img)):
try:
imgi = requests.get(img[i], headers)
with open(f'D:/爬取图片/搜优图/{imgname[0]}{i}.jpg', mode='wb') as fb:
fb.write(imgi.content)
time.sleep(0.3)
print(f'{imgname[0]}{i}.jpg下载完成')
except:
print('下载失败!')
#无套图的
else:
continue
完整代码
'''
2021-7-30
目标网站:https://www.souutu.com/
爬取美女栏目:https://www.souutu.com/mbizhi/mmeinv/
'''
import os
import random
import re
import time

import bs4
import requests
# Pool of User-Agent strings; one is chosen at random per request so the
# crawler does not present a single fixed browser fingerprint to the server.
ualist = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3872.400 QQBrowser/10.8.4455.400",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
"Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
"Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15",
]
# Default request headers for the first page fetch; the album loop below
# re-rolls a fresh User-Agent for every sub-page request.
headers = {
"user-agent":random.choice(ualist)
}
# --- Fetch the first index page and download every cover image -------------
url = 'https://www.souutu.com/mbizhi/mmeinv/'
# BUG FIX: headers must be passed as a keyword argument. The second
# positional parameter of requests.get() is `params`, so the original call
# sent the UA string as a query parameter instead of an HTTP header.
resp = requests.get(url, headers=headers)
resp.encoding = "utf-8"  # page declares <meta charset="utf-8">
resp_page = bs4.BeautifulSoup(resp.text, "html.parser")
exSoup = resp_page.select('img[ class="imgload"]')
# Cover URLs live in the lazysrc2x attribute; the alt text carries the name.
hrefs = re.findall(r'lazysrc2x="(.*?) 2x" src="', str(exSoup))
names = re.findall(r'alt="(.*?)手机壁纸', str(exSoup))
save_dir = 'D:/爬取图片/搜优图'
os.makedirs(save_dir, exist_ok=True)  # ensure the target folder exists
count = 0
# 下载封面图片 — zip() guards against the two regexes yielding
# different numbers of matches (the original indexed names[i] blindly).
for href, name in zip(hrefs, names):
    try:
        img = requests.get(href, headers=headers)
        img.raise_for_status()  # treat HTTP errors as failures too
        with open(f'{save_dir}/{name}.jpg', mode='wb') as f:
            f.write(img.content)
        count += 1  # count only downloads that actually succeeded
        time.sleep(0.2)  # throttle to be polite to the server
        print(f'第{count}张{name}.jpg已下载完成')
    except (requests.RequestException, OSError):
        # Narrowed from a bare except: network and file errors only,
        # so Ctrl-C and programming errors are no longer swallowed.
        print('下载失败!')
print("-------------封面图片全部下载完成!-------------")
# --- Follow each album (套图) link and download its full-size images -------
save_dir = 'D:/爬取图片/搜优图'
os.makedirs(save_dir, exist_ok=True)  # ensure the target folder exists
Soup = resp_page.select('a[class="card-img-hover"]')
hrefs = re.findall(r'href="(.*?)"', str(Soup))
for href in hrefs:
    # Rotate the User-Agent for every sub-page request.
    headers = {
        "user-agent": random.choice(ualist)
    }
    imgs = requests.get(url=href, headers=headers)
    imgs.encoding = 'utf-8'
    imgs_page = bs4.BeautifulSoup(imgs.text, "html.parser")
    # BUG FIX: close every sub-page response inside the loop; the original
    # closed only the final one after the loop ended.
    imgs.close()
    imgSoup = imgs_page.select('a[class="swipebox"]')
    imgname = re.findall(r'title="(.*?)手机壁纸图片组图', str(imgSoup))
    # Strip the thumbnail size suffix to recover the full-size URL.
    # NOTE(review): the dots are deliberately left unescaped — they also
    # match '_', so '_210_380.jpg'-style names are caught as well; confirm
    # against the live page before tightening this pattern.
    img = re.findall(r'src="(.*?).210.380.jpg"', str(imgSoup))
    # 区分只有单张图片和有套图的: single-image pages yield no title
    # match, so skip them with a guard instead of an empty else branch.
    if not imgname:
        continue
    print(f'正在下载{imgname[0]}组图')  # typo fix: 正造 -> 正在
    for i, link in enumerate(img):
        try:
            # BUG FIX: headers must be a keyword argument; passed
            # positionally it becomes `params` (a query string).
            imgi = requests.get(link, headers=headers)
            imgi.raise_for_status()
            with open(f'{save_dir}/{imgname[0]}{i}.jpg', mode='wb') as fb:
                fb.write(imgi.content)
            time.sleep(0.3)  # throttle to be polite
            print(f'{imgname[0]}{i}.jpg下载完成')
        except (requests.RequestException, OSError):
            # Narrowed from a bare except: only network/file errors.
            print('下载失败!')
resp.close()
运行结果
成品欣赏
我只爬取了第一页,其实可以在最外面继续添加一个for循环,爬取多页图片
for n in range(2,11):
url=f'https://www.souutu.com/mbizhi/mmeinv/index_{n}.html'