1.确定目标
目标网站:https://www.hexuexiao.cn/
爬取目标分类:https://www.hexuexiao.cn/meinv/meinvmingxing/
看看网站质量
质量还不错,开始run
2.提取链接
上面的截图只是主页面封面截图,每个里面还有二级子页面(组图)
所有链接均为bs4结合re正则表达式定位找到
爬取封面链接
#一级页面
exSoup=bs4.BeautifulSoup(resp.text,"html.parser")
onePage=exSoup.select('div[class="waterfall_1box"]')
one_hrefs=re.findall(r'<dd><a href="(.*?)">',str(onePage))
oneNames=re.findall(r'<img alt="(.*?)" pre_height="',str(onePage))
爬取子页面
#子页面
for two in range(len(one_hrefs)):
headers = {
"user-agent": random.choice(ualist)
}
twoResp=requests.get(url=one_hrefs[two],headers=headers)
soup=bs4.BeautifulSoup(twoResp.text,'html.parser')
twoPage=soup.select("div[class='swiper-slide']")
pagename=re.findall(r'<title>(.*?)</title>',str(soup))
imgs=re.findall(r'<img src="(.*?)"/></a>',str(twoPage))
3.代码及结果
完整代码
'''
2021-7-31
目标网站:https://www.hexuexiao.cn/
爬取目标分类:https://www.hexuexiao.cn/meinv/meinvmingxing/
'''
import os
import random
import re
import time

import bs4
import requests

# Pool of desktop-browser User-Agent strings; one is picked at random for
# every request as a simple anti-scraping countermeasure.
ualist = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3872.400 QQBrowser/10.8.4455.400",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
"Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
"Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15",
]

# Directory every downloaded image is written into.
SAVE_DIR = 'D:/爬取图片/靓丽图库'


def _random_headers():
    """Return request headers with a User-Agent chosen at random from ualist."""
    return {"user-agent": random.choice(ualist)}


def _download_album(album_url):
    """Download every image of one album (second-level page) into SAVE_DIR.

    Parses the album page's swiper slides with BeautifulSoup, extracts the
    <img src="..."> links via regex, and saves each image as
    "<title><index>.jpg". Failures on individual images are logged and
    skipped so one bad link cannot abort the whole crawl.
    """
    resp = requests.get(url=album_url, headers=_random_headers(), timeout=10)
    try:
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        slides = soup.select("div[class='swiper-slide']")
        titles = re.findall(r'<title>(.*?)</title>', str(soup))
        # Fall back to the last URL segment when the page has no <title>,
        # instead of crashing on titles[0].
        title = titles[0] if titles else album_url.rstrip('/').rsplit('/', 1)[-1]
        # Strip characters that are illegal in Windows filenames.
        title = re.sub(r'[\\/:*?"<>|]', '_', title)
        imgs = re.findall(r'<img src="(.*?)"/></a>', str(slides))
        print("\033[0;31m%s\033[0m" % f"-------------------{title}-------------------")
        # Iterate the image links themselves — the original indexed imgs[]
        # by the slide count, which can differ and raise IndexError.
        for idx, img_url in enumerate(imgs):
            try:
                img = requests.get(img_url, headers=_random_headers(), timeout=10)
                with open(f'{SAVE_DIR}/{title}{idx}.jpg', mode='wb') as f:
                    f.write(img.content)
                print(f'{title}{idx}下载完毕')
                # Throttle requests a little to stay polite to the server.
                time.sleep(0.1)
            except Exception:
                # Narrowed from a bare except so Ctrl-C still stops the crawl.
                print('下载失败!')
    finally:
        # Always release the connection, even if parsing raises.
        resp.close()


def main():
    """Crawl list pages 1-20 of the category and download every linked album."""
    # Create the target directory up front; the original relied on it
    # already existing and silently failed otherwise.
    os.makedirs(SAVE_DIR, exist_ok=True)
    for page in range(1, 21):
        url = f'https://www.hexuexiao.cn/meinv/meinvmingxing/list-{page}.html'
        resp = requests.get(url=url, headers=_random_headers(), timeout=10)
        try:
            # First-level page: the cover waterfall holds one link per album.
            soup = bs4.BeautifulSoup(resp.text, "html.parser")
            covers = soup.select('div[class="waterfall_1box"]')
            hrefs = re.findall(r'<dd><a href="(.*?)">', str(covers))
            for href in hrefs:
                _download_album(href)
                print("--------------------oner!-------------over!------------over!--------------------------")
            print(f"--------------------{url}下载完毕-------------------")
            print("--------------------oner!-------------over!------------over!--------------------------")
        finally:
            resp.close()


if __name__ == "__main__":
    main()
输出结果
代码细节解释
设置简单反爬
- 从ualist池中随机选取一个浏览器请求头
headers = {
"user-agent": random.choice(ualist)
}
- 设置睡眠时间,防止请求频率过快而被目标网站封禁
- time.sleep(0.1)
这个时间自己随意,由于我一次性爬了20个页面,为求快,所以只设置了0.1s
打印输出结果
- 方便看清程序运行
-由于所爬取页面过多,打印的信息需要容易区分:
print("\033[0;31m%s\033[0m" % f"-------------------{pagename[0]}-------------------")
这样可以清晰区分每一个封面所对应的所有图片,效果如下:
- 区分每一个页面
print("--------------------oner!-------------over!------------over!--------------------------")
print(f"--------------------{url}下载完毕-------------------")
print("--------------------oner!-------------over!------------over!--------------------------")
- 防止某次出错导致整个程序停止
try:
img = requests.get(imgs[link], headers=headers)
with open(f'D:/爬取图片/靓丽图库/{pagename[0]}{link}.jpg', mode='wb') as f:
f.write(img.content)
print(f'{pagename[0]}{link}下载完毕')
time.sleep(0.1)
except:
print('下载失败!')
用try-except 方法
防止因为某页或者某张图片命名或者未选对链接出现错误导致整个程序停止
- 关闭响应连接,释放资源
twoResp.close()
...
resp.close()
4.成品欣赏
爬了20页,近2500张美女明星图