This scraper is built mainly on the bs4 module (BeautifulSoup) together with requests.
I'm entirely self-taught, so there are bound to be shortcomings; corrections are welcome.
# -*- coding:utf-8 -*-
# Scrape the "meizitu" image galleries
# url: https://www.zhaimankan.com/mianfei/meizitu/page/1
import requests
from bs4 import BeautifulSoup
import os


def geturl(url):
    # Fetch a page and return its HTML text, or None if the request fails.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/78.0.3904.108 Safari/537.36"
    }
    r = requests.get(url=url, headers=headers, timeout=10)
    r.encoding = 'utf-8'
    # print(r.status_code)  # debug: inspect the response status
    if r.status_code == 200:
        return r.text
    else:
        print('Request failed. Status code: {}'.format(r.status_code))
        return None
def save_jpg(main_title, main_url):
    # Create a folder named after the gallery title, then save every image into it.
    path = "F:\\妹子图\\{}".format(main_title)
    if not os.path.exists(path):
        os.makedirs(path)
        print('Folder created: %s' % path)
    else:
        print('Folder already exists.')
    html = geturl(main_url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    img_url_list = soup.find_all('img', attrs={'loading': 'lazy'})
    count = 1
    for img_url in img_url_list:
        img_src = img_url['src']
        p = requests.get(img_src, timeout=10)
        # 'with' closes the file automatically, so no explicit close() is needed
        with open(path + '\\%s.jpg' % count, 'wb') as f:
            f.write(p.content)
        count = count + 1
    print('Images saved.')
def allurl():
    page = int(input('How many pages do you want to scrape?\n'))
    # Page numbers on the site start at 1, so range(0, page) would request a
    # nonexistent page/0; iterate from 1 through page instead.
    for i in range(1, page + 1):
        url = 'https://www.zhaimankan.com/mianfei/meizitu/page/{}'.format(i)
        html = geturl(url)
        if html is None:
            continue
        s = BeautifulSoup(html, 'lxml')
        i_list = s.find_all('a', attrs={'class': 'meta-title'})
        # print(i_list)  # debug: gallery links found on this page
        for a in i_list:
            main_url = a['href']          # link to the gallery page
            main_title = a.string[5:-8]   # trim the fixed prefix/suffix from the link text
            save_jpg(main_title, main_url)


if __name__ == '__main__':
    allurl()
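
One caveat: main_title is taken straight from the page, and Windows refuses folder names containing characters such as \ / : * ? " < > |, so os.makedirs can fail on some titles. Below is a minimal sketch of a safe_title helper (a hypothetical name, not part of the script above) that strips those characters before the folder is created; the character set and the '_' replacement are my assumptions.

import re

def safe_title(title):
    # Replace characters that Windows forbids in file/folder names.
    # Both the pattern and the '_' substitute are illustrative choices.
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# Hypothetical usage inside save_jpg:
# path = "F:\\妹子图\\{}".format(safe_title(main_title))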
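
The script also fires every request back to back. If the site starts throttling or blocking you, the usual remedy is a short pause between downloads; here is a minimal sketch using time.sleep, where the 0.5-second default is an arbitrary guess to tune, not a value from the original script.

import time
import requests

def fetch_all(urls, delay=0.5):
    # Download each URL in turn, pausing between requests to stay polite.
    for u in urls:
        r = requests.get(u, timeout=10)
        yield r.content
        time.sleep(delay)  # assumed interval; adjust to the site's tolerance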