import requests
from bs4 import BeautifulSoup
import os
def get_page_urls(url):
    """Fetch the listing page at *url* and return the detail-page links.

    Downloads the page with browser-like request headers, parses it, and
    collects the ``href`` of every anchor matched by the CSS selector
    '.panel-body .img_single a'.

    :param url: listing-page URL to scrape
    :return: list of detail-page URL strings (as found in the hrefs;
             may be relative — TODO confirm against the site)
    """
    # NOTE(fix): the original passed this dict as requests.get()'s second
    # positional argument, which is ``params`` (query string), NOT headers.
    # It must go through the ``headers=`` keyword.  The header keys/values
    # were also malformed ('Content-Language' instead of 'Accept-Language',
    # 'User - Agent' with stray spaces) and are repaired here.
    headers = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': ('text/html,application/xhtml+xml,application/xml;'
                   'q=0.9,image/webp,image/apng,*/*;q=0.8'),
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/67.0.3396.99 Safari/537.36'),
    }
    # Fetch and decode the page body explicitly as UTF-8.
    html = requests.get(url, headers=headers).content.decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    # One <a> per thumbnail; its href points at the image detail page.
    results = soup.select('.panel-body .img_single a')
    page_urls = [result.attrs['href'] for result in results]
    return page_urls
def get_img_urls(page_urls):
    """Visit each detail page in *page_urls* and collect image URLs.

    :param page_urls: iterable of detail-page URLs
    :return: list of the ``src`` attribute of every <img> found under
             an '.image-wrapper' element, in page order
    """
    collected = []
    for detail_url in page_urls:
        # Download and parse the detail page.
        page_html = requests.get(detail_url).text
        page_soup = BeautifulSoup(page_html, 'html.parser')
        # Each matching <img> tag holds one target image in its src.
        for img_tag in page_soup.select('.image-wrapper img'):
            collected.append(img_tag.attrs['src'])
    return collected
def save_img(img_urls):
    """Download every URL in *img_urls* into the current directory.

    The file name is derived from the URL: the scheme is stripped and
    '/' is replaced with '_' so the result is a flat, path-safe name.

    :param img_urls: iterable of image URL strings
    """
    for img_url in img_urls:
        # Fetch the raw image bytes.
        img = requests.get(img_url).content
        # NOTE(fix): the original split on the literal 'https://' and
        # indexed [1], which raises IndexError for http:// or
        # protocol-relative '//' URLs.  Splitting on '//' once and taking
        # the last piece yields the identical name for https URLs while
        # tolerating any scheme.
        file_name = img_url.split('//', 1)[-1].replace('/', '_')
        print(file_name)
        # Binary mode: we write raw bytes, no text decoding.
        with open(file_name, 'wb') as f:
            f.write(img)
if __name__ == '__main__':
    # Keep all downloads in a dedicated ./meizi directory.
    os.makedirs('meizi', exist_ok=True)
    os.chdir('meizi')
    # The site pages its listing via the pager_offset query parameter;
    # walk the first 250 offsets.
    for offset in range(250):
        listing_url = 'https://www.dbmeinv.com/?pager_offset=' + str(offset)
        # Listing page -> detail-page links -> image links -> disk.
        detail_links = get_page_urls(listing_url)
        image_links = get_img_urls(detail_links)
        save_img(image_links)
# 爬取妹子图小程序 (image-scraper mini program — original article title;
# pasted article footer, kept as a comment so the file stays runnable)
# 最新推荐文章于 2024-02-28 10:08:37 发布 (publication timestamp from the source page)