'''
爬取图片,并且下载图片
url = 'https://pic.netbian.com/4kmeinv/'
爬取网页:requests
解析网页:beautifulsoup
url = 'https://pic.netbian.com/4kmeinv/'
url = 'https://pic.netbian.com/4kmeinv/index_2.html'
"https://pic.netbian.com/uploads/allimg/220809/101035-16600110352f43.jpg"
'''
import os
import requests
from bs4 import BeautifulSoup
# 获取网页的源代码
def craw_html(url):
    """Fetch a gallery page and return its decoded HTML text.

    Args:
        url: URL of the page to fetch.

    Returns:
        The page HTML as a str, decoded as GBK.
    """
    # timeout keeps the crawler from hanging forever on a stalled connection
    resp = requests.get(url, timeout=30)
    # The site serves GBK-encoded Chinese text, but requests guesses
    # ISO-8859-1 from the headers; force the correct codec.
    resp.encoding = 'gbk'  # ISO-8859-1
    print(resp.status_code)  # 200 — the page has no anti-scraping measures
    html = resp.text
    # print(html)
    return html
# 解析图片的地址
def parse_and_download(html):
    """Extract gallery-image URLs from the page HTML and download each image.

    Fixes two defects in the original: the computed ``filename`` was never
    used (every image was written to one literal path, each download
    overwriting the last), and the output directory was never created.

    Args:
        html: HTML source of one gallery page.

    Side effects:
        Writes each image to ./美女图片/<basename of the image URL>.
    """
    out_dir = "./美女图片"
    # Create the target folder up front so the first run doesn't crash
    # with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find_all('img')
    for img in imgs:
        src = img.get('src')  # alternatively: img['src']
        # Gallery photos live under /uploads/; skip icons, banners, and
        # any <img> tag with no src at all.
        if not src or "/uploads/" not in src:
            continue
        # The src attribute is site-relative, so prepend the domain.
        src = f"https://pic.netbian.com{src}"
        # Derive the local filename from the URL's last path component.
        filename = os.path.basename(src)
        resp_img = requests.get(src, timeout=30)
        print(src)
        # Image data is binary, so write with mode 'wb'.
        with open(os.path.join(out_dir, filename), 'wb') as f:
            f.write(resp_img.content)
if __name__ == '__main__':
    # Page 1 of the gallery has no index suffix; pages 2-10 follow the
    # index_{i}.html pattern.
    first_page = 'https://pic.netbian.com/4kmeinv/'
    later_pages = [
        f'https://pic.netbian.com/4kmeinv/index_{i}.html'
        for i in range(2, 11)
    ]
    for url in [first_page] + later_pages:
        print("#### 正在爬取:", url)
        page_html = craw_html(url)
        parse_and_download(page_html)
# Source: blog post "Python批量爬取图片", first published 2022-08-09 18:15:57
# (stray page text from the original paste, preserved here as a comment so
# the file remains valid Python)