'''
爬取图片,并且下载图片
url = 'https://pic.netbian.com/4kmeinv/'
爬取网页:requests
解析网页:beautifulsoup
url = 'https://pic.netbian.com/4kmeinv/'
url = 'https://pic.netbian.com/4kmeinv/index_2.html'
"https://pic.netbian.com/uploads/allimg/220809/101035-16600110352f43.jpg"
'''
import os
import requests
from bs4 import BeautifulSoup
# 获取网页的源代码
def craw_html(url):
    """Fetch a gallery page and return its decoded HTML text.

    Args:
        url: URL of the page to fetch.

    Returns:
        The page HTML as a str, decoded as GBK.
    """
    # timeout keeps the crawler from hanging forever on a stalled connection
    resp = requests.get(url, timeout=30)
    # The site serves GBK-encoded Chinese text, but requests guesses
    # ISO-8859-1 from the headers; force the correct codec.
    resp.encoding = 'gbk'  # ISO-8859-1
    print(resp.status_code)  # 200 — the page has no anti-scraping measures
    html = resp.text
    # print(html)
    return html
# 解析图片的地址
def parse_and_download(html):
    """Extract gallery-image URLs from the page HTML and download each image.

    Fixes two defects in the original: the computed ``filename`` was never
    used (every image was written to one literal path, each download
    overwriting the last), and the output directory was never created.

    Args:
        html: HTML source of one gallery page.

    Side effects:
        Writes each image to ./美女图片/<basename of the image URL>.
    """
    out_dir = "./美女图片"
    # Create the target folder up front so the first run doesn't crash
    # with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find_all('img')
    for img in imgs:
        src = img.get('src')  # alternatively: img['src']
        # Gallery photos live under /uploads/; skip icons, banners, and
        # any <img> tag with no src at all.
        if not src or "/uploads/" not in src:
            continue
        # The src attribute is site-relative, so prepend the domain.
        src = f"https://pic.netbian.com{src}"
        # Derive the local filename from the URL's last path component.
        filename = os.path.basename(src)
        resp_img = requests.get(src, timeout=30)
        print(src)
        # Image data is binary, so write with mode 'wb'.
        with open(os.path.join(out_dir, filename), 'wb') as f:
            f.write(resp_img.content)
if __name__ == '__main__':
    # Page 1 of the gallery has no index suffix; pages 2-10 follow the
    # index_{i}.html pattern.
    first_page = 'https://pic.netbian.com/4kmeinv/'
    later_pages = [
        f'https://pic.netbian.com/4kmeinv/index_{i}.html'
        for i in range(2, 11)
    ]
    for url in [first_page] + later_pages:
        print("#### 正在爬取:", url)
        page_html = craw_html(url)
        parse_and_download(page_html)
# Source: blog post "Python批量爬取图片", first published 2022-08-09 18:15:57
# (stray page text from the original paste, preserved here as a comment so
# the file remains valid Python)