# 直接上代码吧,复制到 PyCharm 中,install 相关的库就可以直接运行!
# 已知问题:爬取约 1.2GB 资源后,后面的图片会全部重复,原因不明,也许是该网站的资源有限。
import requests
from bs4 import BeautifulSoup
import os
import urllib.request
############批量爬取手机图片资源###################
# 爬取页面资源
def getPage(url):
    """Fetch one wallpaper list page and download every theme linked on it.

    Parameters:
        url: URL of a list page such as http://sj.zol.com.cn/bizhi/new_1.html

    Side effects:
        Calls downImg() for each theme, which writes images under IMG_data/.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # status_code is an int — the original "..." + r.status_code raised TypeError
        print("访问异常:" + str(r.status_code))
        return  # an error page has no theme links worth parsing
    soup = BeautifulSoup(r.text, 'html.parser')
    # theme links carry class="pic"; the last three entries are skipped
    # (presumably non-theme/recommendation links — TODO confirm on the live site)
    items = soup.find_all(class_='pic')
    for item in items[:-3]:
        name = item.em.text
        print("图片主题:" + name)
        the_url = 'http://sj.zol.com.cn' + item.get('href')
        print(the_url)
        downImg(the_url, "IMG_data/" + name)
# 下载图片:
def downImg(url, path):
    """Download every wallpaper of one theme page into directory *path*.

    If *path* already exists the whole theme is skipped — a cheap resume
    mechanism so re-running the script does not re-download finished themes.

    Parameters:
        url:  URL of a theme detail page on sj.zol.com.cn.
        path: target directory, e.g. "IMG_data/<theme name>".
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # BUG FIX: the original checked/created the directory INSIDE the image
    # loop — iteration 1 created it, iteration 2 saw it exists and returned,
    # so at most one image per theme was ever downloaded. The skip check
    # must run once, before any downloading starts.
    if os.path.exists(path):
        return
    r = requests.get(url, headers=headers)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # str() needed: status_code is an int ("+" on str+int raised TypeError)
        print("访问异常:" + str(r.status_code))
        return  # nothing to parse on an error page
    soup = BeautifulSoup(r.text, 'html.parser')
    # first <span> reads like "(1/NN)" — NN is the number of images in the theme
    num = int(str(soup.span.text).split('/')[1].split(')')[0])
    # first resolution link of the theme; split it around the per-image id so
    # each thumbnail id can be substituted in to build the full-size page URL
    first_href = str(soup.dd.a.get('href'))   # renamed: "type" shadowed the builtin
    url_prefix = first_href.split('_')[0] + '_'
    url_suffix = '_' + first_href.split('_')[-1]
    # makedirs (not mkdir) also creates the missing "IMG_data" parent on first run
    os.makedirs(path)
    for idx in range(num):
        # each thumbnail lives under id="img1", "img2", ...
        thumb = soup.find(id='img' + str(idx + 1))
        thumb_href = thumb.a.get('href')
        img_id = str(thumb_href).split('_')[-1].split('.')[0]
        img_url = 'http://sj.zol.com.cn' + url_prefix + img_id + url_suffix
        print(img_url)
        try:
            page = requests.get(img_url, headers=headers)
            page.raise_for_status()
            # the full-size page's first <img> src is the actual JPEG
            src = BeautifulSoup(page.text, 'html.parser').img.get('src')
            target = path + '/' + str(idx) + '.jpg'
            if not os.path.exists(target):
                # download via requests so the UA header is really sent;
                # the original build_opener() trick configured a discarded
                # opener and never affected urlretrieve
                img = requests.get(src, headers=headers)
                img.raise_for_status()
                with open(target, 'wb') as f:
                    f.write(img.content)
        except (requests.RequestException, AttributeError):
            # best-effort per image: one broken entry must not abort the theme
            print('获取失败!')
    print(path + ",下载完成!")
if __name__ == '__main__':
    # Crawl list pages new_1.html through new_100.html in order.
    for page_no in range(1, 101):
        getPage("http://sj.zol.com.cn/bizhi/new_" + str(page_no) + ".html")