python爬虫爬取图片,显示进度条

该博客介绍了如何使用Python编程实现百度图片的搜索和下载。通过输入关键词,程序可以获取指定数量的图片链接,并将其保存到本地。过程中涉及了requests库的使用、正则表达式进行HTML解析、以及文件操作等技术。
摘要由CSDN通过智能技术生成
import os
import re
import time
from alive_progress import alive_bar
import requests

# HTTP headers that imitate a desktop Edge/Chrome browser making the
# image.baidu.com AJAX request, so the server returns the same JSON-bearing
# HTML a real client would see.
headers = {
    'Accept': 'text/plain, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'image.baidu.com',
    'Referer': 'https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E6%B1%BD%E8%BD%A6&step_word=&hs=0&pn=4&spn=0&di=210760&pi=0&rn=1&tn=baiduimagedetail&is=0,0&istype=2&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs=3100769720,1311769304&os=3476023477,1580744637&simid=3386877588,386203947&adpicid=0&lpn=0&ln=1356&fr=&fmq=1611916956498_R&fm=index&ic=0&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=&height=&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=https://gimg2.baidu.com/image_search/src=http://focus123.com.cn/Uploads/images/20171117/1510909072357573.jpg&refer=http://focus123.com.cn&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1614508959&t=4202658381c41e10862a566eb846055f&fromurl=ippr_z2C$qAzdH3FAzdH3Fooo_z&e3Bu5v7f8dn_z&e3BvgAzdH3FgjofAzdH3Ffi5oAzdH3FdAzdH3Fc8lb&gsm=1&rpstart=0&rpnum=0&islist=&querylist=&force=undefined',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.53',
    'X-Requested-With': 'XMLHttpRequest',
}

# Base search URL; the keyword and the paging offset ("&pn=<index>") are
# appended below before each page request.
url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word='
keyword = input("图片关键词:")

url = url + keyword + "&pn="
time_start = time.time()  # start of wall-clock timing for the final report

# Fetch the first result page just to show how many images Baidu reports
# for this keyword.
strhtml = requests.get(url, headers=headers)
string = str(strhtml.text)

totalnum = re.findall('<div id="resultInfo" style="font-size: 13px;">(.*?)</div>', string)
if totalnum:  # the result-count banner may be missing if the page layout changed
    print(totalnum[0])

# int() instead of eval(): eval() on raw user input executes arbitrary code.
countmax = int(input("请输入要爬取的图片数量:"))

img_url_regex = '"thumbURL":"(.*?)",'  # extracts thumbnail URLs from the embedded JSON
count = 0  # total images downloaded so far
index = 0  # paging offset appended after "&pn="
page = 0  # current result-page number (for progress messages only)
# Create the destination folder once up front; exist_ok replaces the old
# bare try/except around os.mkdir, which silently hid every other error.
save_dir = os.path.join('.', keyword)
os.makedirs(save_dir, exist_ok=True)

# Walk the result pages, downloading thumbnails until countmax images have
# been saved or Baidu returns no more results.
while True:
    strhtml = requests.get(url + str(index), headers=headers)  # fetch one result page
    string = str(strhtml.text)
    print("已爬取网页")
    pic_url = re.findall(img_url_regex, string)  # thumbnail URLs on this page
    page += 1
    print("这是第" + str(page) + "页")
    index += len(pic_url)  # advance the paging offset to the next page
    print(index)

    # No results on this page: stop. The original looped forever here,
    # re-requesting the same page because index never advanced.
    if not pic_url:
        break

    # Progress-bar length: how many images we will actually attempt from
    # this page (the original hard-coded a 30-per-page assumption).
    bar_num = min(countmax - count, len(pic_url))

    with alive_bar(bar_num) as bar:
        for each in pic_url:
            bar()
            try:
                pic = requests.get(each, timeout=5)
            except requests.RequestException:
                # Narrow catch: the original's BaseException also swallowed
                # KeyboardInterrupt, making Ctrl-C unreliable.
                print('错误,当前图片无法下载')
                continue
            else:
                pic_path = os.path.join(save_dir, keyword + '_' + str(count + 1) + '.jpg')
                with open(pic_path, 'wb') as fp:  # 'with' guarantees the handle is closed
                    fp.write(pic.content)
                count += 1
            if countmax == count:
                break
    if countmax == count:
        break
# Report total wall-clock time for the whole crawl.
time_end = time.time()
print(f'处理完毕,共耗时{time_end - time_start:.2f}秒: ')

效果:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值