网络爬虫爬取多张照片

最新推荐文章于 2023-10-06 15:27:20 发布

Dxg_01

最新推荐文章于 2023-10-06 15:27:20 发布

阅读量271

点赞数

分类专栏：学习例子

本文链接：https://blog.csdn.net/weixin_42394925/article/details/118309389

版权

学习例子专栏收录该内容

37 篇文章 0 订阅

订阅专栏

import urllib.request
import urllib.parse
import re
import os

#获得保存下载图片文件夹的路径
def Imgpath(word):
    """Create a fresh download folder for *word* and return its path.

    The base path is the current working directory with its last four
    characters removed, followed by *word*. If that path is already taken,
    a numeric suffix (``1``, ``2``, ...) is appended until a folder can be
    created, so repeated runs with the same keyword never collide.

    Args:
        word: Search keyword used as the folder name.

    Returns:
        The path of the directory that was just created.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            the name already existing (e.g. missing permissions).
    """
    # NOTE(review): [:-4] drops the last four characters of the cwd --
    # presumably to strip a 4-character directory name so the folder lands
    # next to the script's directory; confirm against the intended layout.
    base_path = os.getcwd()[:-4] + word

    candidate = base_path
    suffix = 0
    while True:
        try:
            # EAFP: attempt the creation instead of checking first, which
            # avoids a race between the existence test and makedirs.
            os.makedirs(candidate)
        except FileExistsError:
            suffix += 1
            candidate = base_path + str(suffix)
        else:
            return candidate

def Imgurl(word):
    """Collect thumbnail URLs for *word* from Baidu image search.

    Requests four result pages (``pn`` = 30, 60, 90, 120 with ``rn=30``)
    from the ``acjson`` endpoint and extracts every ``thumbURL`` value that
    contains ``.jpg``.

    Args:
        word: Search keyword; it is percent-encoded as UTF-8 before being
            placed in the query string.

    Returns:
        A list of strings in page order. Each entry is the text between
        ``"thumbURL":"`` and the first ``.jpg`` inside that value, so the
        ``.jpg`` suffix itself is not part of the returned string.

    Raises:
        urllib.error.URLError: If a page request fails.
    """
    # Browser-like headers sent with every request.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59',
        'access-control-allow-origin':'*',
        'content-type':'image/webp',
        'accept-language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    }
    # Compiled once for all pages. [^"] keeps each match inside a single
    # quoted value, so a thumbURL without ".jpg" is skipped instead of the
    # match running on into later fields of the response.
    thumb_pattern = re.compile(r'"thumbURL":"([^"]*?)\.jpg')
    # Percent-encode the keyword so it is safe inside the query string.
    content = urllib.parse.quote(word, encoding='utf-8')

    rep_list = []
    for num in range(30, 121, 30):
        # gsm is the page offset in hexadecimal without the "0x" prefix.
        gsm = hex(num)[2:]
        url = (
            'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord='
            + content
            + '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word='
            + content
            + '&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn='
            + str(num)
            + '&rn=30&gsm='
            + gsm
            + '&1521707235798='
        )
        req = urllib.request.Request(url=url, headers=header)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req) as resp:
            page = resp.read()
        try:
            response = page.decode('utf-8')
        except UnicodeDecodeError:
            # Best effort: a page that is not valid UTF-8 is skipped.
            continue
        rep_list.extend(thumb_pattern.findall(response))
    return rep_list

def download_img(word):
    """Download up to 100 search-result images for *word* into a new folder.

    The URLs come from ``Imgurl(word)`` and the target folder from
    ``Imgpath(word)``. Files are named ``<word><n>.png`` with ``n`` counting
    from 1 in result order.

    Args:
        word: Search keyword; also used as the file-name prefix.

    Raises:
        urllib.error.URLError: If fetching an image fails; images already
            written stay on disk and the remaining ones are not attempted.
        OSError: If the folder or a file cannot be created.
    """
    img_urllist = Imgurl(word)
    img_path = Imgpath(word)
    # Only the first 100 URLs are downloaded.
    for index, pngurl in enumerate(img_urllist[:100], start=1):
        # os.path.join picks the platform's separator; a literal backslash
        # would become part of the file name on non-Windows systems.
        # NOTE(review): the data is saved with a .png extension although the
        # URLs are matched on ".jpg" -- confirm whether that is intended.
        path = os.path.join(img_path, word + str(index) + '.png')
        with urllib.request.urlopen(pngurl) as resp:
            pngdata = resp.read()
        # Image data is binary, so the file is opened in 'wb' mode.
        with open(path, 'wb') as f:
            f.write(pngdata)

if __name__ == '__main__':
    # Console entry point: prompt for a keyword and download its images.
    download_img(input("请输入中文关键词："))

import tkinter

# Hint text shown in the entry box until the user clicks into it.
_PLACEHOLDER = "请输入关键词"

# Main window: title plus fixed size and screen position.
win = tkinter.Tk()
win.title("百度图片爬虫")
win.geometry("400x200+400+200")

# Single-line input box for the search keyword, pre-filled with the hint.
entry = tkinter.Entry(win, width=28)
entry.insert(0, _PLACEHOLDER)


def func1(event):
    """Clear the hint text when the entry box is clicked.

    Only the untouched placeholder is removed; text the user has already
    typed is left alone, so clicking to move the cursor does not erase it.
    """
    if entry.get() == _PLACEHOLDER:
        entry.delete(0, tkinter.END)


# Run func1 on a left mouse click inside the entry box.
entry.bind("<Button-1>", func1)


def func2():
    """Start a download for the keyword currently in the entry box.

    Does nothing when the box is blank or still shows the placeholder.
    """
    word = entry.get()
    if not word.strip() or word == _PLACEHOLDER:
        return
    download_img(word)


# Confirm button; command takes the function object itself, not a call.
button = tkinter.Button(win, text="确定", command=func2,
                        width=6, height=1)

# Place the widgets and hand control to the Tk event loop.
entry.place(x=100, y=50)
button.place(x=170, y=100)
win.mainloop()