First, a quick look at the result: (screenshot of the downloaded images)

The rest of this post walks through how to write the code, step by step.
1. Look at the Baidu Images URLs for a few different keywords: the part after "word=" is a string of odd-looking characters. Taking 流浪地球 (The Wandering Earth) as an example, the string is %E6%B5%81%E6%B5%AA%E5%9C%B0%E7%90%83. This is simply the URL-encoded (percent-encoded) form of the keyword, so we can import urllib.parse to produce the encoding for any keyword.
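A minimal sketch to confirm this, using 流浪地球 as the example keyword:

import urllib.parse

print(urllib.parse.quote('流浪地球'))   # -> %E6%B5%81%E6%B5%AA%E5%9C%B0%E7%90%83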
Next, look at the page source of a Baidu Images search result. The function below downloads the HTML source of a page:
def open_url(url):
    # Pretend to be a normal browser, otherwise Baidu may reject the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4843.400 QQBrowser/9.7.13021.400'}
    req = urllib.request.Request(url=url, headers=headers)
    page = urllib.request.urlopen(req)
    html = page.read().decode('utf-8')
    return html
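As a quick sanity check (not part of the final script), open_url can be fed a search URL built from the encoded keyword. The shortened URL below is only an assumption for illustration; the full parameter list used in the complete script later is much longer:

import urllib.parse
import urllib.request   # needed by open_url above

key_words = urllib.parse.quote('流浪地球')
url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + key_words
html = open_url(url)     # uses the open_url defined above
print(len(html))         # the page source should be non-empty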
Looking at the page source, each image entry carries several URLs. Trying each of them shows that objURL is the one we actually need, so the image addresses can be extracted from the source like this:
photo_link = r'"objURL":"(http://[^"]*\.jpg)"'   # note the escaped dot: only match URLs that really end in ".jpg"
imgs = re.findall(photo_link, html)
The matched image links are returned by re.findall as a list and stored in imgs.
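As a small illustration (the HTML snippet below is hand-written, not real Baidu output), the pattern pulls out exactly the objURL values:

import re

sample = '"objURL":"http://example.com/img/0001.jpg","fromURL":"http://example.com/page"'
photo_link = r'"objURL":"(http://[^"]*\.jpg)"'
print(re.findall(photo_link, sample))   # ['http://example.com/img/0001.jpg']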
With the links in hand, each image can be downloaded in a for loop:
for each in imgs:
    # i is a global counter used to name the files; file_name is the target folder
    if each.endswith('.jpg'):
        img = urllib.request.urlopen(each)
        img = img.read()
        with open(file_name + '/' + str(i) + '.jpg', 'wb') as img_file:
            img_file.write(img)
        i += 1
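One optional hardening step that is not in the original snippet: some objURL targets are slow or unreachable, so passing a timeout to urlopen and skipping links that fail keeps the loop from hanging. A sketch:

import urllib.request

for each in imgs:
    if each.endswith('.jpg'):
        try:
            img = urllib.request.urlopen(each, timeout=10).read()   # give up after 10 seconds
        except Exception:
            continue                                                # skip links that fail
        with open(file_name + '/' + str(i) + '.jpg', 'wb') as img_file:
            img_file.write(img)
        i += 1

The complete script, putting all of the steps above together, is as follows.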
import urllib.request
import urllib.parse
import ssl
import re
import time
import os

# pymysql and cv2 were imported in the original post but are never used, so they are left out here
i = 0    # global counter used to number the downloaded images

# Skip HTTPS certificate verification (some image hosts have broken certificates)
ssl._create_default_https_context = ssl._create_unverified_context

def open_url(url):
    # Pretend to be a normal browser, otherwise Baidu may reject the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4843.400 QQBrowser/9.7.13021.400'}
    req = urllib.request.Request(url=url, headers=headers)
    page = urllib.request.urlopen(req)
    html = page.read().decode('utf-8')
    return html

def get_img(html, file_name, num):
    global i
    os.makedirs(file_name, exist_ok=True)    # create the folder once; keep it on later pages
    # image addresses (objURL fields)
    photo_link = r'"objURL":"(http://[^"]*\.jpg)"'
    imgs = re.findall(photo_link, html)
    # relative link of the "next page" button (the anchor text 下一页 means "next page")
    last_link = r'<a href="(/search/flip[^"]*)" class="n">下一页</a>'
    last = re.findall(last_link, html)
    for each in imgs:
        if i >= num:
            print('Download finished!')
            return '---', -1
        if each.endswith('.jpg'):
            try:
                img = urllib.request.urlopen(each).read()
            except Exception:
                continue    # skip links that cannot be downloaded
            with open(file_name + '/' + str(i) + '.jpg', 'wb') as img_file:
                img_file.write(img)
            i += 1
    return last, i    # this page is done, move on to the next one

if __name__ == "__main__":
    key_word = input('Keyword of the images to download: ')
    num = int(input('Number of images to download: '))
    print('Download started... ...')
    key_words = urllib.parse.quote(key_word)
    baidu = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1548421364447_R&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1548421364449%5E00_1351X629&word='
    url = baidu + key_words
    while True:
        try:
            last, a = get_img(open_url(url), key_word, num)
            if a == -1:    # enough images have been downloaded
                break
            if not last:   # no "next page" link was found
                break
            url = 'http://image.baidu.com' + last[0]
        except Exception:
            time.sleep(0.5)    # transient network error: wait a bit and retry the same page
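An example session (the keyword and count below are only illustrative); the images end up in a folder named after the keyword, numbered 0.jpg, 1.jpg, and so on:

Keyword of the images to download: 流浪地球
Number of images to download: 20
Download started... ...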