# 这个是进阶版本, 更新了api, 更新了多线程, 更新了异常处理.
# -*- coding: utf-8 -*-
import os
import urllib2
import json
import threading
import Queue
import urllib
import socket
class getImg(threading.Thread):
    """Worker thread that downloads image URLs taken from a shared queue.

    Workers communicate only through the Queue passed to __init__: each
    worker repeatedly pulls one URL, downloads it into `path` as a
    sequentially numbered .jpg, and exits once the queue runs dry.
    Reads/writes the module-level counters `count` (successful sequence
    number) and `bad` (failed downloads).
    """

    # Protects the module-level `count` / `bad` counters, which are
    # mutated concurrently by all worker threads.
    _lock = threading.Lock()

    def __init__(self, queue, path):
        threading.Thread.__init__(self)
        self.queue = queue  # shared task queue holding image URLs
        self.dir = path     # destination directory (ends with a path separator)
        self.start()        # the thread starts itself on construction

    def run(self):
        # Process-wide socket timeout so one stalled download cannot hang
        # a worker forever; later sockets inherit it automatically.
        socket.setdefaulttimeout(5)
        global count
        global bad
        while True:
            try:
                # Non-blocking get: an empty queue means this worker is done.
                # A blocking get() would hang forever whenever there are
                # fewer URLs than workers.
                imgurl = self.queue.get(block=False)
            except Queue.Empty:
                break
            try:
                with getImg._lock:
                    count += 1
                    seq = count  # snapshot under the lock -> unique filename
                # urlretrieve fetches the URL body and writes it to the file.
                urllib.urlretrieve(imgurl, self.dir + '%d.jpg' % seq)
            except Exception as e:
                with getImg._lock:
                    bad += 1
                print("-----%s: %s-----\n" % (type(e), imgurl))
            finally:
                # Always acknowledge the item so queue.join() cannot deadlock.
                self.queue.task_done()
def getImgUrls(tag):
#
# return
urls = []
#
# all images
totalNum = -1
#
# pn & rn
# max rn = 60
startNum = 0
resultNum = 30
#
print 'start download theme : ' , tag
#
while (totalNum == -1 or startNum < totalNum) :
#
# one request
oneRequeseNum = 0
#
url = 'http://image.baidu.com/search/acjson?tn=resultjson&ipn=rj&ie=utf-8&oe=utf-8'
url = url + '&word=%s&width=&height=&fr=&pn=%d&rn=%d' % (tag, startNum, resultNum)
print url
#
try:
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {"User-Agent" : user_agent}
req = urllib2.Request(url , headers=headers)
html = urllib2.urlopen(req , timeout=5)
info = html.read()
jsonData = json.loads(info)
#
if (totalNum == -1 or startNum < totalNum):
totalNum = jsonData['listNum']
print 'toatl number :', totalNum
#
data = jsonData['data']
data = data[0:len(data) - 1]
#
for index , item in enumerate(data):
oneRequeseNum += 1
#
if item.has_key("objURL"):
url = item['objURL']
urls.append(url);
#
except Exception , e:
print "Exception : " , str(e)
print url
# this page has error, next page
startNum = startNum + resultNum
#
finally:
startNum = startNum + oneRequeseNum
print 'Finish download theme : ' , tag
print 'Download images number :' , startNum
#
return urls
# ---- script entry: crawl one theme, then fan the downloads out to workers ----
tags = ['运动服']
tag = tags[0]
urls = getImgUrls(tag)

# Destination directory named after the theme; created on first run.
# (py2: decode the UTF-8 byte string so os.path handles it correctly)
path = unicode('./' + tag + '/', 'utf8')
if not os.path.exists(path):
    os.makedirs(path)

# Shared worker state: success counter / failure counter (read by getImg).
threads = []
count = 0
bad = 0

# Load every URL into the shared queue before starting the workers.
queue = Queue.Queue()
for imgurl in urls:
    queue.put(imgurl)

# Ten downloader threads; each getImg start()s itself in __init__.
for _ in range(10):
    threads.append(getImg(queue, path))

# Keep a record of every URL that was queued; `with` guarantees the
# file handle is closed even if a write fails.
with open('urls.txt', 'w') as ff:
    for url in urls:
        ff.write('%s\n' % url)
# over.
# 参考:
# http://bitjoy.net/2015/08/13/baidu-image-downloader-python3-pyqt5-eric6-cx_freeze4/
# https://github.com/Beeder/BaiduImageDownloader/blob/master/DownloadEngine.py
# http://my.oschina.net/yulongjiang/blog/182508