# 这个是进阶版本, 更新了api, 更新了多线程, 更新了异常处理.
# -*- coding: utf-8 -*-
import os
import urllib2
import json
import threading
import Queue
import urllib
import socket
class getImg(threading.Thread):
    """Worker thread that downloads image URLs taken from a shared queue.

    Workers communicate only through the Queue passed to __init__: each
    worker repeatedly pulls one URL, downloads it into `path` as a
    sequentially numbered .jpg, and exits once the queue runs dry.
    Reads/writes the module-level counters `count` (successful sequence
    number) and `bad` (failed downloads).
    """

    # Protects the module-level `count` / `bad` counters, which are
    # mutated concurrently by all worker threads.
    _lock = threading.Lock()

    def __init__(self, queue, path):
        threading.Thread.__init__(self)
        self.queue = queue  # shared task queue holding image URLs
        self.dir = path     # destination directory (ends with a path separator)
        self.start()        # the thread starts itself on construction

    def run(self):
        # Process-wide socket timeout so one stalled download cannot hang
        # a worker forever; later sockets inherit it automatically.
        socket.setdefaulttimeout(5)
        global count
        global bad
        while True:
            try:
                # Non-blocking get: an empty queue means this worker is done.
                # A blocking get() would hang forever whenever there are
                # fewer URLs than workers.
                imgurl = self.queue.get(block=False)
            except Queue.Empty:
                break
            try:
                with getImg._lock:
                    count += 1
                    seq = count  # snapshot under the lock -> unique filename
                # urlretrieve fetches the URL body and writes it to the file.
                urllib.urlretrieve(imgurl, self.dir + '%d.jpg' % seq)
            except Exception as e:
                with getImg._lock:
                    bad += 1
                print("-----%s: %s-----\n" % (type(e), imgurl))
            finally:
                # Always acknowledge the item so queue.join() cannot deadlock.
                self.queue.task_done()
def getImgUrls(tag):
#
# return
urls = []
#
# all images
totalNum = -1
#
# pn & rn
# max rn = 60
startNum = 0
resultNum = 30
#
print 'start download theme : ' , tag
#
while (totalNum == -1 or startNum < totalNum) :
#
# one request
oneRequeseNum = 0
#
url = 'http://image.baidu.com/search/acjson?tn=resultjson&ipn=rj&ie=utf-8&oe=utf-8'
url = url + '&word=%s&width=&height=&fr=&pn=%d&rn=%d' % (tag, startNum, resultNum)
print url
#
try:
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {"User-Agent" : user_agent}
req = urllib2.Request(url , headers=headers)
html = urllib2.urlopen(req , timeout=5)
info = html.read()
jsonData = json.loads(info)
#
if (totalNum == -1 or startNum < totalNum):
totalNum = jsonData['listNum']
print 'toatl number :', totalNum
#
data = jsonData['data']
data = data[0:len(data) - 1]
#
for index , item in enumerate(data):
oneRequeseNum += 1
#
if item.has_key("objURL"):
url = item['objURL']
urls.append(url);
#
except Exception , e:
print "Exception : " , str(e)
print url
# this page has error, next page
startNum = startNum + resultNum
#
finally:
startNum = startNum + oneRequeseNum
print 'Finish download theme : ' , tag
print 'Download images number :' , startNum
#
return urls
# ---- script entry: crawl one theme, then fan the downloads out to workers ----
tags = ['运动服']
tag = tags[0]
urls = getImgUrls(tag)

# Destination directory named after the theme; created on first run.
# (py2: decode the UTF-8 byte string so os.path handles it correctly)
path = unicode('./' + tag + '/', 'utf8')
if not os.path.exists(path):
    os.makedirs(path)

# Shared worker state: success counter / failure counter (read by getImg).
threads = []
count = 0
bad = 0

# Load every URL into the shared queue before starting the workers.
queue = Queue.Queue()
for imgurl in urls:
    queue.put(imgurl)

# Ten downloader threads; each getImg start()s itself in __init__.
for _ in range(10):
    threads.append(getImg(queue, path))

# Keep a record of every URL that was queued; `with` guarantees the
# file handle is closed even if a write fails.
with open('urls.txt', 'w') as ff:
    for url in urls:
        ff.write('%s\n' % url)
# over.
# 参考:
# http://bitjoy.net/2015/08/13/baidu-image-downloader-python3-pyqt5-eric6-cx_freeze4/
# https://github.com/Beeder/BaiduImageDownloader/blob/master/DownloadEngine.py
# http://my.oschina.net/yulongjiang/blog/182508