Continuing from the previous post: the number of images is huge, so I wanted to try downloading them with multiple threads to see whether it would be any faster. I collect every image URL in a collection into the list url_list, then split the list into 100 parts with slicing and hand each part to one of 100 threads to download in parallel.
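The slicing idea, taken on its own, looks roughly like this (a minimal sketch outside the spider; split_into_chunks and num_threads are illustrative names, not part of the script below):

# Split a list into roughly num_threads equal slices; any remainder
# ends up in one extra, smaller slice at the end.
def split_into_chunks(url_list, num_threads):
    interval = max(len(url_list) // num_threads, 1)
    chunks = []
    start = 0
    while start < len(url_list):
        chunks.append(url_list[start:start + interval])
        start += interval
    return chunks

# e.g. 1005 urls and 100 threads -> 100 slices of 10 plus one slice of 5
print(len(split_into_chunks(list(range(1005)), 100)))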
While experimenting with this I found that the thread count cannot be pushed too high: with too many threads the program throws a memory error.
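If the thread count does become a problem, one way to keep it bounded is a fixed-size thread pool. This is not what the script below does; it is just a sketch of the alternative using multiprocessing.dummy.Pool, where fetch and pool_size are made-up names for illustration:

from multiprocessing.dummy import Pool  # thread-backed Pool from the standard library
import requests

def fetch(img_url):
    # hypothetical worker: fetch one image and return its bytes
    return requests.get(img_url).content

def download_all(url_list, pool_size=20):
    pool = Pool(pool_size)              # never more than pool_size threads alive
    results = pool.map(fetch, url_list)
    pool.close()
    pool.join()
    return results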
Interest is the primary productive force.
#-*-coding:utf-8-*-
import sys
import os
import Spider
import requests
import threading
import time

reload(sys)
sys.setdefaultencoding('utf-8')

def downloadImg(url_list, path, start, end):
    # each thread downloads its own slice url_list[start:end]
    print('thread %s is running...' % threading.current_thread().name)
    ss = requests.session()
    thread_name = threading.current_thread().name
    img_count = 0
    for img_url in url_list[start:end]:
        #time.sleep(1)
        print "%s is downloading image %d/%d" % (thread_name, img_count + 1, end - start)
        img_content = ss.get(img_url)
        # use the last dot-separated part of the URL as the file extension
        name = img_url.split(".")
        with open(os.path.join(path, thread_name + "_" + str(img_count) + "." + name[-1]), 'wb') as f:
            f.write(img_content.content)
        img_count = img_count + 1
    print('thread %s ended.' % threading.current_thread().name)

def downCollection(collection):
    l = Spider.Login()
    s = l.login_zhihu("xxxxxxxx@xxx.com", "xxxxxxxx")
    parser = Spider.parserHtml()
    collection = collection.encode('utf-8')
    collection = collection.strip('\n')
    div_list = parser.getAnswerDiv_collection(collection, s)
    print "div_list length %d" % len(div_list)
    url_list = parser.getImgURL_collection(div_list)
    print "url_list length %d" % len(url_list)
    path = os.path.join(os.path.abspath("."), "sou" + collection)
    if not os.path.isdir(path):
        os.mkdir(path)
    download = Spider.Download()
    #download.downloadImg(url_list,path)
    thread_count = 1
    thread = []
    # split url_list into roughly 100 slices; keep the step at least 1
    # so a short list does not make the loop spin forever on empty slices
    interval = max(len(url_list) / 100, 1)
    start = 0
    end = interval
    while start < len(url_list):
        # clamp end so the last thread also gets the leftover tail of the list
        t = threading.Thread(target=downloadImg,
                             args=(url_list, path, start, min(end, len(url_list))),
                             name="Thread" + str(thread_count))
        thread.append(t)
        start = end
        end = end + interval
        thread_count = thread_count + 1
    for t in thread:
        t.start()
    for t in thread:
        t.join()

if __name__ == '__main__':
    downCollection("12345678")